// Source: PdfPig/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs

namespace UglyToad.PdfPig.Parser.FileStructure
{
using System;
using System.Collections.Generic;
using Core;
using CrossReference;
using Logging;
using Parts.CrossReference;
using Tokenization.Scanner;
using Tokens;
internal class CrossReferenceParser
{
// Receives debug, warning and error messages emitted while parsing.
private readonly ILog log;

// Checks (and may correct) the offsets at which cross reference tables or streams are expected.
private readonly XrefOffsetValidator offsetValidator;

// Turns a stream object into a cross reference table part.
private readonly CrossReferenceStreamParser crossReferenceStreamParser;

/// <summary>
/// Creates a parser which reads cross reference tables and cross reference streams.
/// </summary>
/// <param name="log">Log used for diagnostic messages.</param>
/// <param name="offsetValidator">Validator used to check cross reference offsets before they are read.</param>
/// <param name="crossReferenceStreamParser">Parser used for cross reference stream objects.</param>
public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator,
    CrossReferenceStreamParser crossReferenceStreamParser)
{
    this.log = log;
    this.offsetValidator = offsetValidator;
    this.crossReferenceStreamParser = crossReferenceStreamParser;
}
/// <summary>
/// Reads every cross reference table and cross reference stream reachable from
/// <paramref name="crossReferenceLocation"/> by following each part's "previous" offset,
/// then builds the combined <see cref="CrossReferenceTable"/>.
/// </summary>
/// <param name="bytes">The document bytes; used for bounds checks, offset validation and brute-force searching.</param>
/// <param name="isLenientParsing">
/// When <see langword="true"/>, a failure to read an /XRefStm stream is logged instead of thrown.
/// </param>
/// <param name="crossReferenceLocation">The offset at which the first cross reference is expected.</param>
/// <param name="offsetCorrection">A value added to every non-negative "previous" offset read from a part.</param>
/// <param name="pdfScanner">Scanner used to read cross reference stream objects.</param>
/// <param name="tokenScanner">Scanner used to inspect the token found at each candidate location.</param>
/// <returns>The cross reference table built from all parts which were read.</returns>
/// <exception cref="PdfDocumentFormatException">
/// Thrown when the cross references form a loop, when no table or stream is found after the
/// maximum number of consecutive missed attempts, when a cross reference stream cannot be read and no
/// "xref" keyword can be located by searching, or (non-lenient parsing only) when an /XRefStm offset is
/// not positive.
/// </exception>
public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation,
    long offsetCorrection,
    IPdfTokenScanner pdfScanner,
    ISeekableTokenScanner tokenScanner)
{
    // Number of consecutive locations which may fail to yield a table or stream before giving up.
    const int maximumMissedAttempts = 100;

    long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceLocation, tokenScanner, bytes, isLenientParsing);
    if (fixedOffset > -1)
    {
        crossReferenceLocation = fixedOffset;

        log.Debug($"Found the first cross reference table or stream at {fixedOffset}.");
    }

    var table = new CrossReferenceTableBuilder();

    // Every "previous" offset followed so far; seeing one twice means the chain is circular.
    var prevSet = new HashSet<long>();
    long previousCrossReferenceLocation = crossReferenceLocation;

    var missedAttempts = 0;

    // Parse all cross reference tables and streams.
    while (previousCrossReferenceLocation > 0 && missedAttempts < maximumMissedAttempts)
    {
        log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}.");

        if (previousCrossReferenceLocation >= bytes.Length)
        {
            break;
        }

        // seek to xref table
        tokenScanner.Seek(previousCrossReferenceLocation);

        tokenScanner.MoveNext();

        if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
        {
            missedAttempts = 0;

            log.Debug("Element was cross reference table.");

            CrossReferenceTablePart tablePart = CrossReferenceTableParser.Parse(tokenScanner,
                previousCrossReferenceLocation, isLenientParsing);

            var nextOffset = tablePart.GetPreviousOffset();

            if (nextOffset >= 0)
            {
                nextOffset += offsetCorrection;
            }

            previousCrossReferenceLocation = nextOffset;

            DictionaryToken tableDictionary = tablePart.Dictionary;

            CrossReferenceTablePart streamPart = null;

            // check for a XRef stream, it may contain some object ids of compressed objects
            if (tableDictionary.ContainsKey(NameToken.XrefStm))
            {
                log.Debug("Cross reference table contained referenced to stream. Reading the stream.");

                // NOTE(review): the offset is narrowed to int here and again below when corrected,
                // which presumably cannot represent offsets beyond int.MaxValue - confirm whether that matters.
                int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;

                // check the xref stream reference
                fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing);
                if (fixedOffset > -1 && fixedOffset != streamOffset)
                {
                    log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");

                    streamOffset = (int)fixedOffset;

                    // Update the cross reference table to be a stream instead.
                    tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset));
                    tablePart = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset,
                        tablePart.Previous, tableDictionary, tablePart.Type);
                }

                // Read the stream from the table.
                if (streamOffset > 0)
                {
                    try
                    {
                        TryParseCrossReferenceStream(streamOffset, pdfScanner, out streamPart);
                    }
                    catch (InvalidOperationException ex)
                    {
                        if (isLenientParsing)
                        {
                            log.Error("Failed to parse /XRefStm at offset " + streamOffset, ex);
                        }
                        else
                        {
                            throw;
                        }
                    }
                }
                else
                {
                    if (isLenientParsing)
                    {
                        log.Error("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                    }
                    else
                    {
                        throw new PdfDocumentFormatException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                    }
                }
            }

            table.Add(tablePart);

            if (streamPart != null)
            {
                table.Add(streamPart);
            }
        }
        else if (tokenScanner.CurrentToken is NumericToken)
        {
            log.Debug("Element was cross reference stream.");

            missedAttempts = 0;

            // Unread the numeric token.
            tokenScanner.Seek(previousCrossReferenceLocation);

            // parse xref stream
            if (!TryParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner, out var tablePart))
            {
                if (!TryBruteForceXrefTableLocate(bytes, previousCrossReferenceLocation, out var actualOffset))
                {
                    throw new PdfDocumentFormatException(
                        $"No cross reference stream or table could be found at or near offset {previousCrossReferenceLocation}.");
                }

                // Retry from the location found by searching; this counts as a missed attempt
                // so that repeated failures still terminate.
                previousCrossReferenceLocation = actualOffset;
                missedAttempts++;
                continue;
            }

            table.Add(tablePart);

            previousCrossReferenceLocation = tablePart.Previous;

            if (previousCrossReferenceLocation >= 0)
            {
                previousCrossReferenceLocation += offsetCorrection;
            }

            if (previousCrossReferenceLocation > 0)
            {
                // check the xref table reference
                fixedOffset = offsetValidator.CheckXRefOffset(previousCrossReferenceLocation, tokenScanner, bytes, isLenientParsing);
                if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
                {
                    previousCrossReferenceLocation = fixedOffset;
                    tablePart.FixOffset(previousCrossReferenceLocation);
                }
            }
        }
        else
        {
            log.Debug($"The cross reference found at this location ({previousCrossReferenceLocation}) was not a table or stream. " +
                      $"Found token ({tokenScanner.CurrentToken}) ending at {tokenScanner.CurrentPosition} instead. Seeking next token.");

            // Try again from the end of the unexpected token.
            previousCrossReferenceLocation = tokenScanner.CurrentPosition;

            missedAttempts++;

            continue;
        }

        if (prevSet.Contains(previousCrossReferenceLocation))
        {
            throw new PdfDocumentFormatException("The cross references formed an infinite loop.");
        }

        prevSet.Add(previousCrossReferenceLocation);
    }

    if (missedAttempts >= maximumMissedAttempts)
    {
        // TODO: scan the document to find the correct token.
        throw new PdfDocumentFormatException("The cross reference was not found.");
    }

    var resolved = table.Build(crossReferenceLocation, log);

    // check the offsets of all referenced objects
    if (!CrossReferenceObjectOffsetValidator.ValidateCrossReferenceOffsets(bytes, resolved, log, out var actualOffsets))
    {
        // Validation failed: rebuild the table using the offsets the validator reported.
        resolved = new CrossReferenceTable(resolved.Type, actualOffsets, resolved.Trailer, resolved.CrossReferenceOffsets);
    }

    return resolved;
}
/// <summary>
/// Attempts to read the object at <paramref name="objByteOffset"/> as a cross reference stream.
/// </summary>
/// <param name="objByteOffset">The offset the scanner seeks to before reading the object.</param>
/// <param name="pdfScanner">Scanner used to read the object token at that offset.</param>
/// <param name="xrefTablePart">
/// The parsed cross reference part when the method returns <see langword="true"/>;
/// otherwise <see langword="null"/>.
/// </param>
/// <returns>
/// <see langword="true"/> if the token read was an object whose data is a stream;
/// otherwise <see langword="false"/> (an error is logged).
/// </returns>
private bool TryParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner,
    out CrossReferenceTablePart xrefTablePart)
{
    xrefTablePart = null;

    pdfScanner.Seek(objByteOffset);

    pdfScanner.MoveNext();

    // Use a safe cast so an unexpected token type is reported through the false return
    // value rather than surfacing as an InvalidCastException from a Try-method.
    var streamObjectToken = pdfScanner.CurrentToken as ObjectToken;

    if (streamObjectToken == null || !(streamObjectToken.Data is StreamToken objectStream))
    {
        log.Error($"When reading a cross reference stream object found a non-stream object: {streamObjectToken?.Data}");

        return false;
    }

    xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, objectStream);

    return true;
}
/// <summary>
/// Searches the input for the keyword "xref" when it was not found at the expected position.
/// The search first runs forwards from just before <paramref name="expectedOffset"/> until the
/// input is exhausted, then walks backwards in 5 byte windows.
/// </summary>
/// <param name="bytes">The input to search; its position is moved by this method.</param>
/// <param name="expectedOffset">The offset at which the cross reference was expected.</param>
/// <param name="actualOffset">
/// The offset recorded for the located keyword when the method returns <see langword="true"/>;
/// otherwise <paramref name="expectedOffset"/>.
/// </param>
/// <returns><see langword="true"/> if "xref" was located; otherwise <see langword="false"/>.</returns>
private bool TryBruteForceXrefTableLocate(IInputBytes bytes, long expectedOffset,
    out long actualOffset)
{
    actualOffset = expectedOffset;

    bytes.Seek(expectedOffset - 1);

    // Forward pass: look for an 'x' directly after whitespace which begins "xref".
    var precedingByte = bytes.CurrentByte;
    while (bytes.MoveNext())
    {
        var thisByte = bytes.CurrentByte;

        if (thisByte == 'x'
            && ReadHelper.IsWhitespace(precedingByte)
            && ReadHelper.IsString(bytes, "xref"))
        {
            actualOffset = bytes.CurrentOffset;
            return true;
        }

        precedingByte = thisByte;
    }

    // Backward pass: nothing can precede the start of the input.
    var windowStart = expectedOffset - 1;
    if (windowStart < 0)
    {
        return false;
    }

    bytes.Seek(windowStart);

    var window = new byte[5];

    while (bytes.Read(window) == window.Length)
    {
        for (var index = 1; index < window.Length; index++)
        {
            var left = window[index - 1];
            var right = window[index];

            // Either the pair is (whitespace, 'x') or it is ('x', 'r').
            var whitespaceThenX = right == 'x' && ReadHelper.IsWhitespace(left);
            var xThenR = left == 'x' && right == 'r';

            if (whitespaceThenX || xThenR)
            {
                var candidate = windowStart + index;
                if (whitespaceThenX)
                {
                    candidate += 1;
                }

                bytes.Seek(candidate);

                if (ReadHelper.IsString(bytes, "xref"))
                {
                    actualOffset = candidate;
                    return true;
                }
            }
        }

        // Step the window back by its own length and stop at the start of the input.
        windowStart -= window.Length;
        if (windowStart < 0)
        {
            break;
        }

        bytes.Seek(windowStart);
    }

    bytes.Read(window);

    return false;
}
}
}