PdfPig/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs

233 lines
9.5 KiB
C#
Raw Normal View History

namespace UglyToad.PdfPig.Parser.FileStructure
{
using System;
using System.Collections.Generic;
using Core;
using CrossReference;
using Exceptions;
using Logging;
using Parts.CrossReference;
using Tokenization.Scanner;
using Tokens;
internal class CrossReferenceParser
{
/// <summary>Sink for the debug, warning and error messages emitted while parsing.</summary>
private readonly ILog log;

/// <summary>Checks the offset at which a cross reference is expected and may supply a corrected one.</summary>
private readonly XrefOffsetValidator offsetValidator;

/// <summary>Checks the offsets of all referenced objects once the table has been built.</summary>
private readonly XrefCosOffsetChecker xrefCosChecker;

/// <summary>Parses a cross reference which is stored as a stream object.</summary>
private readonly CrossReferenceStreamParser crossReferenceStreamParser;

/// <summary>Parses a cross reference which is stored as a plain "xref" table.</summary>
private readonly CrossReferenceTableParser crossReferenceTableParser;

/// <summary>
/// Creates a new <see cref="CrossReferenceParser"/> from its collaborators.
/// The arguments are stored as-is; no validation is performed here.
/// </summary>
/// <param name="log">Receives diagnostic messages.</param>
/// <param name="offsetValidator">Validates cross reference offsets.</param>
/// <param name="xrefCosChecker">Validates the object offsets of the final table.</param>
/// <param name="crossReferenceStreamParser">Parser used for cross reference streams.</param>
/// <param name="crossReferenceTableParser">Parser used for cross reference tables.</param>
public CrossReferenceParser(
    ILog log,
    XrefOffsetValidator offsetValidator,
    XrefCosOffsetChecker xrefCosChecker,
    CrossReferenceStreamParser crossReferenceStreamParser,
    CrossReferenceTableParser crossReferenceTableParser)
{
    this.log = log;
    this.offsetValidator = offsetValidator;
    this.xrefCosChecker = xrefCosChecker;
    this.crossReferenceStreamParser = crossReferenceStreamParser;
    this.crossReferenceTableParser = crossReferenceTableParser;
}
2018-01-21 02:42:29 +08:00
/// <summary>
/// Follows the chain of cross reference tables and streams which begins at
/// <paramref name="crossReferenceLocation"/> and combines every part found into a single
/// <see cref="CrossReferenceTable"/>.
/// </summary>
/// <param name="bytes">The document bytes. Offsets at or beyond its length end the walk.</param>
/// <param name="isLenientParsing">
/// When true a /XRefStm entry which is corrupt or fails to parse is logged and skipped, otherwise the
/// failure is thrown. The flag is also passed through to the validators and the table parser.
/// </param>
/// <param name="crossReferenceLocation">The offset at which the first cross reference is expected.</param>
/// <param name="offsetCorrection">Added to every non-negative "previous" offset read from a part.</param>
/// <param name="pdfScanner">Scanner used to read cross reference stream objects.</param>
/// <param name="tokenScanner">Scanner used to inspect the first token at each location and to read tables.</param>
/// <returns>The table built from all parts, after its object offsets have been checked.</returns>
/// <exception cref="PdfDocumentFormatException">
/// The cross references point back at a location which was already queued, no table or stream was found
/// after the maximum number of consecutive misses, or (when not lenient) a /XRefStm offset was not positive.
/// </exception>
public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation,
    long offsetCorrection,
    IPdfTokenScanner pdfScanner,
    ISeekableTokenScanner tokenScanner)
{
    // Number of consecutive tokens which may be skipped while looking for a table or stream
    // before the search is abandoned. Used by both the loop condition and the final check.
    const int maximumMissedAttempts = 100;

    long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceLocation, tokenScanner, bytes, isLenientParsing);
    if (fixedOffset > -1)
    {
        // The validator produced a usable offset, prefer it to the one supplied by the caller.
        crossReferenceLocation = fixedOffset;

        log.Debug($"Found the first cross reference table or stream at {fixedOffset}.");
    }

    var table = new CrossReferenceTableBuilder();

    // Every "next" location which has been queued, used to detect cyclic references.
    var prevSet = new HashSet<long>();
    long previousCrossReferenceLocation = crossReferenceLocation;

    var missedAttempts = 0;

    // Parse all cross reference tables and streams.
    while (previousCrossReferenceLocation > 0 && missedAttempts < maximumMissedAttempts)
    {
        log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}.");

        if (previousCrossReferenceLocation >= bytes.Length)
        {
            // The offset lies outside the document, there is nothing further to read.
            break;
        }

        // Seek to the cross reference and read its first token to find out what kind it is.
        tokenScanner.Seek(previousCrossReferenceLocation);
        tokenScanner.MoveNext();

        if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
        {
            missedAttempts = 0;

            log.Debug("Element was cross reference table.");

            CrossReferenceTablePart tablePart = crossReferenceTableParser.Parse(tokenScanner,
                previousCrossReferenceLocation, isLenientParsing);

            // Work out where the next part lives before looking at any attached stream.
            var nextOffset = tablePart.GetPreviousOffset();
            if (nextOffset >= 0)
            {
                nextOffset += offsetCorrection;
            }

            previousCrossReferenceLocation = nextOffset;

            DictionaryToken tableDictionary = tablePart.Dictionary;
            CrossReferenceTablePart streamPart = null;

            // Check for a XRef stream, it may contain some object ids of compressed objects.
            if (tableDictionary.ContainsKey(NameToken.XrefStm))
            {
                log.Debug("Cross reference table contained referenced to stream. Reading the stream.");

                // Held as a long so that a corrected offset from the validator is never narrowed:
                // an unchecked cast to int would silently wrap for offsets above int.MaxValue.
                long streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;

                // Check the xref stream reference.
                fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing);
                if (fixedOffset > -1 && fixedOffset != streamOffset)
                {
                    log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");
                    streamOffset = fixedOffset;

                    // Rebuild the dictionary and the part so that both carry the corrected stream offset.
                    tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset));
                    tablePart = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset,
                        tablePart.Previous, tableDictionary, tablePart.Type);
                }

                // Read the stream from the table.
                if (streamOffset > 0)
                {
                    try
                    {
                        streamPart = ParseCrossReferenceStream(streamOffset, pdfScanner);
                    }
                    catch (InvalidOperationException ex)
                    {
                        if (isLenientParsing)
                        {
                            // Best effort: carry on with the table alone.
                            log.Error("Failed to parse /XRefStm at offset " + streamOffset, ex);
                        }
                        else
                        {
                            throw;
                        }
                    }
                }
                else
                {
                    if (isLenientParsing)
                    {
                        log.Error("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                    }
                    else
                    {
                        throw new PdfDocumentFormatException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                    }
                }
            }

            // The table is added first, followed by the stream it referenced (if one was read).
            table.Add(tablePart);

            if (streamPart != null)
            {
                table.Add(streamPart);
            }
        }
        else if (tokenScanner.CurrentToken is NumericToken)
        {
            log.Debug("Element was cross reference stream.");

            missedAttempts = 0;

            // Unread the numeric token.
            tokenScanner.Seek(previousCrossReferenceLocation);

            // Parse the xref stream.
            var tablePart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
            table.Add(tablePart);

            previousCrossReferenceLocation = tablePart.Previous;
            if (previousCrossReferenceLocation >= 0)
            {
                previousCrossReferenceLocation += offsetCorrection;
            }

            if (previousCrossReferenceLocation > 0)
            {
                // Check the xref table reference and record the corrected value on the part.
                fixedOffset = offsetValidator.CheckXRefOffset(previousCrossReferenceLocation, tokenScanner, bytes, isLenientParsing);
                if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
                {
                    previousCrossReferenceLocation = fixedOffset;
                    tablePart.FixOffset(previousCrossReferenceLocation);
                }
            }
        }
        else
        {
            log.Debug($"The cross reference found at this location ({previousCrossReferenceLocation}) was not a table or stream. " +
                      $"Found token ({tokenScanner.CurrentToken}) ending at {tokenScanner.CurrentPosition} instead. Seeking next token.");

            // Retry from the end of the unexpected token. Skipped locations are deliberately
            // not recorded for loop detection.
            previousCrossReferenceLocation = tokenScanner.CurrentPosition;
            missedAttempts++;

            continue;
        }

        // HashSet.Add returns false when the value is already present, meaning this location was queued before.
        if (!prevSet.Add(previousCrossReferenceLocation))
        {
            throw new PdfDocumentFormatException("The cross references formed an infinite loop.");
        }
    }

    if (missedAttempts == maximumMissedAttempts)
    {
        // TODO: scan the document to find the correct token.
        throw new PdfDocumentFormatException("The cross reference was not found.");
    }

    var resolved = table.Build(crossReferenceLocation, log);

    // Check the offsets of all referenced objects.
    xrefCosChecker.CheckCrossReferenceOffsets(bytes, resolved, isLenientParsing);

    return resolved;
}
2018-01-21 02:42:29 +08:00
/// <summary>
/// Reads the object located at <paramref name="objByteOffset"/> and parses it as a cross reference stream.
/// </summary>
/// <param name="objByteOffset">The offset the scanner is moved to before reading the object.</param>
/// <param name="pdfScanner">The scanner used to read the object token.</param>
/// <returns>The cross reference part produced by the stream parser.</returns>
/// <exception cref="PdfDocumentFormatException">
/// The token read at the offset is not an object token, or the object it wraps is not a stream.
/// </exception>
private CrossReferenceTablePart ParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner)
{
    pdfScanner.Seek(objByteOffset);
    pdfScanner.MoveNext();

    var currentToken = pdfScanner.CurrentToken;

    // A soft cast is used on purpose: a hard cast would raise InvalidCastException for any other
    // kind of token, bypassing the null check and the format exception below.
    var streamObjectToken = currentToken as ObjectToken;

    if (streamObjectToken == null || !(streamObjectToken.Data is StreamToken objectStream))
    {
        // Report the wrapped object when there is one, otherwise whatever token was actually read.
        object found = (object)streamObjectToken?.Data ?? currentToken;

        throw new PdfDocumentFormatException($"When reading a cross reference stream object found a non-stream object: {found}");
    }

    return crossReferenceStreamParser.Parse(objByteOffset, objectStream);
}
}
}