PdfPig/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs

202 lines
8.6 KiB
C#
Raw Normal View History

namespace UglyToad.PdfPig.Parser.FileStructure
{
using System;
using System.Collections.Generic;
using Cos;
using Exceptions;
using IO;
using Logging;
using Parts;
using Parts.CrossReference;
using Tokenization.Scanner;
using Tokenization.Tokens;
/// <summary>
/// Reads the chain of cross reference tables and cross reference streams of a PDF file,
/// starting at a given offset and following each section's "previous" offset until it
/// is no longer positive, then merges the sections into a single <see cref="CrossReferenceTable"/>.
/// </summary>
internal class CrossReferenceParser
{
    private readonly ILog log;
    private readonly CosDictionaryParser dictionaryParser;
    private readonly CosBaseParser baseParser;
    private readonly CrossReferenceStreamParser crossReferenceStreamParser;
    private readonly CrossReferenceTableParser crossReferenceTableParser;
    private readonly XrefCosOffsetChecker xrefCosChecker;

    /// <summary>
    /// Creates a new <see cref="CrossReferenceParser"/> from the parsers it delegates to.
    /// </summary>
    /// <param name="log">Receives debug, warning and error messages produced while parsing.</param>
    /// <param name="dictionaryParser">Passed on to the <see cref="XrefOffsetValidator"/> created for each parse.</param>
    /// <param name="baseParser">Passed on to the <see cref="XrefOffsetValidator"/> created for each parse.</param>
    /// <param name="crossReferenceStreamParser">Parses sections stored as cross reference streams.</param>
    /// <param name="crossReferenceTableParser">Parses sections stored as "xref" tables.</param>
    public CrossReferenceParser(ILog log, CosDictionaryParser dictionaryParser, CosBaseParser baseParser,
        CrossReferenceStreamParser crossReferenceStreamParser,
        CrossReferenceTableParser crossReferenceTableParser)
    {
        this.log = log;
        this.dictionaryParser = dictionaryParser;
        this.baseParser = baseParser;
        this.crossReferenceStreamParser = crossReferenceStreamParser;
        this.crossReferenceTableParser = crossReferenceTableParser;

        xrefCosChecker = new XrefCosOffsetChecker();
    }

    /// <summary>
    /// Parses every cross reference table/stream reachable from <paramref name="xrefLocation"/>
    /// and builds the resolved cross reference table.
    /// </summary>
    /// <param name="reader">The underlying file access, handed to the offset validator and the object offset checker.</param>
    /// <param name="isLenientParsing">
    /// When <see langword="true"/>, a broken /XRefStm reference is logged and skipped instead of aborting the parse.
    /// </param>
    /// <param name="xrefLocation">The offset at which the first cross reference section is expected.</param>
    /// <param name="pool">Handed to the offset validator.</param>
    /// <param name="pdfScanner">Scanner used to read cross reference stream objects.</param>
    /// <param name="tokenScanner">Scanner used to inspect the first token of each section and to read tables.</param>
    /// <returns>The table built from all sections that were read.</returns>
    /// <exception cref="PdfDocumentFormatException">
    /// Thrown when a section is neither a table nor a stream, when the previous offsets revisit a
    /// location, or (in non-lenient mode) when the /XRefStm offset is not positive.
    /// </exception>
    public CrossReferenceTable Parse(IRandomAccessRead reader, bool isLenientParsing, long xrefLocation,
        CosObjectPool pool, IPdfTokenScanner pdfScanner, ISeekableTokenScanner tokenScanner)
    {
        var xrefOffsetValidator = new XrefOffsetValidator(log, reader, dictionaryParser, baseParser, pool);

        // A result greater than -1 is taken as the offset to use in place of the one supplied.
        long fixedOffset = xrefOffsetValidator.CheckXRefOffset(xrefLocation, isLenientParsing);
        if (fixedOffset > -1)
        {
            xrefLocation = fixedOffset;

            log.Debug($"Found the first cross reference table or stream at {fixedOffset}.");
        }

        var table = new CrossReferenceTableBuilder();

        // Every location that has been (or is about to be) parsed. Seeding it with the start
        // means a chain pointing back to the first section is rejected before it is parsed twice.
        var prevSet = new HashSet<long> { xrefLocation };

        long previousCrossReferenceLocation = xrefLocation;

        // Parse all cross reference tables and streams.
        while (previousCrossReferenceLocation > 0)
        {
            log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}.");

            // Seek to the section and read its first token to find out what kind it is.
            tokenScanner.Seek(previousCrossReferenceLocation);
            tokenScanner.MoveNext();

            if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
            {
                log.Debug("Element was cross reference table.");

                CrossReferenceTablePart tablePart = crossReferenceTableParser.Parse(tokenScanner,
                    previousCrossReferenceLocation, isLenientParsing);

                previousCrossReferenceLocation = tablePart.GetPreviousOffset();

                DictionaryToken tableDictionary = tablePart.Dictionary;

                CrossReferenceTablePart streamPart = null;

                // Check for an XRef stream, it may contain some object ids of compressed objects.
                if (tableDictionary.ContainsKey(NameToken.XrefStm))
                {
                    log.Debug("Cross reference table contained referenced to stream. Reading the stream.");

                    // NOTE(review): the offset is held as an int here, so values beyond int.MaxValue
                    // would not survive the narrowing below - confirm whether larger files must be supported.
                    int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;

                    // Check the xref stream reference.
                    fixedOffset = xrefOffsetValidator.CheckXRefOffset(streamOffset, isLenientParsing);
                    if (fixedOffset > -1 && fixedOffset != streamOffset)
                    {
                        log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");

                        streamOffset = (int)fixedOffset;

                        // Rebuild the table part so that it carries the corrected stream offset.
                        tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset));
                        tablePart = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset,
                            tablePart.Previous, tableDictionary, tablePart.Type);
                    }

                    // Read the stream referenced from the table.
                    if (streamOffset > 0)
                    {
                        try
                        {
                            streamPart = ParseCrossReferenceStream(streamOffset, pdfScanner);
                        }
                        catch (Exception ex) when (isLenientParsing
                                                   && (ex is InvalidOperationException || ex is PdfDocumentFormatException))
                        {
                            // Lenient mode: the table itself is still usable, so record the failure and carry on.
                            // In strict mode the filter does not match and the exception propagates untouched.
                            log.Error("Failed to parse /XRefStm at offset " + streamOffset, ex);
                        }
                    }
                    else
                    {
                        if (isLenientParsing)
                        {
                            log.Error("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                        }
                        else
                        {
                            throw new PdfDocumentFormatException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                        }
                    }
                }

                table.Add(tablePart);

                if (streamPart != null)
                {
                    table.Add(streamPart);
                }
            }
            else if (tokenScanner.CurrentToken is NumericToken)
            {
                log.Debug("Element was cross reference stream.");

                // Unread the numeric token.
                tokenScanner.Seek(previousCrossReferenceLocation);

                // Parse the xref stream.
                var tablePart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
                table.Add(tablePart);

                previousCrossReferenceLocation = tablePart.Previous;
                if (previousCrossReferenceLocation > 0)
                {
                    // Check the reference to the previous section and correct it if needed.
                    fixedOffset = xrefOffsetValidator.CheckXRefOffset(previousCrossReferenceLocation, isLenientParsing);
                    if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
                    {
                        previousCrossReferenceLocation = fixedOffset;
                        tablePart.FixOffset(previousCrossReferenceLocation);
                    }
                }
            }
            else
            {
                log.Debug("Element was invalid.");

                throw new PdfDocumentFormatException("The cross reference found at this location was not a " +
                                                     $"table or a stream: Location - {previousCrossReferenceLocation}, {tokenScanner.CurrentPosition}.");
            }

            // HashSet.Add returns false when the value is already present, i.e. the chain loops.
            if (!prevSet.Add(previousCrossReferenceLocation))
            {
                throw new PdfDocumentFormatException("The cross references formed an infinite loop.");
            }
        }

        var resolved = table.Build(xrefLocation, log);

        // Check the offsets of all referenced objects.
        xrefCosChecker.checkXrefOffsets(reader, resolved, isLenientParsing);

        return resolved;
    }

    /// <summary>
    /// Reads the object at <paramref name="objByteOffset"/> and parses it as a cross reference stream.
    /// </summary>
    /// <param name="objByteOffset">The offset of the object holding the stream.</param>
    /// <param name="pdfScanner">Scanner used to read the object.</param>
    /// <returns>The cross reference part produced by the stream parser.</returns>
    /// <exception cref="PdfDocumentFormatException">
    /// Thrown when the token at the offset is not an object whose data is a stream.
    /// </exception>
    private CrossReferenceTablePart ParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner)
    {
        pdfScanner.Seek(objByteOffset);
        pdfScanner.MoveNext();

        // A safe cast: anything other than an object token must surface as a format error,
        // not as an InvalidCastException.
        var streamObjectToken = pdfScanner.CurrentToken as ObjectToken;

        if (streamObjectToken == null || !(streamObjectToken.Data is StreamToken objectStream))
        {
            // Report the object's data when there is one, otherwise whatever token was found.
            var found = (object)streamObjectToken?.Data ?? pdfScanner.CurrentToken;

            throw new PdfDocumentFormatException($"When reading a cross reference stream object found a non-stream object: {found}");
        }

        return crossReferenceStreamParser.Parse(objByteOffset, objectStream);
    }
}
}