2018-01-11 03:49:32 +08:00
|
|
|
|
namespace UglyToad.PdfPig.Parser.FileStructure
|
2017-11-10 03:14:09 +08:00
|
|
|
|
{
|
|
|
|
|
using System;
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
using Cos;
|
2018-01-05 05:09:47 +08:00
|
|
|
|
using Exceptions;
|
2017-11-10 03:14:09 +08:00
|
|
|
|
using IO;
|
|
|
|
|
using Logging;
|
2018-01-05 05:09:47 +08:00
|
|
|
|
using Parts;
|
|
|
|
|
using Parts.CrossReference;
|
|
|
|
|
using Tokenization.Scanner;
|
|
|
|
|
using Tokenization.Tokens;
|
2017-11-10 03:14:09 +08:00
|
|
|
|
|
2018-01-05 05:09:47 +08:00
|
|
|
|
internal class CrossReferenceParser
|
2017-11-10 03:14:09 +08:00
|
|
|
|
{
|
|
|
|
|
private readonly ILog log;
|
|
|
|
|
private readonly CosDictionaryParser dictionaryParser;
|
|
|
|
|
private readonly CosBaseParser baseParser;
|
|
|
|
|
private readonly CrossReferenceStreamParser crossReferenceStreamParser;
|
|
|
|
|
private readonly CrossReferenceTableParser crossReferenceTableParser;
|
2018-01-21 02:42:29 +08:00
|
|
|
|
private readonly XrefCosOffsetChecker xrefCosChecker;
|
2017-11-10 03:14:09 +08:00
|
|
|
|
|
2018-01-21 02:42:29 +08:00
|
|
|
|
/// <summary>
/// Creates a parser which reads the cross reference tables and streams of a document.
/// </summary>
/// <param name="log">Receives the debug, warning and error messages written while parsing.</param>
/// <param name="dictionaryParser">Handed to the offset validator created for each parse.</param>
/// <param name="baseParser">Handed to the offset validator created for each parse.</param>
/// <param name="crossReferenceStreamParser">Parses cross reference streams.</param>
/// <param name="crossReferenceTableParser">Parses cross reference tables.</param>
public CrossReferenceParser(
    ILog log,
    CosDictionaryParser dictionaryParser,
    CosBaseParser baseParser,
    CrossReferenceStreamParser crossReferenceStreamParser,
    CrossReferenceTableParser crossReferenceTableParser)
{
    // The object offset checker is owned by this parser rather than injected.
    xrefCosChecker = new XrefCosOffsetChecker();

    // Section parsers.
    this.crossReferenceTableParser = crossReferenceTableParser;
    this.crossReferenceStreamParser = crossReferenceStreamParser;

    // Collaborators forwarded to the offset validator.
    this.baseParser = baseParser;
    this.dictionaryParser = dictionaryParser;

    this.log = log;
}
|
2018-01-21 02:42:29 +08:00
|
|
|
|
|
2017-11-10 03:14:09 +08:00
|
|
|
|
/// <summary>
/// Reads every cross reference table and cross reference stream of the document, starting at
/// <paramref name="xrefLocation"/> and following the previous-section offset of each part until it is
/// no longer greater than zero, then builds the combined <see cref="CrossReferenceTable"/>.
/// </summary>
/// <param name="reader">The document bytes; passed to the offset validator and the object offset checker.</param>
/// <param name="isLenientParsing">
/// When <see langword="true"/> a /XRefStm entry which cannot be read is logged and skipped instead of failing the parse.
/// </param>
/// <param name="xrefLocation">
/// The offset of the first cross reference section. It is replaced by the validator's result when that result is greater than -1.
/// </param>
/// <param name="pool">Passed to the offset validator.</param>
/// <param name="pdfScanner">Used to read cross reference stream objects.</param>
/// <param name="tokenScanner">Used to detect the kind of each section and to read cross reference tables.</param>
/// <returns>The cross reference table built from all sections which were read.</returns>
/// <exception cref="PdfDocumentFormatException">
/// A section is neither a table nor a stream, the chain of sections visits the same offset twice, or
/// (when not lenient) a /XRefStm offset is not greater than zero.
/// </exception>
public CrossReferenceTable Parse(IRandomAccessRead reader, bool isLenientParsing, long xrefLocation,
    CosObjectPool pool, IPdfTokenScanner pdfScanner, ISeekableTokenScanner tokenScanner)
{
    var xrefOffsetValidator = new XrefOffsetValidator(log, reader, dictionaryParser, baseParser, pool);
    long fixedOffset = xrefOffsetValidator.CheckXRefOffset(xrefLocation, isLenientParsing);
    if (fixedOffset > -1)
    {
        xrefLocation = fixedOffset;

        log.Debug($"Found the first cross reference table or stream at {fixedOffset}.");
    }

    var table = new CrossReferenceTableBuilder();

    long previousCrossReferenceLocation = xrefLocation;

    // Offsets already scheduled for reading. The starting offset is registered up front so that a chain
    // which points back at the first section is rejected before that section is parsed a second time.
    var prevSet = new HashSet<long> { xrefLocation };

    // Parse all cross reference tables and streams.
    while (previousCrossReferenceLocation > 0)
    {
        log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}.");

        // Seek to the section and read its first token to find out what kind of section it is.
        tokenScanner.Seek(previousCrossReferenceLocation);

        tokenScanner.MoveNext();

        if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
        {
            log.Debug("Element was cross reference table.");

            CrossReferenceTablePart tablePart = crossReferenceTableParser.Parse(tokenScanner,
                previousCrossReferenceLocation, isLenientParsing);

            previousCrossReferenceLocation = tablePart.GetPreviousOffset();

            DictionaryToken tableDictionary = tablePart.Dictionary;

            CrossReferenceTablePart streamPart = null;

            // Check for an XRef stream, it may contain some object ids of compressed objects.
            if (tableDictionary.ContainsKey(NameToken.XrefStm))
            {
                log.Debug("Cross reference table contained referenced to stream. Reading the stream.");

                // NOTE(review): a /XRefStm value which is not numeric makes this cast throw
                // InvalidCastException, and the offset is narrowed to an int here and below.
                int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;

                // Check the xref stream reference.
                fixedOffset = xrefOffsetValidator.CheckXRefOffset(streamOffset, isLenientParsing);
                if (fixedOffset > -1 && fixedOffset != streamOffset)
                {
                    log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");

                    streamOffset = (int)fixedOffset;

                    // Rebuild the table part so that it carries the corrected /XRefStm value.
                    tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset));
                    tablePart = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset,
                        tablePart.Previous, tableDictionary, tablePart.Type);
                }

                // Read the stream from the table.
                if (streamOffset > 0)
                {
                    try
                    {
                        streamPart = ParseCrossReferenceStream(streamOffset, pdfScanner);
                    }
                    catch (Exception ex) when (isLenientParsing
                                               && (ex is InvalidOperationException || ex is PdfDocumentFormatException))
                    {
                        // ParseCrossReferenceStream reports a non-stream object with PdfDocumentFormatException,
                        // so that type has to be tolerated here for lenient parsing to skip a broken /XRefStm.
                        // When parsing is not lenient the filter does not match and the exception propagates unchanged.
                        log.Error("Failed to parse /XRefStm at offset " + streamOffset, ex);
                    }
                }
                else
                {
                    if (isLenientParsing)
                    {
                        log.Error("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                    }
                    else
                    {
                        throw new PdfDocumentFormatException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                    }
                }
            }

            table.Add(tablePart);

            if (streamPart != null)
            {
                table.Add(streamPart);
            }
        }
        else if (tokenScanner.CurrentToken is NumericToken)
        {
            log.Debug("Element was cross reference stream.");

            // Unread the numeric token.
            tokenScanner.Seek(previousCrossReferenceLocation);

            // Parse the xref stream.
            var tablePart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
            table.Add(tablePart);

            previousCrossReferenceLocation = tablePart.Previous;
            if (previousCrossReferenceLocation > 0)
            {
                // Check the reference to the previous section and store the corrected value on the part.
                fixedOffset = xrefOffsetValidator.CheckXRefOffset(previousCrossReferenceLocation, isLenientParsing);
                if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
                {
                    previousCrossReferenceLocation = fixedOffset;
                    tablePart.FixOffset(previousCrossReferenceLocation);
                }
            }
        }
        else
        {
            log.Debug("Element was invalid.");

            throw new PdfDocumentFormatException("The cross reference found at this location was not a " +
                                                 $"table or a stream: Location - {previousCrossReferenceLocation}, {tokenScanner.CurrentPosition}.");
        }

        // HashSet.Add returns false when the offset was already registered, which means the chain loops.
        if (!prevSet.Add(previousCrossReferenceLocation))
        {
            throw new PdfDocumentFormatException("The cross references formed an infinite loop.");
        }
    }

    var resolved = table.Build(xrefLocation, log);

    // Check the offsets of all referenced objects.
    xrefCosChecker.checkXrefOffsets(reader, resolved, isLenientParsing);

    return resolved;
}
|
2018-01-21 02:42:29 +08:00
|
|
|
|
|
|
|
|
|
/// <summary>
/// Reads the token at <paramref name="objByteOffset"/> and parses the stream it contains as a cross reference stream.
/// </summary>
/// <param name="objByteOffset">The offset the scanner is moved to before reading; also passed to the stream parser.</param>
/// <param name="pdfScanner">The scanner used to read the object.</param>
/// <returns>The cross reference part produced by the cross reference stream parser.</returns>
/// <exception cref="PdfDocumentFormatException">
/// The token at the offset is missing, is not an object, or is an object whose data is not a stream.
/// </exception>
private CrossReferenceTablePart ParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner)
{
    pdfScanner.Seek(objByteOffset);

    pdfScanner.MoveNext();

    // 'as' instead of a hard cast: a token of any other type must produce the format exception below
    // rather than an InvalidCastException.
    var streamObjectToken = pdfScanner.CurrentToken as ObjectToken;

    if (streamObjectToken == null || !(streamObjectToken.Data is StreamToken objectStream))
    {
        // Report the object's data when there is one, otherwise whatever token was actually read.
        object found = (object)streamObjectToken?.Data ?? pdfScanner.CurrentToken;

        throw new PdfDocumentFormatException($"When reading a cross reference stream object found a non-stream object: {found}");
    }

    return crossReferenceStreamParser.Parse(objByteOffset, objectStream);
}
|
|
|
|
|
}
|
|
|
|
|
}
|