2018-01-11 03:49:32 +08:00
|
|
|
|
namespace UglyToad.PdfPig.Parser.FileStructure
|
2017-11-10 03:14:09 +08:00
|
|
|
|
{
|
|
|
|
|
using System;
|
|
|
|
|
using System.Collections.Generic;
|
2020-01-05 00:38:18 +08:00
|
|
|
|
using Core;
|
2018-11-25 03:02:06 +08:00
|
|
|
|
using CrossReference;
|
2018-01-05 05:09:47 +08:00
|
|
|
|
using Exceptions;
|
2017-11-10 03:14:09 +08:00
|
|
|
|
using Logging;
|
2018-01-05 05:09:47 +08:00
|
|
|
|
using Parts.CrossReference;
|
|
|
|
|
using Tokenization.Scanner;
|
2018-11-17 04:00:12 +08:00
|
|
|
|
using Tokens;
|
2017-11-10 03:14:09 +08:00
|
|
|
|
|
2018-01-05 05:09:47 +08:00
|
|
|
|
/// <summary>
/// Walks the chain of cross reference tables and cross reference streams in a document, starting from a given
/// location and following each part's previous offset, and builds the resulting <see cref="CrossReferenceTable"/>.
/// </summary>
internal class CrossReferenceParser
{
    /// <summary>
    /// The number of consecutive locations which may fail to contain a table or stream before parsing gives up.
    /// </summary>
    private const int MaximumMissedAttempts = 100;

    private readonly ILog log;
    private readonly XrefOffsetValidator offsetValidator;
    private readonly CrossReferenceStreamParser crossReferenceStreamParser;
    private readonly CrossReferenceTableParser crossReferenceTableParser;
    private readonly XrefCosOffsetChecker xrefCosChecker;

    /// <summary>
    /// Create a new <see cref="CrossReferenceParser"/> from its collaborators.
    /// </summary>
    /// <param name="log">Receives debug, warning and error messages produced while parsing.</param>
    /// <param name="offsetValidator">Checks (and possibly corrects) offsets before they are followed.</param>
    /// <param name="xrefCosChecker">Checks the object offsets of the fully built table.</param>
    /// <param name="crossReferenceStreamParser">Parses a cross reference stream object into a table part.</param>
    /// <param name="crossReferenceTableParser">Parses a classic "xref" table into a table part.</param>
    public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator,
        XrefCosOffsetChecker xrefCosChecker,
        CrossReferenceStreamParser crossReferenceStreamParser,
        CrossReferenceTableParser crossReferenceTableParser)
    {
        this.log = log;
        this.offsetValidator = offsetValidator;
        this.crossReferenceStreamParser = crossReferenceStreamParser;
        this.crossReferenceTableParser = crossReferenceTableParser;
        this.xrefCosChecker = xrefCosChecker;
    }

    /// <summary>
    /// Parse every cross reference table and stream reachable from <paramref name="crossReferenceLocation"/>
    /// and combine them into a single <see cref="CrossReferenceTable"/>.
    /// </summary>
    /// <param name="bytes">The document bytes; used for the length check and passed to the offset validators.</param>
    /// <param name="isLenientParsing">
    /// When <see langword="true"/> a failure to read the stream referenced by /XRefStm is logged instead of thrown.
    /// The flag is also forwarded to the table parser and the offset validators.
    /// </param>
    /// <param name="crossReferenceLocation">
    /// The offset of the first cross reference table or stream. Replaced by the validator's result when that
    /// result is greater than -1.
    /// </param>
    /// <param name="offsetCorrection">Added to every non-negative previous offset read from a table or stream.</param>
    /// <param name="pdfScanner">The scanner used to read cross reference stream objects.</param>
    /// <param name="tokenScanner">The scanner used to inspect the token found at each candidate location.</param>
    /// <returns>The table built from all parts which were read.</returns>
    /// <exception cref="PdfDocumentFormatException">
    /// The previous offsets form a loop, no table or stream was found within
    /// <see cref="MaximumMissedAttempts"/> consecutive attempts, or (when not lenient) the /XRefStm offset
    /// is not positive.
    /// </exception>
    public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation,
        long offsetCorrection,
        IPdfTokenScanner pdfScanner,
        ISeekableTokenScanner tokenScanner)
    {
        long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceLocation, tokenScanner, bytes, isLenientParsing);
        if (fixedOffset > -1)
        {
            crossReferenceLocation = fixedOffset;

            log.Debug($"Found the first cross reference table or stream at {fixedOffset}.");
        }

        var table = new CrossReferenceTableBuilder();

        // Offsets which have already been queued for reading, used to detect cyclic previous offsets.
        var prevSet = new HashSet<long>();
        long previousCrossReferenceLocation = crossReferenceLocation;

        // Counts consecutive locations where neither a table nor a stream was found.
        var missedAttempts = 0;

        // Parse all cross reference tables and streams.
        while (previousCrossReferenceLocation > 0 && missedAttempts < MaximumMissedAttempts)
        {
            log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}.");

            // An offset at or beyond the end of the input cannot be read, stop following the chain.
            if (previousCrossReferenceLocation >= bytes.Length)
            {
                break;
            }

            // Seek to the candidate location and read the first token there.
            tokenScanner.Seek(previousCrossReferenceLocation);

            tokenScanner.MoveNext();

            if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
            {
                missedAttempts = 0;
                log.Debug("Element was cross reference table.");

                CrossReferenceTablePart tablePart = crossReferenceTableParser.Parse(tokenScanner,
                    previousCrossReferenceLocation, isLenientParsing);

                var nextOffset = tablePart.GetPreviousOffset();

                // Only real (non-negative) offsets are shifted by the correction.
                if (nextOffset >= 0)
                {
                    nextOffset += offsetCorrection;
                }

                previousCrossReferenceLocation = nextOffset;

                DictionaryToken tableDictionary = tablePart.Dictionary;

                CrossReferenceTablePart streamPart = null;

                // Check for a XRef stream, it may contain some object ids of compressed objects.
                if (tableDictionary.ContainsKey(NameToken.XrefStm))
                {
                    log.Debug("Cross reference table contained referenced to stream. Reading the stream.");

                    int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;

                    // Check the xref stream reference.
                    fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing);
                    if (fixedOffset > -1 && fixedOffset != streamOffset)
                    {
                        log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");

                        streamOffset = (int)fixedOffset;

                        // Rebuild the table part so that its dictionary carries the corrected /XRefStm value;
                        // the corrected offset is also passed as the second constructor argument.
                        tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset));
                        tablePart = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset,
                            tablePart.Previous, tableDictionary, tablePart.Type);
                    }

                    // Read the stream from the table.
                    if (streamOffset > 0)
                    {
                        try
                        {
                            streamPart = ParseCrossReferenceStream(streamOffset, pdfScanner);
                        }
                        catch (InvalidOperationException ex)
                        {
                            if (isLenientParsing)
                            {
                                log.Error("Failed to parse /XRefStm at offset " + streamOffset, ex);
                            }
                            else
                            {
                                throw;
                            }
                        }
                    }
                    else
                    {
                        if (isLenientParsing)
                        {
                            log.Error("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                        }
                        else
                        {
                            throw new PdfDocumentFormatException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                        }
                    }
                }

                table.Add(tablePart);

                if (streamPart != null)
                {
                    table.Add(streamPart);
                }
            }
            else if (tokenScanner.CurrentToken is NumericToken)
            {
                log.Debug("Element was cross reference stream.");

                missedAttempts = 0;

                // Unread the numeric token.
                tokenScanner.Seek(previousCrossReferenceLocation);

                // Parse the cross reference stream.
                var tablePart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
                table.Add(tablePart);

                previousCrossReferenceLocation = tablePart.Previous;

                // Only real (non-negative) offsets are shifted by the correction.
                if (previousCrossReferenceLocation >= 0)
                {
                    previousCrossReferenceLocation += offsetCorrection;
                }

                if (previousCrossReferenceLocation > 0)
                {
                    // Check the xref table reference.
                    fixedOffset = offsetValidator.CheckXRefOffset(previousCrossReferenceLocation, tokenScanner, bytes, isLenientParsing);
                    if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
                    {
                        previousCrossReferenceLocation = fixedOffset;
                        tablePart.FixOffset(previousCrossReferenceLocation);
                    }
                }
            }
            else
            {
                log.Debug($"The cross reference found at this location ({previousCrossReferenceLocation}) was not a table or stream. " +
                          $"Found token ({tokenScanner.CurrentToken}) ending at {tokenScanner.CurrentPosition} instead. Seeking next token.");

                // Retry from where the unexpected token ended. Missed locations are not recorded in prevSet.
                previousCrossReferenceLocation = tokenScanner.CurrentPosition;

                missedAttempts++;

                continue;
            }

            if (prevSet.Contains(previousCrossReferenceLocation))
            {
                throw new PdfDocumentFormatException("The cross references formed an infinite loop.");
            }

            prevSet.Add(previousCrossReferenceLocation);
        }

        if (missedAttempts == MaximumMissedAttempts)
        {
            // TODO: scan the document to find the correct token.
            throw new PdfDocumentFormatException("The cross reference was not found.");
        }

        var resolved = table.Build(crossReferenceLocation, log);

        // Check the offsets of all referenced objects.
        xrefCosChecker.CheckCrossReferenceOffsets(bytes, resolved, isLenientParsing);

        return resolved;
    }

    /// <summary>
    /// Read the object at <paramref name="objByteOffset"/> and parse it as a cross reference stream.
    /// </summary>
    /// <param name="objByteOffset">The offset to seek <paramref name="pdfScanner"/> to before reading.</param>
    /// <param name="pdfScanner">The scanner used to read the object.</param>
    /// <returns>The table part produced by the cross reference stream parser.</returns>
    /// <exception cref="PdfDocumentFormatException">
    /// The token read at the offset is not an object whose data is a stream.
    /// </exception>
    private CrossReferenceTablePart ParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner)
    {
        pdfScanner.Seek(objByteOffset);

        pdfScanner.MoveNext();

        // Use a safe cast: a token of any other type must surface as the format exception below
        // rather than as an InvalidCastException.
        var streamObjectToken = pdfScanner.CurrentToken as ObjectToken;

        if (streamObjectToken == null || !(streamObjectToken.Data is StreamToken objectStream))
        {
            throw new PdfDocumentFormatException($"When reading a cross reference stream object found a non-stream object: {streamObjectToken?.Data}");
        }

        CrossReferenceTablePart xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, objectStream);

        return xrefTablePart;
    }
}
|
|
|
|
|
}
|