PdfPig/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs
2024-04-12 07:42:19 +01:00

357 lines
13 KiB
C#

namespace UglyToad.PdfPig.Parser.FileStructure
{
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using Core;
using CrossReference;
using Logging;
using Parts.CrossReference;
using Tokenization.Scanner;
using Tokens;
internal class CrossReferenceParser
{
// Collaborators handed in through the constructor; each is assigned exactly once.
private readonly CrossReferenceStreamParser crossReferenceStreamParser;
private readonly XrefOffsetValidator offsetValidator;
private readonly ILog log;

/// <summary>
/// Creates a parser which reads cross reference tables and cross reference streams.
/// </summary>
/// <param name="log">Receives the debug, warning and error messages written while parsing.</param>
/// <param name="offsetValidator">Checks (and where possible corrects) cross reference offsets before they are used.</param>
/// <param name="crossReferenceStreamParser">Converts a stream object into a cross reference part.</param>
public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator,
    CrossReferenceStreamParser crossReferenceStreamParser)
{
    this.crossReferenceStreamParser = crossReferenceStreamParser;
    this.offsetValidator = offsetValidator;
    this.log = log;
}
/// <summary>
/// Reads every cross reference table and cross reference stream reachable from
/// <paramref name="crossReferenceLocation"/> by following the chain of previous offsets,
/// then combines the parts into a single <see cref="CrossReferenceTable"/>.
/// </summary>
/// <param name="bytes">The raw input; used for the length check, the neighbourhood search and offset validation.</param>
/// <param name="isLenientParsing">
/// When <c>true</c> problems with an /XRefStm entry are logged and the stream is skipped instead of throwing.
/// The flag is also forwarded to the offset validator and the table parser.
/// </param>
/// <param name="crossReferenceLocation">Offset of the first cross reference table or stream to read.</param>
/// <param name="offsetCorrection">Added to every non-negative previous offset read from a table or stream.</param>
/// <param name="pdfScanner">Object scanner used to read cross reference stream objects.</param>
/// <param name="tokenScanner">Token scanner used to inspect the token at each candidate offset and to parse tables.</param>
/// <returns>
/// The combined table. If validation of the object offsets reports a mismatch the returned table
/// carries the offsets produced by the validator instead.
/// </returns>
/// <exception cref="PdfDocumentFormatException">
/// The chain of previous offsets loops, no table or stream could be located after repeated attempts,
/// or (when not lenient) the /XRefStm entry is unusable.
/// </exception>
public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation,
    long offsetCorrection,
    IPdfTokenScanner pdfScanner,
    ISeekableTokenScanner tokenScanner)
{
    // Upper bound on consecutive failed attempts to find a table or stream; shared by the loop guard
    // and the failure check after the loop so the two cannot drift apart.
    const int maximumMissedAttempts = 100;

    long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceLocation, tokenScanner, bytes, isLenientParsing);
    if (fixedOffset > -1)
    {
        crossReferenceLocation = fixedOffset;

        log.Debug($"Found the first cross reference table or stream at {fixedOffset}.");
    }

    var table = new CrossReferenceTableBuilder();

    // Offsets that have already been queued as a "next" location, used to detect cycles.
    var prevSet = new HashSet<long>();
    long previousCrossReferenceLocation = crossReferenceLocation;

    var missedAttempts = 0;

    // Parse all cross reference tables and streams.
    while (previousCrossReferenceLocation > 0 && missedAttempts < maximumMissedAttempts)
    {
        log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}.");

        if (previousCrossReferenceLocation >= bytes.Length)
        {
            break;
        }

        // seek to xref table
        tokenScanner.Seek(previousCrossReferenceLocation);

        tokenScanner.MoveNext();

        if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
        {
            missedAttempts = 0;
            log.Debug("Element was cross reference table.");

            CrossReferenceTablePart tablePart = CrossReferenceTableParser.Parse(tokenScanner,
                previousCrossReferenceLocation, isLenientParsing);

            var nextOffset = tablePart.GetPreviousOffset();

            if (nextOffset >= 0)
            {
                nextOffset += offsetCorrection;
            }

            previousCrossReferenceLocation = nextOffset;

            DictionaryToken tableDictionary = tablePart.Dictionary;

            CrossReferenceTablePart? streamPart = null;

            // check for a XRef stream, it may contain some object ids of compressed objects
            if (tableDictionary.ContainsKey(NameToken.XrefStm))
            {
                log.Debug("Cross reference table contained reference to stream. Reading the stream.");

                var tiedToTableAtOffset = tablePart.Offset;

                // The entry comes straight from the file so it may be any token type; test the type
                // instead of hard casting so a malformed entry follows the same lenient/strict policy
                // as a corrupt offset rather than escaping as an InvalidCastException.
                if (tableDictionary.Data[NameToken.XrefStm] is NumericToken streamOffsetToken)
                {
                    int streamOffset = streamOffsetToken.Int;

                    // check the xref stream reference
                    fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing);
                    if (fixedOffset > -1 && fixedOffset != streamOffset)
                    {
                        log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");

                        streamOffset = (int)fixedOffset;

                        // Update the cross reference table to be a stream instead.
                        tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset));
                        tablePart = new CrossReferenceTablePart(
                            tablePart.ObjectOffsets,
                            streamOffset,
                            tablePart.Previous,
                            tableDictionary,
                            tablePart.Type,
                            tiedToTableAtOffset);
                    }

                    // Read the stream from the table.
                    if (streamOffset > 0)
                    {
                        try
                        {
                            // A false result leaves streamPart null, in which case only the table is added below.
                            TryParseCrossReferenceStream(streamOffset, pdfScanner, tiedToTableAtOffset, out streamPart);
                        }
                        catch (InvalidOperationException ex)
                        {
                            if (isLenientParsing)
                            {
                                log.Error("Failed to parse /XRefStm at offset " + streamOffset, ex);
                            }
                            else
                            {
                                throw;
                            }
                        }
                    }
                    else
                    {
                        if (isLenientParsing)
                        {
                            log.Error("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                        }
                        else
                        {
                            throw new PdfDocumentFormatException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
                        }
                    }
                }
                else if (isLenientParsing)
                {
                    log.Error("Skipped XRef stream because the /XRefStm entry was not a number.");
                }
                else
                {
                    throw new PdfDocumentFormatException("Skipped XRef stream because the /XRefStm entry was not a number.");
                }
            }

            table.Add(tablePart);

            if (streamPart != null)
            {
                table.Add(streamPart);
            }
        }
        else if (tokenScanner.CurrentToken is NumericToken)
        {
            log.Debug("Element was cross reference stream.");

            missedAttempts = 0;

            // Unread the numeric token.
            tokenScanner.Seek(previousCrossReferenceLocation);

            // parse xref stream
            if (!TryParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner, null, out var tablePart))
            {
                if (!TryBruteForceXrefTableLocate(bytes, previousCrossReferenceLocation, out var actualOffset))
                {
                    throw new PdfDocumentFormatException(
                        $"The object at offset {previousCrossReferenceLocation} was not a cross reference stream and no cross reference table could be located by searching the file.");
                }

                // Retry the loop at the location the search produced; this skips the cycle check below.
                previousCrossReferenceLocation = actualOffset;
                missedAttempts++;
                continue;
            }

            table.Add(tablePart);

            previousCrossReferenceLocation = tablePart.Previous;

            if (previousCrossReferenceLocation >= 0)
            {
                previousCrossReferenceLocation += offsetCorrection;
            }

            if (previousCrossReferenceLocation > 0)
            {
                // check the xref table reference
                fixedOffset = offsetValidator.CheckXRefOffset(previousCrossReferenceLocation, tokenScanner, bytes, isLenientParsing);
                if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
                {
                    previousCrossReferenceLocation = fixedOffset;
                    tablePart.FixOffset(previousCrossReferenceLocation);
                }
            }
        }
        else
        {
            log.Debug($"The cross reference found at this location ({previousCrossReferenceLocation}) was not a table or stream. " +
                      $"Found token ({tokenScanner.CurrentToken}) ending at {tokenScanner.CurrentPosition} instead. Seeking next token.");

            // Captured before the neighbourhood search moves the underlying input.
            var storedCurrentTokenScannerPosition = tokenScanner.CurrentPosition;

            if (missedAttempts == 0)
            {
                // We might only be a little bit out so let's just check the neighbourhood (for tables only).
                const int bufferSize = 128;
                var from = Math.Max(0, previousCrossReferenceLocation - bufferSize / 2);

                bytes.Seek(from);

                var buffer = new byte[bufferSize];

                bytes.Read(buffer);

                var content = OtherEncodings.BytesAsLatin1String(buffer);

                var xrefAt = content.IndexOf("xref", StringComparison.OrdinalIgnoreCase);

                if (xrefAt >= 0)
                {
                    previousCrossReferenceLocation = from + xrefAt;
                    missedAttempts++;
                    continue;
                }
            }

            // Nothing nearby: carry on from where the token scanner stopped and try the next token.
            previousCrossReferenceLocation = storedCurrentTokenScannerPosition;
            missedAttempts++;
            continue;
        }

        // Only reached after a table or stream was read successfully; the value checked is the
        // location that will be read on the next iteration.
        if (prevSet.Contains(previousCrossReferenceLocation))
        {
            throw new PdfDocumentFormatException("The cross references formed an infinite loop.");
        }

        prevSet.Add(previousCrossReferenceLocation);
    }

    if (missedAttempts == maximumMissedAttempts)
    {
        // TODO: scan the document to find the correct token.
        throw new PdfDocumentFormatException("The cross reference was not found.");
    }

    var resolved = table.Build(crossReferenceLocation, offsetCorrection, log);

    // check the offsets of all referenced objects
    if (!CrossReferenceObjectOffsetValidator.ValidateCrossReferenceOffsets(bytes, resolved, log, out var actualOffsets))
    {
        resolved = new CrossReferenceTable(resolved.Type, actualOffsets, resolved.Trailer, resolved.CrossReferenceOffsets);
    }

    return resolved;
}
/// <summary>
/// Attempts to read the object at <paramref name="objByteOffset"/> and parse it as a cross reference stream.
/// </summary>
/// <param name="objByteOffset">Offset the object scanner is moved to before reading.</param>
/// <param name="pdfScanner">Scanner used to read the object at that offset.</param>
/// <param name="fromTableAtOffset">
/// Offset of the cross reference table whose /XRefStm entry pointed here, or <c>null</c> when the
/// stream was reached directly; passed through to the stream parser.
/// </param>
/// <param name="xrefTablePart">The parsed part when the method returns <c>true</c>; otherwise <c>null</c>.</param>
/// <returns>
/// <c>true</c> if a stream object was read and parsed; <c>false</c> if nothing could be read at the
/// offset or the token found there was not an object wrapping a stream.
/// </returns>
private bool TryParseCrossReferenceStream(
    long objByteOffset,
    IPdfTokenScanner pdfScanner,
    long? fromTableAtOffset,
    [NotNullWhen(true)] out CrossReferenceTablePart? xrefTablePart)
{
    xrefTablePart = null;

    pdfScanner.Seek(objByteOffset);

    // If the scanner could not advance, whatever CurrentToken holds was not read from this offset,
    // so it must not be interpreted as the cross reference stream.
    if (!pdfScanner.MoveNext())
    {
        log.Error($"When reading a cross reference stream object no object could be read at offset {objByteOffset}.");
        return false;
    }

    // 'as' rather than a hard cast: an unexpected token type is a soft failure reported through the
    // return value, not an InvalidCastException.
    var streamObjectToken = pdfScanner.CurrentToken as ObjectToken;

    if (!(streamObjectToken?.Data is StreamToken objectStream))
    {
        log.Error($"When reading a cross reference stream object found a non-stream object: {streamObjectToken?.Data}");

        return false;
    }

    xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, fromTableAtOffset, objectStream);

    return true;
}
/// <summary>
/// Searches the raw input for the <c>xref</c> keyword: first forwards from the byte before
/// <paramref name="expectedOffset"/> to the end of the input, then backwards from that same
/// position in five byte windows until the start of the input has been covered.
/// </summary>
/// <param name="bytes">The input to search; its position is moved by this method.</param>
/// <param name="expectedOffset">The offset at which a cross reference was expected but not found.</param>
/// <param name="actualOffset">
/// The offset reported for the located keyword when the method returns <c>true</c>; otherwise
/// the value of <paramref name="expectedOffset"/>.
/// </param>
/// <returns><c>true</c> if the keyword was found; otherwise <c>false</c>.</returns>
private bool TryBruteForceXrefTableLocate(IInputBytes bytes, long expectedOffset,
    out long actualOffset)
{
    actualOffset = expectedOffset;

    // Start one byte early so the byte before the expected offset can serve as the preceding
    // whitespace. Clamped so an expected offset of zero never asks the input to seek before its start.
    bytes.Seek(Math.Max(0, expectedOffset - 1));
    var currentByte = bytes.CurrentByte;

    // Forward:
    while (bytes.MoveNext())
    {
        var previousByte = currentByte;
        currentByte = bytes.CurrentByte;

        // Only an 'x' which directly follows whitespace can start the keyword.
        if (currentByte != 'x' || !ReadHelper.IsWhitespace(previousByte))
        {
            continue;
        }

        if (!ReadHelper.IsString(bytes, "xref"))
        {
            continue;
        }

        actualOffset = bytes.CurrentOffset;
        return true;
    }

    // Backward: the forward pass ran to the end of the input without a match.
    var lastOffset = expectedOffset - 1;

    if (lastOffset < 0)
    {
        return false;
    }

    bytes.Seek(lastOffset);

    Span<byte> buffer = stackalloc byte[5];

    // A short read means fewer than five bytes remain after the window start; the search stops there.
    while (bytes.Read(buffer) == buffer.Length)
    {
        for (var i = 1; i < buffer.Length; i++)
        {
            var p = buffer[i - 1];
            var b = buffer[i];

            // Either the pair is (whitespace, 'x'), or it is ('x', 'r') which also catches an 'x'
            // sitting at the very start of the window where its preceding byte is not in the buffer.
            var couldBeXrefStartWhitespacePrecedes = b == 'x' && ReadHelper.IsWhitespace(p);
            var couldBeXrefBufferAligned = p == 'x' && b == 'r';

            if (!couldBeXrefBufferAligned && !couldBeXrefStartWhitespacePrecedes)
            {
                continue;
            }

            // In both cases this is one more than the buffer position of the 'x' added to the window start.
            // NOTE(review): presumably Seek leaves CurrentByte on the byte just before the given
            // offset, which is what IsString then inspects - confirm against IInputBytes.
            var xLocation = lastOffset + i + (couldBeXrefStartWhitespacePrecedes ? 1 : 0);

            bytes.Seek(xLocation);

            if (ReadHelper.IsString(bytes, "xref"))
            {
                actualOffset = xLocation;
                return true;
            }
        }

        if (lastOffset == 0)
        {
            // The window starting at the first byte has been examined; nothing is left to search.
            break;
        }

        // Step back one window, clamping at zero so the leading bytes are still examined when the
        // starting offset is not a multiple of the window size.
        lastOffset = Math.Max(0, lastOffset - buffer.Length);

        bytes.Seek(lastOffset);
    }

    return false;
}
}
}