Try to repair xref offset by looking for all startxref and fix #1040
Some checks are pending
Build and test / build (push) Waiting to run
Build and test [MacOS] / build (push) Waiting to run
Run Integration Tests / build (push) Waiting to run

This commit is contained in:
BobLd
2025-05-18 14:52:21 +01:00
parent bf7c3c01d0
commit 6911f31b49
5 changed files with 140 additions and 21 deletions

View File

@@ -189,7 +189,34 @@
return found; return found;
} }
/// <summary>
/// Whether the given byte sequence is at the current position in the input.
/// The input is moved back to the offset it had on entry before returning.
/// </summary>
/// <param name="bytes">The input to inspect; comparison starts at its current byte.</param>
/// <param name="s">The expected bytes, compared in order.</param>
/// <returns>
/// <see langword="true"/> if every byte of <paramref name="s"/> was matched in order
/// (an empty sequence always matches); otherwise <see langword="false"/>.
/// </returns>
public static bool IsString(IInputBytes bytes, ReadOnlySpan<byte> s)
{
    var startOffset = bytes.CurrentOffset;
    var found = true;

    for (var i = 0; i < s.Length; i++)
    {
        if (bytes.CurrentByte != s[i])
        {
            found = false;
            break;
        }

        // If the input cannot advance while expected bytes remain, the input ended in the
        // middle of the sequence. Without this check the unchanged current byte would be
        // compared again, giving a false match for sequences with repeated adjacent bytes.
        // NOTE(review): assumes MoveNext() returns false only when no further byte is available.
        if (!bytes.MoveNext() && i < s.Length - 1)
        {
            found = false;
            break;
        }
    }

    // Always restore the caller's position, whether or not the sequence matched.
    bytes.Seek(startOffset);

    return found;
}
/// <summary> /// <summary>
/// Read a long from the input. /// Read a long from the input.
/// </summary> /// </summary>
@@ -252,14 +279,6 @@
throw new PdfDocumentFormatException($"Error: Expected an integer type at offset {bytes.CurrentOffset}, instead got \'{OtherEncodings.BytesAsLatin1String(intBytes)}\'"); throw new PdfDocumentFormatException($"Error: Expected an integer type at offset {bytes.CurrentOffset}, instead got \'{OtherEncodings.BytesAsLatin1String(intBytes)}\'");
} }
} }
/// <summary>
/// Determines whether the supplied character value is the space character (' ').
/// </summary>
/// <param name="c">The character value to test.</param>
/// <returns><see langword="true"/> only when <paramref name="c"/> equals ' '.</returns>
public static bool IsSpace(int c) => c == ' ';
/// <summary> /// <summary>
/// Whether the given character value is a valid hex value. /// Whether the given character value is a valid hex value.

View File

@@ -7,6 +7,21 @@
public class GithubIssuesTests public class GithubIssuesTests
{ {
// Regression test for issue 1040: the document is opened with lenient parsing
// and the first two pages must each yield at least one letter.
[Fact]
public void Issue1040()
{
    var documentPath = IntegrationHelpers.GetSpecificTestDocumentPath("pdfpig-issue-1040.pdf");
    var lenientOptions = new ParsingOptions { UseLenientParsing = true };

    using (var pdf = PdfDocument.Open(documentPath, lenientOptions))
    {
        // Pages are checked in order: page 1 first, then page 2.
        foreach (var pageNumber in new[] { 1, 2 })
        {
            var page = pdf.GetPage(pageNumber);
            Assert.NotEmpty(page.Letters);
        }
    }
}
[Fact] [Fact]
public void Issue1013() public void Issue1013()
{ {

View File

@@ -1,7 +1,6 @@
namespace UglyToad.PdfPig.Parser.FileStructure namespace UglyToad.PdfPig.Parser.FileStructure
{ {
using System; using System;
using System.Collections.Generic;
using Core; using Core;
using Tokenization.Scanner; using Tokenization.Scanner;
using Tokens; using Tokens;
@@ -26,7 +25,7 @@
/// </summary> /// </summary>
private const int EndOfFileSearchRange = 2048; private const int EndOfFileSearchRange = 2048;
private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8; internal static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing) public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
{ {

View File

@@ -7,12 +7,16 @@
using Tokenization.Scanner; using Tokenization.Scanner;
using Tokens; using Tokens;
internal class XrefOffsetValidator internal sealed class XrefOffsetValidator
{ {
private static readonly long MinimumSearchOffset = 6; private const long MinimumSearchOffset = 6;
private static ReadOnlySpan<byte> XRefBytes => "xref"u8;
private static ReadOnlySpan<byte> SpaceObjBytes => " obj"u8;
private readonly ILog log; private readonly ILog log;
private List<long>? bfSearchStartXRefTablesOffsets;
private List<long>? bfSearchXRefTablesOffsets; private List<long>? bfSearchXRefTablesOffsets;
private List<long>? bfSearchXRefStreamsOffsets; private List<long>? bfSearchXRefStreamsOffsets;
@@ -90,16 +94,18 @@
BfSearchForXRefStreams(reader); BfSearchForXRefStreams(reader);
if (bfSearchXRefTablesOffsets != null) if (bfSearchXRefTablesOffsets != null && bfSearchXRefTablesOffsets.Count > 0)
{ {
// TODO to be optimized, this won't work in every case // TODO to be optimized, this won't work in every case
newOffsetTable = SearchNearestValue(bfSearchXRefTablesOffsets, xrefOffset); newOffsetTable = SearchNearestValue(bfSearchXRefTablesOffsets, xrefOffset);
} }
if (bfSearchXRefStreamsOffsets != null)
if (bfSearchXRefStreamsOffsets != null && bfSearchXRefStreamsOffsets.Count > 0)
{ {
// TODO to be optimized, this won't work in every case // TODO to be optimized, this won't work in every case
newOffsetStream = SearchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset); newOffsetStream = SearchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset);
} }
// choose the nearest value // choose the nearest value
if (newOffsetTable > -1 && newOffsetStream > -1) if (newOffsetTable > -1 && newOffsetStream > -1)
{ {
@@ -126,9 +132,91 @@
newOffset = newOffsetStream; newOffset = newOffsetStream;
bfSearchXRefStreamsOffsets!.Remove(newOffsetStream); bfSearchXRefStreamsOffsets!.Remove(newOffsetStream);
} }
else
{
log.Warn("Trying to repair xref offset by looking for all startxref.");
if (TryBruteForceSearchForXrefFromStartxref(xrefOffset, scanner, reader, out long newOffsetFromStartxref))
{
newOffset = newOffsetFromStartxref;
}
}
return newOffset; return newOffset;
} }
/// <summary>
/// Tries to repair the xref offset by finding the startxref keyword nearest to
/// <paramref name="xrefOffset"/>, reading the number that follows it, and checking
/// that this number points at an xref table or an xref stream.
/// </summary>
/// <param name="xrefOffset">The (presumably invalid) xref offset to repair.</param>
/// <param name="scanner">Token scanner used to read the candidate offset; its position is restored before returning.</param>
/// <param name="reader">The raw input, searched for startxref keywords.</param>
/// <param name="newOffset">The repaired offset, or -1 when no valid offset was found.</param>
/// <returns><see langword="true"/> when a valid offset was found; otherwise <see langword="false"/>.</returns>
private bool TryBruteForceSearchForXrefFromStartxref(long xrefOffset, ISeekableTokenScanner scanner, IInputBytes reader, out long newOffset)
{
    // Length in bytes of the "startxref" keyword, skipped to reach the offset value after it.
    const int StartXRefKeywordLength = 9;

    newOffset = -1;

    BruteForceSearchForStartxref(reader);

    // Nothing to choose from: do not run the nearest-value search on an empty list.
    if (bfSearchStartXRefTablesOffsets is null || bfSearchStartXRefTablesOffsets.Count == 0)
    {
        return false;
    }

    long newStartXRefOffset = SearchNearestValue(bfSearchStartXRefTablesOffsets, xrefOffset);

    // A negative value means no candidate was selected. Without this guard the scanner
    // would seek to an arbitrary position near the start of the file and could accept
    // whatever number happens to be there.
    if (newStartXRefOffset < 0 || newStartXRefOffset >= reader.Length)
    {
        return false;
    }

    var startOffset = scanner.CurrentPosition;

    long candidateOffset = -1;
    scanner.Seek(newStartXRefOffset + StartXRefKeywordLength);

    if (scanner.MoveNext() && scanner.CurrentToken is NumericToken token)
    {
        candidateOffset = token.Long;
    }

    // An offset at or beyond the end of the input cannot point at a cross-reference section.
    if (candidateOffset > -1 && candidateOffset < reader.Length)
    {
        scanner.Seek(candidateOffset);
        scanner.MoveNext();

        // Accept the candidate when it points at an xref table...
        if (ReferenceEquals(scanner.CurrentToken, OperatorToken.Xref))
        {
            newOffset = candidateOffset;
        }

        // ...or when it passes the xref stream check.
        if (CheckXRefStreamOffset(candidateOffset, scanner, true))
        {
            newOffset = candidateOffset;
        }
    }

    // Leave the scanner where the caller had it.
    scanner.Seek(startOffset);

    return newOffset != -1;
}
/// <summary>
/// Scans the input for every startxref keyword that is preceded by whitespace and
/// stores the offsets in <c>bfSearchStartXRefTablesOffsets</c>. The scan runs only
/// once; later calls return immediately. The input position is restored afterwards.
/// </summary>
/// <param name="bytes">The input to scan.</param>
private void BruteForceSearchForStartxref(IInputBytes bytes)
{
    // Already searched: reuse the cached result.
    if (bfSearchStartXRefTablesOffsets != null)
    {
        return;
    }

    // Length in bytes of the "startxref" keyword, skipped after each hit.
    const int keywordLength = 9;

    // a pdf may contain more than one startxref entry
    var foundOffsets = new List<long>();
    bfSearchStartXRefTablesOffsets = foundOffsets;

    var originalOffset = bytes.CurrentOffset;
    bytes.Seek(MinimumSearchOffset);

    // search for startxref
    while (bytes.MoveNext() && !bytes.IsAtEnd())
    {
        if (!ReadHelper.IsString(bytes, FileTrailerParser.StartXRefBytes))
        {
            continue;
        }

        var keywordOffset = bytes.CurrentOffset;

        // Step back one position to inspect the byte in front of the keyword.
        bytes.Seek(keywordOffset - 1);
        var precededByWhitespace = ReadHelper.IsWhitespace(bytes.CurrentByte);

        if (precededByWhitespace)
        {
            foundOffsets.Add(keywordOffset);
        }

        // Continue scanning after the keyword.
        bytes.Seek(keywordOffset + keywordLength);
    }

    bytes.Seek(originalOffset);
}
private void BruteForceSearchForTables(IInputBytes bytes) private void BruteForceSearchForTables(IInputBytes bytes)
{ {
if (bfSearchXRefTablesOffsets != null) if (bfSearchXRefTablesOffsets != null)
@@ -146,7 +234,7 @@
// search for xref tables // search for xref tables
while (bytes.MoveNext() && !bytes.IsAtEnd()) while (bytes.MoveNext() && !bytes.IsAtEnd())
{ {
if (ReadHelper.IsString(bytes, "xref")) if (ReadHelper.IsString(bytes, XRefBytes))
{ {
var newOffset = bytes.CurrentOffset; var newOffset = bytes.CurrentOffset;
@@ -180,11 +268,9 @@
bytes.Seek(MinimumSearchOffset); bytes.Seek(MinimumSearchOffset);
// search for XRef streams // search for XRef streams
var objString = " obj";
while (bytes.MoveNext() && !bytes.IsAtEnd()) while (bytes.MoveNext() && !bytes.IsAtEnd())
{ {
if (!ReadHelper.IsString(bytes, "xref")) if (!ReadHelper.IsString(bytes, XRefBytes))
{ {
continue; continue;
} }
@@ -209,7 +295,7 @@
for (int j = 0; j < 10; j++) for (int j = 0; j < 10; j++)
{ {
if (ReadHelper.IsString(bytes, objString)) if (ReadHelper.IsString(bytes, SpaceObjBytes))
{ {
long tempOffset = currentOffset - 1; long tempOffset = currentOffset - 1;
@@ -224,7 +310,7 @@
bytes.Seek(tempOffset); bytes.Seek(tempOffset);
// is the digit preceded by a space? // is the digit preceded by a space?
if (ReadHelper.IsSpace(bytes.CurrentByte)) if (ReadHelper.IsWhitespace(bytes.CurrentByte))
{ {
int length = 0; int length = 0;
bytes.Seek(--tempOffset); bytes.Seek(--tempOffset);