Try to repair xref offset by looking for all startxref and fix #1040
Some checks are pending
Build and test / build (push) Waiting to run
Build and test [MacOS] / build (push) Waiting to run
Run Integration Tests / build (push) Waiting to run

This commit is contained in:
BobLd 2025-05-18 14:52:21 +01:00
parent bf7c3c01d0
commit 6911f31b49
5 changed files with 140 additions and 21 deletions

View File

@ -189,7 +189,34 @@
return found;
}
/// <summary>
/// Whether the given byte sequence is at this position in the input.
/// The input is moved back to the offset it had on entry before returning,
/// whether or not a match was found.
/// </summary>
/// <param name="bytes">The input to inspect, starting at its current byte.</param>
/// <param name="s">The bytes to look for; an empty span always matches.</param>
/// <returns><see langword="true"/> if every byte of <paramref name="s"/> was matched in order.</returns>
public static bool IsString(IInputBytes bytes, ReadOnlySpan<byte> s)
{
    var startOffset = bytes.CurrentOffset;
    var found = true;

    for (var i = 0; i < s.Length; i++)
    {
        if (bytes.CurrentByte != s[i])
        {
            found = false;
            break;
        }

        // Only advance while more bytes remain to be compared. MoveNext reports whether it
        // advanced; presumably CurrentByte is left unchanged when it does not (TODO confirm),
        // which would let a repeated trailing byte match past the end of the input.
        // Running out of input before the whole sequence was compared is therefore a mismatch.
        if (i < s.Length - 1 && !bytes.MoveNext())
        {
            found = false;
            break;
        }
    }

    bytes.Seek(startOffset);

    return found;
}
/// <summary>
/// Read a long from the input.
/// </summary>
@ -252,14 +279,6 @@
throw new PdfDocumentFormatException($"Error: Expected an integer type at offset {bytes.CurrentOffset}, instead got \'{OtherEncodings.BytesAsLatin1String(intBytes)}\'");
}
}
/// <summary>
/// Determines whether the supplied character value is the space character.
/// </summary>
/// <param name="c">The character value to test.</param>
/// <returns><see langword="true"/> only when <paramref name="c"/> equals <c>' '</c>.</returns>
public static bool IsSpace(int c) => c == ' ';
/// <summary>
/// Whether the given character value is a valid hex value.

View File

@ -7,6 +7,21 @@
public class GithubIssuesTests
{
[Fact]
public void Issue1040()
{
    // Regression test for issue 1040: opened with lenient parsing, pages 1 and 2 must both expose letters.
    var path = IntegrationHelpers.GetSpecificTestDocumentPath("pdfpig-issue-1040.pdf");
    var options = new ParsingOptions { UseLenientParsing = true };

    using (var document = PdfDocument.Open(path, options))
    {
        foreach (var pageNumber in new[] { 1, 2 })
        {
            var page = document.GetPage(pageNumber);
            Assert.NotEmpty(page.Letters);
        }
    }
}
[Fact]
public void Issue1013()
{

View File

@ -1,7 +1,6 @@
namespace UglyToad.PdfPig.Parser.FileStructure
{
using System;
using System.Collections.Generic;
using Core;
using Tokenization.Scanner;
using Tokens;
@ -26,7 +25,7 @@
/// </summary>
private const int EndOfFileSearchRange = 2048;
private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
internal static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
{

View File

@ -7,12 +7,16 @@
using Tokenization.Scanner;
using Tokens;
internal class XrefOffsetValidator
internal sealed class XrefOffsetValidator
{
private static readonly long MinimumSearchOffset = 6;
private const long MinimumSearchOffset = 6;
private static ReadOnlySpan<byte> XRefBytes => "xref"u8;
private static ReadOnlySpan<byte> SpaceObjBytes => " obj"u8;
private readonly ILog log;
private List<long>? bfSearchStartXRefTablesOffsets;
private List<long>? bfSearchXRefTablesOffsets;
private List<long>? bfSearchXRefStreamsOffsets;
@ -90,16 +94,18 @@
BfSearchForXRefStreams(reader);
if (bfSearchXRefTablesOffsets != null)
if (bfSearchXRefTablesOffsets != null && bfSearchXRefTablesOffsets.Count > 0)
{
// TODO to be optimized, this won't work in every case
newOffsetTable = SearchNearestValue(bfSearchXRefTablesOffsets, xrefOffset);
}
if (bfSearchXRefStreamsOffsets != null)
if (bfSearchXRefStreamsOffsets != null && bfSearchXRefStreamsOffsets.Count > 0)
{
// TODO to be optimized, this won't work in every case
newOffsetStream = SearchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset);
}
// choose the nearest value
if (newOffsetTable > -1 && newOffsetStream > -1)
{
@ -126,9 +132,91 @@
newOffset = newOffsetStream;
bfSearchXRefStreamsOffsets!.Remove(newOffsetStream);
}
else
{
log.Warn("Trying to repair xref offset by looking for all startxref.");
if (TryBruteForceSearchForXrefFromStartxref(xrefOffset, scanner, reader, out long newOffsetFromStartxref))
{
newOffset = newOffsetFromStartxref;
}
}
return newOffset;
}
/// <summary>
/// Repair attempt based on the <c>startxref</c> keywords in the file: picks the keyword nearest to
/// <paramref name="xrefOffset"/>, reads the number that follows it and accepts that number as the
/// cross reference offset when an <c>xref</c> operator or a cross reference stream is found there.
/// </summary>
/// <param name="xrefOffset">The offset being repaired; used to choose the nearest <c>startxref</c>.</param>
/// <param name="scanner">Scanner used to read tokens; its position is restored before returning.</param>
/// <param name="reader">The input that is searched for <c>startxref</c> keywords.</param>
/// <param name="newOffset">The repaired offset, or -1 when no candidate could be confirmed.</param>
/// <returns><see langword="true"/> if <paramref name="newOffset"/> holds a confirmed offset.</returns>
private bool TryBruteForceSearchForXrefFromStartxref(long xrefOffset, ISeekableTokenScanner scanner, IInputBytes reader, out long newOffset)
{
    newOffset = -1;

    BruteForceSearchForStartxref(reader);

    // Mirror the empty-list guard used before the other SearchNearestValue calls in this class:
    // with no startxref keyword in the file there is nothing to choose from.
    if (bfSearchStartXRefTablesOffsets is null || bfSearchStartXRefTablesOffsets.Count == 0)
    {
        return false;
    }

    long newStartXRefOffset = SearchNearestValue(bfSearchStartXRefTablesOffsets, xrefOffset);

    // Nearest-value results are compared against -1 elsewhere in this class, so a negative value
    // is treated as "nothing found". Without the lower bound the scanner would be positioned at
    // (-1 + keyword length), an unrelated place near the start of the file.
    if (newStartXRefOffset < 0 || newStartXRefOffset >= reader.Length)
    {
        return false;
    }

    var startOffset = scanner.CurrentPosition;
    try
    {
        // Skip over the keyword itself; the next token should be the offset it announces.
        scanner.Seek(newStartXRefOffset + FileTrailerParser.StartXRefBytes.Length);
        if (!scanner.MoveNext() || scanner.CurrentToken is not NumericToken token)
        {
            return false;
        }

        long candidate = token.Long;

        // An offset outside the input cannot point at a cross reference section.
        if (candidate < 0 || candidate >= reader.Length)
        {
            return false;
        }

        scanner.Seek(candidate);
        bool isXrefTable = scanner.MoveNext() && ReferenceEquals(scanner.CurrentToken, OperatorToken.Xref);

        // Either an xref table or an xref stream at the candidate offset confirms it.
        if (isXrefTable || CheckXRefStreamOffset(candidate, scanner, true))
        {
            newOffset = candidate;
        }

        return newOffset != -1;
    }
    finally
    {
        // Leave the scanner where the caller had it, even when a read fails.
        scanner.Seek(startOffset);
    }
}
/// <summary>
/// Scans the input for every <c>startxref</c> keyword and stores the offsets found in
/// <c>bfSearchStartXRefTablesOffsets</c>. The scan runs once; later calls return immediately.
/// The input is moved back to the offset it had on entry.
/// </summary>
/// <param name="bytes">The input to scan.</param>
private void BruteForceSearchForStartxref(IInputBytes bytes)
{
    if (bfSearchStartXRefTablesOffsets is not null)
    {
        // already searched
        return;
    }

    // a pdf may contain more than one startxref entry
    var offsets = new List<long>();
    bfSearchStartXRefTablesOffsets = offsets;

    var keywordLength = FileTrailerParser.StartXRefBytes.Length;
    var originalOffset = bytes.CurrentOffset;
    bytes.Seek(MinimumSearchOffset);

    // search for startxref
    while (bytes.MoveNext() && !bytes.IsAtEnd())
    {
        if (!ReadHelper.IsString(bytes, FileTrailerParser.StartXRefBytes))
        {
            continue;
        }

        var keywordOffset = bytes.CurrentOffset;

        // keep the offset only if the byte exposed by seeking one position back is whitespace
        bytes.Seek(keywordOffset - 1);
        if (ReadHelper.IsWhitespace(bytes.CurrentByte))
        {
            offsets.Add(keywordOffset);
        }

        // continue scanning after the keyword
        bytes.Seek(keywordOffset + keywordLength);
    }

    bytes.Seek(originalOffset);
}
private void BruteForceSearchForTables(IInputBytes bytes)
{
if (bfSearchXRefTablesOffsets != null)
@ -146,7 +234,7 @@
// search for xref tables
while (bytes.MoveNext() && !bytes.IsAtEnd())
{
if (ReadHelper.IsString(bytes, "xref"))
if (ReadHelper.IsString(bytes, XRefBytes))
{
var newOffset = bytes.CurrentOffset;
@ -180,11 +268,9 @@
bytes.Seek(MinimumSearchOffset);
// search for XRef streams
var objString = " obj";
while (bytes.MoveNext() && !bytes.IsAtEnd())
{
if (!ReadHelper.IsString(bytes, "xref"))
if (!ReadHelper.IsString(bytes, XRefBytes))
{
continue;
}
@ -209,7 +295,7 @@
for (int j = 0; j < 10; j++)
{
if (ReadHelper.IsString(bytes, objString))
if (ReadHelper.IsString(bytes, SpaceObjBytes))
{
long tempOffset = currentOffset - 1;
@ -224,7 +310,7 @@
bytes.Seek(tempOffset);
// is the digit preceded by a space?
if (ReadHelper.IsSpace(bytes.CurrentByte))
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
{
int length = 0;
bytes.Seek(--tempOffset);