fix off-by-one and optimize brute force xref search #1078 (#1079)

* fix off-by-one and optimize brute force xref search #1078

When performing a brute-force xref search we ended up off by one.
Update the search to use a ring buffer, which reduces seeking
and fixes xref detection.

* make method testable and add test coverage

* normalize test input on other platforms

* seal circular buffer class
This commit is contained in:
Eliot Jones 2025-07-16 01:35:24 -05:00 committed by GitHub
parent 016b754c5b
commit 9503f9c137
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 287 additions and 19 deletions

View File

@ -0,0 +1,128 @@
namespace UglyToad.PdfPig.Tests.Parser.FileStructure;
using PdfPig.Parser.FileStructure;
using PdfPig.Tokenization.Scanner;
using PdfPig.Tokens;
/// <summary>
/// Tests for the brute-force xref table search exposed by
/// <c>XrefOffsetValidator.BruteForceSearchForTables</c>.
/// </summary>
/// <remarks>
/// Every fixture is converted to CRLF line endings before it is searched, so the
/// offsets asserted below refer to the CRLF form of the text.
/// </remarks>
public class XrefOffsetValidatorTests
{
    /// <summary>
    /// Searches a small document containing two "xref" keywords and checks that both
    /// expected offsets are reported.
    /// </summary>
    [Fact]
    public void FindsTwoXrefs()
    {
        var content =
            """
            %PDF-1.7
            %âãÏÓ
            5 0 obj
            <</Filter/FlateDecode/Length 66>>stream
            abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
            endstream
            endobj
            xref0 1
            0000000000 65535 f
            0000000500 00000 n
            4 0 obj
            <</Contents 5 0 R/MediaBox[0 0 595 842]/Parent 2 0 R/Resources<</Font<</F1 6 0 R>>>>/TrimBox[0 0 595 842]/Type/Page>>
            endobj
            xref
            0 3
            0000000000 65535 f
            0000000443 00000 n
            0000000576 00000 n
            trailer
            << /Size 100 /Root 100 >>
            startxref
            9000
            %%EOF
            """;

        var ib = StringBytesTestConverter.Convert(NormalizeToCrLf(content), false);

        var results = XrefOffsetValidator.BruteForceSearchForTables(ib.Bytes);

        Assert.Contains(144, results);
        Assert.Contains(331, results);
    }

    /// <summary>
    /// Searches a truncated real-world document, checks both expected offsets are reported,
    /// and confirms that tokenizing from the first offset yields the xref operator.
    /// </summary>
    [Fact]
    public void FindsXrefsFromRealFileTruncated()
    {
        var content =
            """
            %PDF-1.5
            %âãÏÓ
            5 0 obj <</Linearized 1/L 4631/O 8/E 1125/N 1/T 4485/H [ 436 129]>>
            endobj
            xref
            5 7
            0000000016 00000 n
            0000000565 00000 n
            0000000436 00000 n
            0000000639 00000 n
            0000000796 00000 n
            0000001001 00000 n
            0000001098 00000 n
            trailer
            <</Size 12/Prev 4475/Root 6 0 R/Info 4 0 R/ID[<2c9c3edf9641f1459e947e7f933f6da0><2c9c3edf9641f1459e947e7f933f6da0>]>>
            startxref
            0
            %%EOF
            7 0 obj<</Length 52/Filter/FlateDecode/O 67/S 38>>stream
            3842973893927327893237832738923732923782987348
            endstream
            endobj
            6 0 obj<</Pages 2 0 R/Outlines 1 0 R/Type/Catalog/Metadata 3 0 R>>
            endobj
            8 0 obj<</Contents 9 0 R/Type/Page/Parent 2 0 R/Rotate 0/MediaBox[0 0 612 792]/CropBox[0 0 612 792]/Resources<</Font<</F1 10 0 R>>/ProcSet 11 0 R>>>>
            endobj
            9 0 obj<</Length 137/Filter/FlateDecode>>stream
            abajsgiwgbkeeuuehxh9x3oihx2h802chc280h2082x
            endstream
            endobj
            10 0 obj<</Type/Font/Name/F1/Encoding/MacRomanEncoding/BaseFont/Helvetica/Subtype/Type1>>
            endobj
            11 0 obj[/PDF/Text]
            endobj
            1 0 obj<</Count 0/Type/Outlines>>
            endobj
            2 0 obj<</Count 1/Kids[8 0 R]/Type/Pages>>
            endobj
            4 0 obj<</ModDate(D:20070213222810-05'00')/CreationDate(D:20070213222810-05'00')>>
            endobj
            xref
            0 5
            0000000000 65535 f
            0000001125 00000 n
            0000001166 00000 n
            0000001216 00000 n
            0000004385 00000 n
            trailer
            <</Size 5>>
            startxref
            116
            %%EOF
            """;

        var ib = StringBytesTestConverter.Convert(NormalizeToCrLf(content), false);

        var results = XrefOffsetValidator.BruteForceSearchForTables(ib.Bytes);

        Assert.Contains(98, results);
        Assert.Contains(1186, results);

        // Tokenizing from a reported offset should produce the xref operator itself.
        ib.Bytes.Seek(98);
        var scanner = new CoreTokenScanner(ib.Bytes, false);
        scanner.MoveNext();

        // xUnit convention: expected value first, actual value second.
        Assert.Equal(OperatorToken.Xref, scanner.CurrentToken);
    }

    /// <summary>
    /// Rewrites every line break in <paramref name="content"/> as CRLF.
    /// </summary>
    /// <remarks>
    /// A raw string literal takes its line breaks from the source file, which can be LF or
    /// CRLF independently of the platform running the test. Collapsing to LF first makes the
    /// conversion idempotent, so input that is already CRLF is left unchanged.
    /// </remarks>
    private static string NormalizeToCrLf(string content)
        => content.Replace("\r\n", "\n").Replace("\n", "\r\n");
}

View File

@ -0,0 +1,47 @@
namespace UglyToad.PdfPig.Tests.Util;
using PdfPig.Util;
using System;
/// <summary>
/// Exercises <c>CircularByteBuffer</c> when more bytes are added than it can hold
/// and when fewer bytes are added than its capacity.
/// </summary>
public class CircularByteBufferTests
{
    [Fact]
    public void CanExceedCapacity()
    {
        // Six bytes pushed through three slots: only the last three remain.
        var ring = CreateFilled(3, "123456"u8);

        Assert.True(ring.IsCurrentlyEqual("456"));
        Assert.True("456"u8.SequenceEqual(ring.AsSpan()));

        Assert.True(ring.EndsWith("6"));
        Assert.True(ring.EndsWith("56"));
        Assert.True(ring.EndsWith("456"));
        Assert.False(ring.EndsWith("3456"));
    }

    [Fact]
    public void CanUndershootCapacity()
    {
        // Six bytes in nine slots: nothing has been overwritten.
        var ring = CreateFilled(9, "123456"u8);

        Assert.True(ring.IsCurrentlyEqual("123456"));

        Assert.True(ring.EndsWith("3456"));
        Assert.False(ring.EndsWith("123"));

        Assert.True("123456"u8.SequenceEqual(ring.AsSpan()));
    }

    /// <summary>
    /// Creates a buffer of the given capacity and adds each input byte to it in order.
    /// </summary>
    private static CircularByteBuffer CreateFilled(int capacity, ReadOnlySpan<byte> input)
    {
        var ring = new CircularByteBuffer(capacity);

        foreach (var value in input)
        {
            ring.Add(value);
        }

        return ring;
    }
}

View File

@ -6,6 +6,7 @@
using Logging;
using Tokenization.Scanner;
using Tokens;
using Util;
internal sealed class XrefOffsetValidator
{
@ -90,7 +91,10 @@
long newOffsetTable = -1;
long newOffsetStream = -1;
BruteForceSearchForTables(reader);
if (bfSearchXRefTablesOffsets == null)
{
bfSearchXRefTablesOffsets = BruteForceSearchForTables(reader);
}
BfSearchForXRefStreams(reader);
@ -217,40 +221,39 @@
bytes.Seek(startOffset);
}
private void BruteForceSearchForTables(IInputBytes bytes)
public static List<long> BruteForceSearchForTables(IInputBytes bytes)
{
if (bfSearchXRefTablesOffsets != null)
{
return;
}
// a pdf may contain more than one xref entry
bfSearchXRefTablesOffsets = new List<long>();
var resultOffsets = new List<long>();
var startOffset = bytes.CurrentOffset;
bytes.Seek(MinimumSearchOffset);
var buffer = new CircularByteBuffer(XRefBytes.Length + 1);
// search for xref tables
while (bytes.MoveNext() && !bytes.IsAtEnd())
{
if (ReadHelper.IsString(bytes, XRefBytes))
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
{
var newOffset = bytes.CurrentOffset;
// Normalize whitespace
buffer.Add((byte)' ');
}
else
{
buffer.Add(bytes.CurrentByte);
}
bytes.Seek(newOffset - 1);
// ensure that we don't read "startxref" instead of "xref"
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
{
bfSearchXRefTablesOffsets.Add(newOffset);
}
bytes.Seek(newOffset + 4);
if (buffer.IsCurrentlyEqual(" xref"))
{
resultOffsets.Add(bytes.CurrentOffset - 4);
}
}
bytes.Seek(startOffset);
return resultOffsets;
}
private void BfSearchForXRefStreams(IInputBytes bytes)

View File

@ -0,0 +1,90 @@
namespace UglyToad.PdfPig.Util;
using System;
using System.Text;
/// <summary>
/// A fixed-capacity ring buffer of bytes. Once the buffer is full, each added byte
/// overwrites the oldest retained byte.
/// </summary>
/// <param name="size">The maximum number of bytes retained; must be greater than zero.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="size"/> is zero or negative.
/// </exception>
internal sealed class CircularByteBuffer(int size)
{
    // A zero-length backing array would make every '% buffer.Length' below divide by zero,
    // so an unusable size is rejected up front.
    private readonly byte[] buffer = size > 0
        ? new byte[size]
        : throw new ArgumentOutOfRangeException(nameof(size), size, "Buffer size must be greater than zero.");

    // Position in 'buffer' of the oldest retained byte.
    private int start;

    // Number of bytes currently retained; never exceeds buffer.Length.
    private int count;

    /// <summary>
    /// Appends a byte, discarding the oldest retained byte when the buffer is already full.
    /// </summary>
    public void Add(byte b)
    {
        var insertionPosition = (start + count) % buffer.Length;
        buffer[insertionPosition] = b;

        if (count < buffer.Length)
        {
            count++;
        }
        else
        {
            // Full: the slot just written held the oldest byte, so the window slides forward.
            start = (start + 1) % buffer.Length;
        }
    }

    /// <summary>
    /// Returns true when the most recently added bytes match the characters of
    /// <paramref name="s"/>, compared value by value.
    /// </summary>
    public bool EndsWith(string s)
    {
        if (s.Length > count)
        {
            return false;
        }

        return MatchesAt(count - s.Length, s);
    }

    /// <summary>
    /// Returns true when the retained bytes, oldest first, are exactly the characters of
    /// <paramref name="s"/>.
    /// </summary>
    public bool IsCurrentlyEqual(string s)
    {
        // Equality requires equal length. Without this check a strict prefix of the contents
        // would be reported as equal, and slots never written would take part in the comparison.
        if (s.Length != count)
        {
            return false;
        }

        return MatchesAt(0, s);
    }

    /// <summary>
    /// Copies the retained bytes, oldest first, into a newly allocated array.
    /// </summary>
    public ReadOnlySpan<byte> AsSpan()
    {
        var copy = new byte[count];

        // The retained bytes occupy at most two contiguous runs: from 'start' to the end of
        // the backing array, then from the beginning of the backing array.
        var firstRun = Math.Min(count, buffer.Length - start);
        Array.Copy(buffer, start, copy, 0, firstRun);
        Array.Copy(buffer, 0, copy, firstRun, count - firstRun);

        return copy;
    }

    /// <summary>
    /// Decodes the retained bytes, oldest first, as ASCII text.
    /// </summary>
    public override string ToString()
    {
        return Encoding.ASCII.GetString(AsSpan());
    }

    // Compares s against the retained bytes beginning at the given logical index.
    // The char is deliberately not narrowed to byte, so characters above 255 never match.
    private bool MatchesAt(int logicalStart, string s)
    {
        for (var i = 0; i < s.Length; i++)
        {
            if (buffer[IndexToBufferIndex(logicalStart + i)] != s[i])
            {
                return false;
            }
        }

        return true;
    }

    // Maps a logical index (0 = oldest retained byte) to a position in the backing array.
    private int IndexToBufferIndex(int i) => (start + i) % buffer.Length;
}