mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-08-20 04:48:53 +08:00
* Fix an off-by-one error in, and optimize, the brute-force xref search (#1078). When performing a brute-force xref search we were ending up off by one; the search now uses a ring buffer to reduce seeking and to fix xref detection. * Make the method testable and add test coverage. * Normalize test input on other platforms. * Seal the circular buffer class.
This commit is contained in:
parent
016b754c5b
commit
9503f9c137
@ -0,0 +1,128 @@
|
||||
namespace UglyToad.PdfPig.Tests.Parser.FileStructure;
|
||||
|
||||
using PdfPig.Parser.FileStructure;
|
||||
using PdfPig.Tokenization.Scanner;
|
||||
using PdfPig.Tokens;
|
||||
|
||||
/// <summary>
/// Tests for <see cref="XrefOffsetValidator.BruteForceSearchForTables"/>.
/// The asserted offsets are positions in the CRLF-normalized fixture text.
/// </summary>
public class XrefOffsetValidatorTests
{
    /// <summary>
    /// A document containing two cross-reference sections must report both offsets.
    /// </summary>
    [Fact]
    public void FindsTwoXrefs()
    {
        var content =
            """
            %PDF-1.7
            %âãÏÓ
            5 0 obj
            <</Filter/FlateDecode/Length 66>>stream
            abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
            endstream
            endobj
            xref0 1
            0000000000 65535 f
            0000000500 00000 n
            4 0 obj
            <</Contents 5 0 R/MediaBox[0 0 595 842]/Parent 2 0 R/Resources<</Font<</F1 6 0 R>>>>/TrimBox[0 0 595 842]/Type/Page>>
            endobj
            xref
            0 3
            0000000000 65535 f
            0000000443 00000 n
            0000000576 00000 n
            trailer
            << /Size 100 /Root 100 >>
            startxref
            9000
            %%EOF
            """;

        var ib = StringBytesTestConverter.Convert(NormalizeToCrLf(content), false);

        var results = XrefOffsetValidator.BruteForceSearchForTables(ib.Bytes);

        Assert.Contains(144, results);
        Assert.Contains(331, results);
    }

    /// <summary>
    /// A truncated real-world document must report both cross-reference offsets, and a
    /// token scanner positioned at the first reported offset must read the xref operator.
    /// </summary>
    [Fact]
    public void FindsXrefsFromRealFileTruncated()
    {
        var content =
            """
            %PDF-1.5
            %âãÏÓ
            5 0 obj <</Linearized 1/L 4631/O 8/E 1125/N 1/T 4485/H [ 436 129]>>
            endobj
            xref
            5 7
            0000000016 00000 n
            0000000565 00000 n
            0000000436 00000 n
            0000000639 00000 n
            0000000796 00000 n
            0000001001 00000 n
            0000001098 00000 n
            trailer
            <</Size 12/Prev 4475/Root 6 0 R/Info 4 0 R/ID[<2c9c3edf9641f1459e947e7f933f6da0><2c9c3edf9641f1459e947e7f933f6da0>]>>
            startxref
            0
            %%EOF
            7 0 obj<</Length 52/Filter/FlateDecode/O 67/S 38>>stream
            3842973893927327893237832738923732923782987348
            endstream
            endobj
            6 0 obj<</Pages 2 0 R/Outlines 1 0 R/Type/Catalog/Metadata 3 0 R>>
            endobj
            8 0 obj<</Contents 9 0 R/Type/Page/Parent 2 0 R/Rotate 0/MediaBox[0 0 612 792]/CropBox[0 0 612 792]/Resources<</Font<</F1 10 0 R>>/ProcSet 11 0 R>>>>
            endobj
            9 0 obj<</Length 137/Filter/FlateDecode>>stream
            abajsgiwgbkeeuuehxh9x3oihx2h802chc280h2082x
            endstream
            endobj
            10 0 obj<</Type/Font/Name/F1/Encoding/MacRomanEncoding/BaseFont/Helvetica/Subtype/Type1>>
            endobj
            11 0 obj[/PDF/Text]
            endobj
            1 0 obj<</Count 0/Type/Outlines>>
            endobj
            2 0 obj<</Count 1/Kids[8 0 R]/Type/Pages>>
            endobj
            4 0 obj<</ModDate(D:20070213222810-05'00')/CreationDate(D:20070213222810-05'00')>>
            endobj
            xref
            0 5
            0000000000 65535 f
            0000001125 00000 n
            0000001166 00000 n
            0000001216 00000 n
            0000004385 00000 n
            trailer
            <</Size 5>>
            startxref
            116
            %%EOF
            """;

        var ib = StringBytesTestConverter.Convert(NormalizeToCrLf(content), false);

        var results = XrefOffsetValidator.BruteForceSearchForTables(ib.Bytes);

        Assert.Contains(98, results);
        Assert.Contains(1186, results);

        // The reported offset must be usable as-is: scanning from it has to yield "xref".
        ib.Bytes.Seek(98);
        var scanner = new CoreTokenScanner(ib.Bytes, false);
        Assert.True(scanner.MoveNext());
        Assert.Equal(OperatorToken.Xref, scanner.CurrentToken);
    }

    /// <summary>
    /// Rewrites every line ending in <paramref name="text"/> as CRLF.
    /// </summary>
    /// <remarks>
    /// The newlines inside a raw string literal are whatever the source file was checked out
    /// with, which is independent of <c>Environment.NewLine</c>. Collapsing CRLF to LF first
    /// makes the conversion idempotent, so the result is the same for LF and CRLF checkouts.
    /// </remarks>
    /// <param name="text">The fixture text to normalize.</param>
    /// <returns>The text with CRLF as its only line ending.</returns>
    private static string NormalizeToCrLf(string text)
    {
        return text.Replace("\r\n", "\n").Replace("\n", "\r\n");
    }
}
|
47
src/UglyToad.PdfPig.Tests/Util/CircularByteBufferTests.cs
Normal file
47
src/UglyToad.PdfPig.Tests/Util/CircularByteBufferTests.cs
Normal file
@ -0,0 +1,47 @@
|
||||
namespace UglyToad.PdfPig.Tests.Util;
|
||||
|
||||
using PdfPig.Util;
|
||||
using System;
|
||||
|
||||
/// <summary>
/// Checks what <see cref="CircularByteBuffer"/> reports after being fed more, or fewer,
/// bytes than its capacity.
/// </summary>
public class CircularByteBufferTests
{
    /// <summary>
    /// Feeding six bytes into a three-byte buffer leaves only the last three visible.
    /// </summary>
    [Fact]
    public void CanExceedCapacity()
    {
        var ring = Filled(3, "123456"u8);

        Assert.True(ring.IsCurrentlyEqual("456"));

        Assert.True("456"u8.SequenceEqual(ring.AsSpan()));

        Assert.True(ring.EndsWith("6"));
        Assert.True(ring.EndsWith("56"));
        Assert.True(ring.EndsWith("456"));
        Assert.False(ring.EndsWith("3456"));
    }

    /// <summary>
    /// Feeding six bytes into a nine-byte buffer keeps all six, in insertion order.
    /// </summary>
    [Fact]
    public void CanUndershootCapacity()
    {
        var ring = Filled(9, "123456"u8);

        Assert.True(ring.IsCurrentlyEqual("123456"));

        Assert.True(ring.EndsWith("3456"));
        Assert.False(ring.EndsWith("123"));

        Assert.True("123456"u8.SequenceEqual(ring.AsSpan()));
    }

    /// <summary>
    /// Creates a buffer of the given capacity and adds each byte of
    /// <paramref name="bytes"/> to it in order.
    /// </summary>
    private static CircularByteBuffer Filled(int capacity, ReadOnlySpan<byte> bytes)
    {
        var ring = new CircularByteBuffer(capacity);

        foreach (var value in bytes)
        {
            ring.Add(value);
        }

        return ring;
    }
}
|
@ -6,6 +6,7 @@
|
||||
using Logging;
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
using Util;
|
||||
|
||||
internal sealed class XrefOffsetValidator
|
||||
{
|
||||
@ -90,7 +91,10 @@
|
||||
long newOffsetTable = -1;
|
||||
long newOffsetStream = -1;
|
||||
|
||||
BruteForceSearchForTables(reader);
|
||||
if (bfSearchXRefTablesOffsets == null)
|
||||
{
|
||||
bfSearchXRefTablesOffsets = BruteForceSearchForTables(reader);
|
||||
}
|
||||
|
||||
BfSearchForXRefStreams(reader);
|
||||
|
||||
@ -217,40 +221,39 @@
|
||||
bytes.Seek(startOffset);
|
||||
}
|
||||
|
||||
private void BruteForceSearchForTables(IInputBytes bytes)
|
||||
public static List<long> BruteForceSearchForTables(IInputBytes bytes)
|
||||
{
|
||||
if (bfSearchXRefTablesOffsets != null)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// a pdf may contain more than one xref entry
|
||||
bfSearchXRefTablesOffsets = new List<long>();
|
||||
var resultOffsets = new List<long>();
|
||||
|
||||
var startOffset = bytes.CurrentOffset;
|
||||
|
||||
bytes.Seek(MinimumSearchOffset);
|
||||
|
||||
var buffer = new CircularByteBuffer(XRefBytes.Length + 1);
|
||||
|
||||
// search for xref tables
|
||||
while (bytes.MoveNext() && !bytes.IsAtEnd())
|
||||
{
|
||||
if (ReadHelper.IsString(bytes, XRefBytes))
|
||||
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
|
||||
{
|
||||
var newOffset = bytes.CurrentOffset;
|
||||
// Normalize whitespace
|
||||
buffer.Add((byte)' ');
|
||||
}
|
||||
else
|
||||
{
|
||||
buffer.Add(bytes.CurrentByte);
|
||||
}
|
||||
|
||||
bytes.Seek(newOffset - 1);
|
||||
|
||||
// ensure that we don't read "startxref" instead of "xref"
|
||||
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
|
||||
{
|
||||
bfSearchXRefTablesOffsets.Add(newOffset);
|
||||
}
|
||||
|
||||
bytes.Seek(newOffset + 4);
|
||||
if (buffer.IsCurrentlyEqual(" xref"))
|
||||
{
|
||||
resultOffsets.Add(bytes.CurrentOffset - 4);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.Seek(startOffset);
|
||||
|
||||
return resultOffsets;
|
||||
}
|
||||
|
||||
private void BfSearchForXRefStreams(IInputBytes bytes)
|
||||
|
90
src/UglyToad.PdfPig/Util/CircularByteBuffer.cs
Normal file
90
src/UglyToad.PdfPig/Util/CircularByteBuffer.cs
Normal file
@ -0,0 +1,90 @@
|
||||
namespace UglyToad.PdfPig.Util;
|
||||
|
||||
using System;
|
||||
using System.Text;
|
||||
|
||||
/// <summary>
/// A fixed-capacity ring buffer of bytes. Once <paramref name="size"/> bytes have been added,
/// each further <see cref="Add"/> overwrites the oldest retained byte, so the buffer always
/// exposes the most recently added bytes in insertion order.
/// </summary>
/// <param name="size">The maximum number of bytes retained.</param>
internal sealed class CircularByteBuffer(int size)
{
    private readonly byte[] buffer = new byte[size];

    // Physical index in 'buffer' of the oldest retained byte (logical index 0).
    private int start;

    // Number of bytes currently retained; never exceeds buffer.Length.
    private int count;

    /// <summary>
    /// Appends a byte, discarding the oldest retained byte when the buffer is already full.
    /// </summary>
    /// <param name="b">The byte to append.</param>
    public void Add(byte b)
    {
        var insertionPosition = (start + count) % buffer.Length;

        buffer[insertionPosition] = b;
        if (count < buffer.Length)
        {
            count++;
        }
        else
        {
            // Full: the slot just written held the oldest byte, so the window slides by one.
            start = (start + 1) % buffer.Length;
        }
    }

    /// <summary>
    /// Determines whether the most recently added bytes match <paramref name="s"/>,
    /// comparing each byte against the corresponding character's code unit.
    /// </summary>
    /// <param name="s">The text to compare with the tail of the buffer.</param>
    /// <returns>
    /// <c>true</c> if the last <c>s.Length</c> retained bytes equal <paramref name="s"/>;
    /// <c>false</c> if they differ or fewer than <c>s.Length</c> bytes are retained.
    /// </returns>
    public bool EndsWith(string s)
    {
        if (s.Length > count)
        {
            return false;
        }

        return MatchesAt(count - s.Length, s);
    }

    /// <summary>
    /// Determines whether the retained bytes are exactly <paramref name="s"/>.
    /// </summary>
    /// <param name="s">The text to compare with the whole retained content.</param>
    /// <returns>
    /// <c>true</c> only if the number of retained bytes equals <c>s.Length</c> and every
    /// byte equals the corresponding character's code unit.
    /// </returns>
    public bool IsCurrentlyEqual(string s)
    {
        // Compare against the retained count, not the capacity: slots that were never
        // written must not take part in the comparison, and a strict prefix is not equal.
        if (s.Length != count)
        {
            return false;
        }

        return MatchesAt(0, s);
    }

    /// <summary>
    /// Copies the retained bytes, oldest first, into a new array.
    /// </summary>
    /// <returns>A span over a fresh copy; later calls to <see cref="Add"/> do not affect it.</returns>
    public ReadOnlySpan<byte> AsSpan()
    {
        var copy = new byte[count];
        for (var i = 0; i < count; i++)
        {
            copy[i] = buffer[IndexToBufferIndex(i)];
        }

        return copy;
    }

    /// <summary>
    /// Decodes the retained bytes as ASCII text.
    /// </summary>
    public override string ToString()
    {
        return Encoding.ASCII.GetString(AsSpan());
    }

    // Compares s with the retained bytes starting at the given logical offset.
    // Callers guarantee that logicalOffset + s.Length does not exceed count.
    private bool MatchesAt(int logicalOffset, string s)
    {
        for (var i = 0; i < s.Length; i++)
        {
            // Compare without narrowing the char: casting to byte would let a character
            // above 0xFF (e.g. U+0141) match the byte formed by its low eight bits.
            if (buffer[IndexToBufferIndex(logicalOffset + i)] != s[i])
            {
                return false;
            }
        }

        return true;
    }

    // Maps a logical index (0 = oldest retained byte) to its physical slot in 'buffer'.
    private int IndexToBufferIndex(int i) => (start + i) % buffer.Length;
}
|
Loading…
Reference in New Issue
Block a user