Change PDF stream end token detection to use a queue

EliotJones 2025-07-06 16:24:30 -05:00
parent bf664c3f0b
commit 282146f389
5 changed files with 307 additions and 134 deletions
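
For orientation, the sketch below illustrates the idea the diff moves to: instead of advancing per-keyword match counters byte by byte, remember the offset of every 'e' seen in a queue of candidates and keep the most recent bytes in a small rolling window, then test whether the bytes ending at the current offset spell out the keyword once a candidate is old enough. This is a simplified, self-contained illustration and not the PdfPig code itself; the names (EndTokenScanSketch, FindEndstreamOffsets) are invented for the example, and it only looks for "endstream", whereas the real scanner also tracks "endobj" and a trailing whitespace delimiter.

using System;
using System.Collections.Generic;

internal static class EndTokenScanSketch
{
    private static ReadOnlySpan<byte> Endstream => "endstream"u8;

    // Returns the offsets at which the bytes "endstream" begin in the input.
    public static List<int> FindEndstreamOffsets(byte[] input)
    {
        var results = new List<int>();
        var recent = new byte[Endstream.Length];   // the last 9 bytes seen, oldest first
        var candidateOffsets = new Queue<int>();   // offsets of 'e' bytes still in play

        for (var i = 0; i < input.Length; i++)
        {
            // Slide the window: drop the oldest byte, append the new one.
            Array.Copy(recent, 1, recent, 0, recent.Length - 1);
            recent[recent.Length - 1] = input[i];

            if (input[i] == (byte)'e')
            {
                candidateOffsets.Enqueue(i);
            }

            // Only the oldest candidate can have reached the full keyword length.
            while (candidateOffsets.Count > 0 &&
                   i - candidateOffsets.Peek() >= Endstream.Length - 1)
            {
                var start = candidateOffsets.Dequeue();

                if (i - start == Endstream.Length - 1 &&
                    recent.AsSpan().SequenceEqual(Endstream))
                {
                    results.Add(start);
                }
            }
        }

        return results;
    }
}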

View File

@@ -1,36 +1,25 @@
 namespace UglyToad.PdfPig.Tests.Integration
 {
-    //using System.Diagnostics;
+    using System.Diagnostics;
 
     /// <summary>
     /// A class for testing files which are not checked in to source control.
    /// </summary>
     public class LocalTests
     {
-        //[Fact]
-        //public void Tests()
-        //{
-        //    var files = Directory.GetFiles(@"C:\temp\pdfs", "*.pdf");
-
-        //    foreach (var file in files)
-        //    {
-        //        try
-        //        {
-        //            using (var document = PdfDocument.Open(file, new ParsingOptions { UseLenientParsing = false }))
-        //            {
-        //                for (var i = 1; i <= document.NumberOfPages; i++)
-        //                {
-        //                    var page = document.GetPage(i);
-        //                    var text = page.Text;
-        //                    Trace.WriteLine(text);
-        //                }
-        //            }
-        //        }
-        //        catch (Exception ex)
-        //        {
-        //            throw new InvalidOperationException($"Error parsing: {Path.GetFileName(file)}.", ex);
-        //        }
-        //    }
-        //}
+        [Fact]
+        public void Tests()
+        {
+            var file = File.ReadAllBytes(@"D:\temp\200708170550023.pdf");
+            using (var document = PdfDocument.Open(file, new ParsingOptions { UseLenientParsing = false }))
+            {
+                for (var i = 1; i <= document.NumberOfPages; i++)
+                {
+                    var page = document.GetPage(i);
+                    var text = page.Text;
+                    Trace.WriteLine(text);
+                }
+            }
+        }
     }
 }

View File

@@ -406,6 +406,49 @@ endobj";
             Assert.Equal(7, token.Number.ObjectNumber);
         }
 
+        [Fact]
+        public void ReadsStreamWithDoubleEndstreamSimple()
+        {
+            const string s =
+                """
+                250 0 obj
+                << /Filter /FlateDecode >>
+                stream
+                012endstream
+                endstream
+                endobj
+                """;
+
+            var scanner = GetScanner(s);
+
+            var tokens = ReadToEnd(scanner);
+        }
+
+        [Fact]
+        public void ReadsStreamWithDoubleEndstream()
+        {
+            const string s =
+                """
+                1974 0 obj
+                <<
+                /Filter /FlateDecode
+                /Length 1975 0 R
+                >>
+                stream
+                ]ÔÏ@ñ'ð;øØ"Œg !Ué…Cÿ¨´ ö:B*Æ2äÀÛw¿™MZõ'þ°½ë]<ï>ïÆÓ­^|Ÿ/Ý>Ýêá4ösº^^ç.ÕÇôr«e[÷§îVÎü؝SµÈ7ïï×[:ïÆáRm6ÕâGþðzïõ‡Oýå˜>VosŸæÓøRøõ¼ÏçûםÎi¼ÕMµÝÖ}òƒ¾¦¯‡sª~ÛîϟŸn÷‡|Ïß+~Þ§T·~¾ŒÉt—>]§C—æÃøªM»ÜÖ­ÒØÿ÷ÙJããðïµ~†&msh ­YK4BK0yÈ¿rXVzš°Žà}$<zЁðDxò`þÐáAGÂ1:BÏða{B{$$Bа& „!ÂSÒä¿ýCC€B£ePHx´x
+                R<˜º@!á!>,âW@!á!¼œ@!áÑ2uBÂC=@!á¡þP(¤xðU
+                R< (¤°PHx(SW(4<S(4<´#@¡á¡ÌT¡Ð²><@¡á¡Œ¢PhxSW(4<õ¡Phxè Ç£PhY|Q
+                GëÃB¡e}à¡Phx˜¿ B¡áÑú°Phx´ÆÔ
+                +,ƒÂÂ#/× °²>3(¬.¡nPXx˜_ùC¡°²>x}ƒÂÂCx9ƒÂНoPXxˆš&ù!ÙگŠÿ jbky yyÛJØlØßw±îužóæ¦ï\ìY§1½ï«Ó.ÿùz°gAendstream
+                endstream
+                endobj
+                """;
+
+            var scanner = GetScanner(s);
+
+            var tokens = ReadToEnd(scanner);
+        }
+
         [Fact]
         public void ReadsStringsWithMissingEndBracket()
         {

View File

@@ -0,0 +1,46 @@
namespace UglyToad.PdfPig.Tests.Util;

using PdfPig.Util;

public class CircularByteBufferTests
{
    [Fact]
    public void CanExceedCapacity()
    {
        var buffer = new CircularByteBuffer(3);

        var input = "123456"u8;

        for (var i = 0; i < input.Length; i++)
        {
            buffer.Add(input[i]);
        }

        Assert.True(buffer.IsCurrentlyEqual("456"));
        Assert.True("456"u8.SequenceEqual(buffer.AsSpan()));

        Assert.True(buffer.EndsWith("6"));
        Assert.True(buffer.EndsWith("56"));
        Assert.True(buffer.EndsWith("456"));
        Assert.False(buffer.EndsWith("3456"));
    }

    [Fact]
    public void CanUndershootCapacity()
    {
        var buffer = new CircularByteBuffer(9);

        var input = "123456"u8;

        for (var i = 0; i < input.Length; i++)
        {
            buffer.Add(input[i]);
        }

        Assert.True(buffer.IsCurrentlyEqual("123456"));
        Assert.True(buffer.EndsWith("3456"));
        Assert.False(buffer.EndsWith("123"));
        Assert.True("123456"u8.SequenceEqual(buffer.AsSpan()));
    }
}

View File

@@ -10,11 +10,15 @@
     using Core;
     using Encryption;
     using Filters;
+    using System.Text;
     using Tokens;
+    using Util;
 
     internal class PdfTokenScanner : IPdfTokenScanner
     {
         private static ReadOnlySpan<byte> EndstreamBytes => "endstream"u8;
+        private static ReadOnlySpan<byte> EndObjBytes => "endobj"u8;
+        private static ReadOnlySpan<byte> StartstreamBytes => "stream"u8;
 
         private static readonly Regex EndsWithNumberRegex = new Regex(@"(?<=^[^\s\d]+)\d+$");
@@ -178,20 +182,20 @@
                     coreTokenScanner.Seek(previousTokenPositions[2]);
                     break;
                 }
 
                 if (readTokens.Count == 1)
                 {
                     // An obj was encountered after reading the actual token and the object and generation number of the following token.
                     var actualReference = new IndirectReference(objectNumber.Int, generation.Int);
                     var actualToken = encryptionHandler.Decrypt(actualReference, readTokens[0]);
 
                     CurrentToken = new ObjectToken(startPosition, actualReference, actualToken);
 
                     readTokens.Clear();
                     coreTokenScanner.Seek(previousTokenPositions[2]);
 
                     return true;
                 }
 
                 // This should never happen.
                 Debug.Assert(false, $"Encountered a '{coreTokenScanner.CurrentToken}' operator before the end of the previous object.");
                 return false;
@@ -311,6 +315,9 @@
         {
             stream = null;
 
+            // Used for shared reading of "stream", "endstream" and "endobj" candidates.
+            var buffer = new byte[EndstreamBytes.Length];
+
             DictionaryToken streamDictionaryToken = GetStreamDictionary();
 
             // Get the expected length from the stream dictionary if present.
@@ -322,7 +329,7 @@
             }
 
             // Verify again that we start with "stream"
-            var hasStartStreamToken = ReadStreamTokenStart(inputBytes, startStreamTokenOffset);
+            var hasStartStreamToken = ReadStreamTokenStart(inputBytes, startStreamTokenOffset, buffer);
             if (!hasStartStreamToken)
             {
                 return false;
@@ -349,9 +356,9 @@
                     {
                         inputBytes.Seek(inputBytes.CurrentOffset - 1);
                     }
 
                     break;
                 }
             } while ((char)inputBytes.CurrentByte != '\n');
 
             // Store where we started reading the first byte of data.
@@ -360,16 +367,7 @@
             // Store how many bytes we have read for checking against Length.
             long read = 0;
 
-            // We want to check if we ever read 'endobj' or 'endstream'.
-            int endObjPosition = 0;
-            int endStreamPosition = 0;
-            int commonPartPosition = 0;
-
-            const string endWordPart = "end";
-            const string streamPart = "stream";
-            const string objPart = "obj";
-
-            if (TryReadUsingLength(inputBytes, length, startDataOffset, out var streamData))
+            if (TryReadUsingLength(inputBytes, length, startDataOffset, buffer, out var streamData))
             {
                 stream = new StreamToken(streamDictionaryToken, streamData);
                 return true;
@@ -379,99 +377,100 @@
             PossibleStreamEndLocation? possibleEndLocation = null;
 
+            // We're looking for either 'endobj' or 'endstream', so we look at every 'e'.
+            const byte sentinelByte = (byte)'e';
+
+            var queue = new CircularByteBuffer(EndstreamBytes.Length + 1);
+            var sentinelPosQueue = new Queue<long>();
+            var endLocations = new Stack<long>();
+
             while (inputBytes.MoveNext())
             {
-                if (length.HasValue && read == length)
+                if (inputBytes.CurrentByte == sentinelByte)
                 {
-                    // TODO: read ahead and check we're at the end...
-                    // break;
+                    sentinelPosQueue.Enqueue(inputBytes.CurrentOffset);
+                    queue.Add(inputBytes.CurrentByte);
                 }
-
-                // We are reading 'end' (possibly).
-                if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
+                else if (sentinelPosQueue.Count > 0)
                 {
-                    commonPartPosition++;
-                }
-                else if (commonPartPosition == endWordPart.Length)
-                {
-                    // We are reading 'stream' after 'end'
-                    if (inputBytes.CurrentByte == streamPart[endStreamPosition])
+                    if (ReadHelper.IsWhitespace(inputBytes.CurrentByte))
                     {
-                        endObjPosition = 0;
-                        endStreamPosition++;
-
-                        // We've finished reading 'endstream', add it to the end tokens we've seen.
-                        if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
-                        {
-                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
-                            possibleEndLocation = token;
-
-                            if (length.HasValue && read > length)
-                            {
-                                break;
-                            }
-
-                            endStreamPosition = 0;
-                        }
-                    }
-                    else if (inputBytes.CurrentByte == objPart[endObjPosition])
-                    {
-                        // We are reading 'obj' after 'end'
-                        endStreamPosition = 0;
-                        endObjPosition++;
-
-                        // We have finished reading 'endobj'.
-                        if (endObjPosition == objPart.Length)
-                        {
-                            // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
-                            if (possibleEndLocation != null)
-                            {
-                                var lastEndToken = possibleEndLocation.Value;
-                                inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
-                                break;
-                            }
-
-                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
-                            possibleEndLocation = token;
-
-                            if (read > length)
-                            {
-                                break;
-                            }
-                        }
+                        // Normalize whitespace
+                        queue.Add((byte)' ');
                     }
                     else
                     {
-                        // We were reading 'end' but then we had a character mismatch.
-                        // Reset all the counters.
-                        endStreamPosition = 0;
-                        endObjPosition = 0;
-                        commonPartPosition = 0;
+                        queue.Add(inputBytes.CurrentByte);
                     }
-                }
-                else
-                {
-                    // For safety reset every counter in case we had a partial read.
-                    endStreamPosition = 0;
-                    endObjPosition = 0;
-                    commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
+
+                    bool hasDequeuePotential;
+                    do
+                    {
+                        hasDequeuePotential = false;
+                        var currPos = sentinelPosQueue.Peek();
+                        var distanceFromSentinel = inputBytes.CurrentOffset - currPos;
+
+                        if (distanceFromSentinel > EndstreamBytes.Length)
+                        {
+                            sentinelPosQueue.Dequeue();
+                            hasDequeuePotential = sentinelPosQueue.Count > 0;
+                        }
+
+                        if (distanceFromSentinel == EndstreamBytes.Length)
+                        {
+                            var isEndStream = queue.EndsWith("endstream ");
+                            if (isEndStream)
+                            {
+                                endLocations.Push(currPos);
+                                sentinelPosQueue.Clear();
+                            }
+                            else
+                            {
+                                sentinelPosQueue.Dequeue();
+                            }
+                        }
+                        else if (distanceFromSentinel == EndObjBytes.Length)
+                        {
+                            var isEndObj = queue.EndsWith("endobj ");
+                            if (isEndObj)
+                            {
+                                endLocations.Push(-currPos);
+                                sentinelPosQueue.Clear();
+                            }
+                            else
+                            {
+                                sentinelPosQueue.Dequeue();
+                            }
+                        }
+                    } while (hasDequeuePotential);
                 }
 
                 read++;
             }
 
+            if (sentinelPosQueue.Count > 0)
+            {
+                var isEndObj = queue.EndsWith("endobj");
+                if (isEndObj)
+                {
+                    var location = inputBytes.CurrentOffset - EndObjBytes.Length + 1;
+                    endLocations.Push(-location);
+                }
+                else
+                {
+                    var isEndStr = queue.EndsWith("endstream");
+                    if (isEndStr)
+                    {
+                        endLocations.Push(inputBytes.CurrentOffset - EndstreamBytes.Length + 1);
+                    }
+                }
+            }
+
             long streamDataEnd = inputBytes.CurrentOffset + 1;
 
             if (possibleEndLocation == null)
             {
                 return false;
             }
 
             var lastEnd = possibleEndLocation;
@@ -502,7 +501,12 @@
             return true;
         }
 
-        private static bool TryReadUsingLength(IInputBytes inputBytes, long? length, long startDataOffset, [NotNullWhen(true)] out byte[]? data)
+        private static bool TryReadUsingLength(
+            IInputBytes inputBytes,
+            long? length,
+            long startDataOffset,
+            byte[] buffer,
+            [NotNullWhen(true)] out byte[]? data)
         {
             data = null;
@@ -511,8 +515,6 @@
                 return false;
             }
 
-            var readBuffer = new byte[EndstreamBytes.Length];
-
             var newlineCount = 0;
 
             inputBytes.Seek(length.Value + startDataOffset);
@@ -533,20 +535,17 @@
                 }
             }
 
-            var readLength = inputBytes.Read(readBuffer);
+            var readLength = inputBytes.Read(buffer);
 
-            if (readLength != readBuffer.Length)
+            if (readLength != EndstreamBytes.Length)
             {
                 return false;
             }
 
-            for (var i = 0; i < EndstreamBytes.Length; i++)
+            if (!ByteArraysEqual(buffer, EndstreamBytes))
             {
-                if (readBuffer[i] != EndstreamBytes[i])
-                {
-                    inputBytes.Seek(startDataOffset);
-                    return false;
-                }
+                inputBytes.Seek(startDataOffset);
+                return false;
             }
 
             inputBytes.Seek(startDataOffset);
@@ -560,7 +559,7 @@
                 throw new InvalidOperationException($"Reading using the stream length failed to read as many bytes as the stream specified. Wanted {length.Value}, got {countRead} at {startDataOffset + 1}.");
             }
 
-            inputBytes.Read(readBuffer);
+            inputBytes.Read(buffer);
 
             // Skip for the line break before 'endstream'.
             for (var i = 0; i < newlineCount; i++)
             {
@@ -657,22 +656,29 @@
             return length;
         }
 
-        private static bool ReadStreamTokenStart(IInputBytes input, long tokenStart)
+        private static bool ReadStreamTokenStart(IInputBytes input, long tokenStart, byte[] buffer)
         {
             input.Seek(tokenStart);
 
-            for (var i = 0; i < OperatorToken.StartStream.Data.Length; i++)
+            var readCount = input.Read(buffer);
+
+            if (readCount < StartstreamBytes.Length
+                || !ByteArraysEqual(buffer.AsSpan(0, StartstreamBytes.Length), StartstreamBytes))
             {
-                if (!input.MoveNext() || input.CurrentByte != OperatorToken.StartStream.Data[i])
-                {
-                    input.Seek(tokenStart);
-                    return false;
-                }
+                input.Seek(tokenStart);
+                return false;
             }
 
+            input.Seek(tokenStart + StartstreamBytes.Length);
+
             return true;
         }
 
+        private static bool ByteArraysEqual(ReadOnlySpan<byte> array1, ReadOnlySpan<byte> array2)
+        {
+            return array1.SequenceEqual(array2);
+        }
+
         public bool TryReadToken<T>(out T token) where T : class, IToken
         {
             if (isDisposed)
@@ -712,7 +718,7 @@
             coreTokenScanner.DeregisterCustomTokenizer(tokenizer);
         }
 
         public ObjectToken? Get(IndirectReference reference)
         {
             if (isDisposed)

View File

@@ -0,0 +1,89 @@
namespace UglyToad.PdfPig.Util;

using System.Text;

internal class CircularByteBuffer(int size)
{
    private readonly byte[] buffer = new byte[size];

    private int start;
    private int count;

    public void Add(byte b)
    {
        var insertionPosition = (start + count) % buffer.Length;

        buffer[insertionPosition] = b;

        if (count < buffer.Length)
        {
            count++;
        }
        else
        {
            start = (start + 1) % buffer.Length;
        }
    }

    public bool EndsWith(string s)
    {
        if (s.Length > count)
        {
            return false;
        }

        for (var i = 0; i < s.Length; i++)
        {
            var str = s[i];
            var inBuffer = count - (s.Length - i);
            var buff = buffer[IndexToBufferIndex(inBuffer)];

            if (buff != str)
            {
                return false;
            }
        }

        return true;
    }

    public bool IsCurrentlyEqual(string s)
    {
        if (s.Length > buffer.Length)
        {
            return false;
        }

        for (var i = 0; i < s.Length; i++)
        {
            var b = (byte)s[i];
            var buff = buffer[IndexToBufferIndex(i)];

            if (b != buff)
            {
                return false;
            }
        }

        return true;
    }

    public ReadOnlySpan<byte> AsSpan()
    {
        Span<byte> tmp = new byte[count];

        for (int i = 0; i < count; i++)
        {
            tmp[i] = buffer[IndexToBufferIndex(i)];
        }

        return tmp;
    }

    public override string ToString()
    {
        return Encoding.ASCII.GetString(AsSpan());
    }

    private int IndexToBufferIndex(int i) => (start + i) % buffer.Length;
}
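
As a rough usage sketch of how the scanner above drives this buffer (a standalone snippet with an invented input; it assumes access to the internal CircularByteBuffer type, for example from within the project or its test assembly): whitespace bytes are collapsed to a single space before being added, so EndsWith("endstream ") only matches when the keyword is followed by a delimiter.

using System;
using UglyToad.PdfPig.Util;

// One byte longer than "endstream" so the delimiter that follows the keyword also
// fits in the window, matching the scanner's CircularByteBuffer(EndstreamBytes.Length + 1).
var window = new CircularByteBuffer("endstream".Length + 1);

foreach (var b in "012 endstream\n"u8)
{
    // Mirror the scanner: collapse any whitespace byte to a single space before adding it.
    var normalized = b == (byte)' ' || b == (byte)'\r' || b == (byte)'\n' || b == (byte)'\t'
        ? (byte)' '
        : b;

    window.Add(normalized);
}

// True: the most recent bytes are the keyword followed by (normalized) whitespace.
Console.WriteLine(window.EndsWith("endstream "));

// False: no 'endobj' was seen.
Console.WriteLine(window.EndsWith("endobj "));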