diff --git a/src/UglyToad.PdfPig.Tests/Integration/LocalTests.cs b/src/UglyToad.PdfPig.Tests/Integration/LocalTests.cs index ec34b3d7..517d9b1e 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/LocalTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/LocalTests.cs @@ -1,36 +1,25 @@ namespace UglyToad.PdfPig.Tests.Integration { - //using System.Diagnostics; + using System.Diagnostics; /// /// A class for testing files which are not checked in to source control. /// public class LocalTests { - //[Fact] - //public void Tests() - //{ - // var files = Directory.GetFiles(@"C:\temp\pdfs", "*.pdf"); - - // foreach (var file in files) - // { - // try - // { - // using (var document = PdfDocument.Open(file, new ParsingOptions { UseLenientParsing = false })) - // { - // for (var i = 1; i <= document.NumberOfPages; i++) - // { - // var page = document.GetPage(i); - // var text = page.Text; - // Trace.WriteLine(text); - // } - // } - // } - // catch (Exception ex) - // { - // throw new InvalidOperationException($"Error parsing: {Path.GetFileName(file)}.", ex); - // } - // } - //} + [Fact] + public void Tests() + { + var file = File.ReadAllBytes(@"D:\temp\200708170550023.pdf"); + using (var document = PdfDocument.Open(file, new ParsingOptions { UseLenientParsing = false })) + { + for (var i = 1; i <= document.NumberOfPages; i++) + { + var page = document.GetPage(i); + var text = page.Text; + Trace.WriteLine(text); + } + } + } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs index 1e867d71..2d8f9c86 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs @@ -406,6 +406,49 @@ endobj"; Assert.Equal(7, token.Number.ObjectNumber); } + [Fact] + public void ReadsStreamWithDoubleEndstreamSimple() + { + const string s = + """ + 250 0 obj + << /Filter /FlateDecode >> + stream + 012endstream + endstream + endobj + """; + + var scanner = GetScanner(s); + + var tokens = ReadToEnd(scanner); + } + + [Fact] + public void ReadsStreamWithDoubleEndstream() + { + const string s = + """ + 1974 0 obj + << + /Filter /FlateDecode + /Length 1975 0 R + >> + stream + xœ]ÔÏnÚ@€ñ'ð;øØ"Œg !Ué…Cÿ¨´ ö:B*Æ2äÀÛw¿™MZõ'þ°½ë›]<ï>ïÆÓ­^|Ÿ/Ý>Ýêá4ösº^^ç.ÕÇôr«e[÷§îVÎü؝SµÈ7ïï×[:ïÆáRm6ÕâGþðz›ïõ‡Oýå˜>V‹osŸæÓøRøõ¼Ïçû×iúÎi¼ÕMµÝÖ}òƒ¾¦¯‡sª~ÛîϟŸn÷‡|Ïß+~Þ§T·~¾ŒÉt—>]§C—æÃø’ªM»ÜÖ›U³­ÒØÿ÷ÙJã–ãðïµ~†&msh ­Y„ –K‚4BK0‚yÈ¿rXVzš°Žà}$,âW@!á!¼œ@!áÑ2uBÂC=@!á¡þP(¤xðU + R< (¤xø°PHx(SW(4<”—S(4<´#@¡á¡ÌT¡Ð²><@¡á¡Œ¢PhxSW(4<”õ¡Phxè‘ …†Ç’£PhY|Q + …†GëÃB¡e}à¡Phx˜¿ †‡B¡áÑú°Phx´ÆÔ + +,ƒÂÂ#/× °²>3(¬xð.……‡¡nPXx˜_……‡ùC¡°²>x}ƒÂÂCx9ƒÂНoPXxˆ…š&ùPø!ÙÚ¯€ÂŠÿ•……‡ ¶jbky y‡yÛJØlØßw±îužó曦ï\ìY§1½ï«Óeâ.ÿùz°gAendstream + endstream + endobj + """; + + var scanner = GetScanner(s); + + var tokens = ReadToEnd(scanner); + } + [Fact] public void ReadsStringsWithMissingEndBracket() { diff --git a/src/UglyToad.PdfPig.Tests/Util/CircularByteBufferTests.cs b/src/UglyToad.PdfPig.Tests/Util/CircularByteBufferTests.cs new file mode 100644 index 00000000..f926282a --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Util/CircularByteBufferTests.cs @@ -0,0 +1,46 @@ +namespace UglyToad.PdfPig.Tests.Util; + +using PdfPig.Util; + +public class CircularByteBufferTests +{ + [Fact] + public void CanExceedCapacity() + { + var buffer = new CircularByteBuffer(3); + + var input = "123456"u8; + for (var i = 0; i < input.Length; i++) + { + buffer.Add(input[i]); + } + + 
+        Assert.True(buffer.IsCurrentlyEqual("456"));
+
+        Assert.True("456"u8.SequenceEqual(buffer.AsSpan()));
+
+        Assert.True(buffer.EndsWith("6"));
+        Assert.True(buffer.EndsWith("56"));
+        Assert.True(buffer.EndsWith("456"));
+        Assert.False(buffer.EndsWith("3456"));
+    }
+
+    [Fact]
+    public void CanUndershootCapacity()
+    {
+        var buffer = new CircularByteBuffer(9);
+
+        var input = "123456"u8;
+        for (var i = 0; i < input.Length; i++)
+        {
+            buffer.Add(input[i]);
+        }
+
+        Assert.True(buffer.IsCurrentlyEqual("123456"));
+
+        Assert.True(buffer.EndsWith("3456"));
+        Assert.False(buffer.EndsWith("123"));
+
+        Assert.True("123456"u8.SequenceEqual(buffer.AsSpan()));
+    }
+}
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
index 16f05707..e9b938b7 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -10,11 +10,15 @@
    using Core;
    using Encryption;
    using Filters;
+    using System.Text;
    using Tokens;
+    using Util;

    internal class PdfTokenScanner : IPdfTokenScanner
    {
        private static ReadOnlySpan<byte> EndstreamBytes => "endstream"u8;
+        private static ReadOnlySpan<byte> EndObjBytes => "endobj"u8;
+        private static ReadOnlySpan<byte> StartstreamBytes => "stream"u8;

        private static readonly Regex EndsWithNumberRegex = new Regex(@"(?<=^[^\s\d]+)\d+$");

@@ -178,20 +182,20 @@
                    coreTokenScanner.Seek(previousTokenPositions[2]);
                    break;
                }
-
+
                if (readTokens.Count == 1)
                {
                    // An obj was encountered after reading the actual token and the object and generation number of the following token.
                    var actualReference = new IndirectReference(objectNumber.Int, generation.Int);
                    var actualToken = encryptionHandler.Decrypt(actualReference, readTokens[0]);
-
+
                    CurrentToken = new ObjectToken(startPosition, actualReference, actualToken);

                    readTokens.Clear();

                    coreTokenScanner.Seek(previousTokenPositions[2]);
-
+
                    return true;
                }
-
+
                // This should never happen.
                Debug.Assert(false, $"Encountered a '{coreTokenScanner.CurrentToken}' operator before the end of the previous object.");
                return false;
@@ -311,6 +315,9 @@
        {
            stream = null;

+            // Used for shared reading of "stream", "endstream" and "endobj" candidates.
+            var buffer = new byte[EndstreamBytes.Length];
+
            DictionaryToken streamDictionaryToken = GetStreamDictionary();

            // Get the expected length from the stream dictionary if present.
@@ -322,7 +329,7 @@
            }

            // Verify again that we start with "stream"
-            var hasStartStreamToken = ReadStreamTokenStart(inputBytes, startStreamTokenOffset);
+            var hasStartStreamToken = ReadStreamTokenStart(inputBytes, startStreamTokenOffset, buffer);

            if (!hasStartStreamToken)
            {
                return false;
            }
@@ -349,9 +356,9 @@
                {
                    inputBytes.Seek(inputBytes.CurrentOffset - 1);
                }
+
                break;
            }
-
        } while ((char)inputBytes.CurrentByte != '\n');

            // Store where we started reading the first byte of data.
@@ -360,16 +367,7 @@
            // Store how many bytes we have read for checking against Length.
            long read = 0;

-            // We want to check if we ever read 'endobj' or 'endstream'.
-            int endObjPosition = 0;
-            int endStreamPosition = 0;
-            int commonPartPosition = 0;
-
-            const string endWordPart = "end";
-            const string streamPart = "stream";
-            const string objPart = "obj";
-
-            if (TryReadUsingLength(inputBytes, length, startDataOffset, out var streamData))
+            if (TryReadUsingLength(inputBytes, length, startDataOffset, buffer, out var streamData))
            {
                stream = new StreamToken(streamDictionaryToken, streamData);
                return true;
@@ -379,99 +377,100 @@
            PossibleStreamEndLocation? possibleEndLocation = null;
+            // We're looking for either 'endobj' or 'endstream', so we look at every 'e'.
+            const byte sentinelByte = (byte)'e';
+            var queue = new CircularByteBuffer(EndstreamBytes.Length + 1);
+            var sentinelPosQueue = new Queue<long>();
+            var endLocations = new Stack<long>();

            while (inputBytes.MoveNext())
            {
-                if (length.HasValue && read == length)
+                if (inputBytes.CurrentByte == sentinelByte)
                {
-                    // TODO: read ahead and check we're at the end...
-                    // break;
+                    sentinelPosQueue.Enqueue(inputBytes.CurrentOffset);
+                    queue.Add(inputBytes.CurrentByte);
                }
-
-                // We are reading 'end' (possibly).
-                if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
+                else if (sentinelPosQueue.Count > 0)
                {
-                    commonPartPosition++;
-                }
-                else if (commonPartPosition == endWordPart.Length)
-                {
-                    // We are reading 'stream' after 'end'
-                    if (inputBytes.CurrentByte == streamPart[endStreamPosition])
+                    if (ReadHelper.IsWhitespace(inputBytes.CurrentByte))
                    {
-                        endObjPosition = 0;
-                        endStreamPosition++;
-
-                        // We've finished reading 'endstream', add it to the end tokens we've seen.
-                        if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
-                        {
-                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
-
-                            possibleEndLocation = token;
-
-                            if (length.HasValue && read > length)
-                            {
-                                break;
-                            }
-
-                            endStreamPosition = 0;
-                        }
-                    }
-                    else if (inputBytes.CurrentByte == objPart[endObjPosition])
-                    {
-                        // We are reading 'obj' after 'end'
-
-                        endStreamPosition = 0;
-                        endObjPosition++;
-
-                        // We have finished reading 'endobj'.
-                        if (endObjPosition == objPart.Length)
-                        {
-                            // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
-                            if (possibleEndLocation != null)
-                            {
-                                var lastEndToken = possibleEndLocation.Value;
-
-                                inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
-
-                                break;
-                            }
-
-                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
-
-                            possibleEndLocation = token;
-
-                            if (read > length)
-                            {
-                                break;
-                            }
-                        }
+                        // Normalize whitespace
+                        queue.Add((byte)' ');
                    }
                    else
                    {
-                        // We were reading 'end' but then we had a character mismatch.
-                        // Reset all the counters.
-
-                        endStreamPosition = 0;
-                        endObjPosition = 0;
-                        commonPartPosition = 0;
+                        queue.Add(inputBytes.CurrentByte);
                    }
-                }
-                else
-                {
-                    // For safety reset every counter in case we had a partial read.
-                    endStreamPosition = 0;
-                    endObjPosition = 0;
-                    commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
+                    bool hasDequeuePotential;
+                    do
+                    {
+                        hasDequeuePotential = false;
+                        var currPos = sentinelPosQueue.Peek();
+                        var distanceFromSentinel = inputBytes.CurrentOffset - currPos;
+                        if (distanceFromSentinel > EndstreamBytes.Length)
+                        {
+                            sentinelPosQueue.Dequeue();
+                            hasDequeuePotential = sentinelPosQueue.Count > 0;
+                        }
+                        if (distanceFromSentinel == EndstreamBytes.Length)
+                        {
+                            var isEndStream = queue.EndsWith("endstream ");
+
+                            if (isEndStream)
+                            {
+                                endLocations.Push(currPos);
+                                sentinelPosQueue.Clear();
+                            }
+                            else
+                            {
+                                sentinelPosQueue.Dequeue();
+                            }
+                        }
+                        else if (distanceFromSentinel == EndObjBytes.Length)
+                        {
+                            var isEndObj = queue.EndsWith("endobj ");
+
+                            if (isEndObj)
+                            {
+                                endLocations.Push(-currPos);
+                                sentinelPosQueue.Clear();
+                            }
+                            else
+                            {
+                                sentinelPosQueue.Dequeue();
+                            }
+                        }
+                    } while (hasDequeuePotential);
                }

                read++;
            }

+            if (sentinelPosQueue.Count > 0)
+            {
+                var isEndObj = queue.EndsWith("endobj");
+                if (isEndObj)
+                {
+                    var location = inputBytes.CurrentOffset - EndObjBytes.Length + 1;
+                    endLocations.Push(-location);
+                }
+                else
+                {
+                    var isEndStr = queue.EndsWith("endstream");
+                    if (isEndStr)
+                    {
+                        endLocations.Push(inputBytes.CurrentOffset - EndstreamBytes.Length + 1);
+                    }
+                }
+            }
+
+            long streamDataEnd = inputBytes.CurrentOffset + 1;

            if (possibleEndLocation == null)
+            {
                return false;
+            }

            var lastEnd = possibleEndLocation;
@@ -502,7 +501,12 @@
            return true;
        }

-        private static bool TryReadUsingLength(IInputBytes inputBytes, long? length, long startDataOffset, [NotNullWhen(true)] out byte[]? data)
+        private static bool TryReadUsingLength(
+            IInputBytes inputBytes,
+            long? length,
+            long startDataOffset,
+            byte[] buffer,
+            [NotNullWhen(true)] out byte[]? data)
        {
            data = null;
@@ -511,8 +515,6 @@
                return false;
            }

-            var readBuffer = new byte[EndstreamBytes.Length];
-
            var newlineCount = 0;

            inputBytes.Seek(length.Value + startDataOffset);
@@ -533,20 +535,17 @@
                }
            }

-            var readLength = inputBytes.Read(readBuffer);
+            var readLength = inputBytes.Read(buffer);

-            if (readLength != readBuffer.Length)
+            if (readLength != EndstreamBytes.Length)
            {
                return false;
            }

-            for (var i = 0; i < EndstreamBytes.Length; i++)
+            if (!ByteArraysEqual(buffer, EndstreamBytes))
            {
-                if (readBuffer[i] != EndstreamBytes[i])
-                {
-                    inputBytes.Seek(startDataOffset);
-                    return false;
-                }
+                inputBytes.Seek(startDataOffset);
+                return false;
            }

            inputBytes.Seek(startDataOffset);
@@ -560,7 +559,7 @@
                throw new InvalidOperationException($"Reading using the stream length failed to read as many bytes as the stream specified. Wanted {length.Value}, got {countRead} at {startDataOffset + 1}.");
            }

-            inputBytes.Read(readBuffer);
+            inputBytes.Read(buffer);

            // Skip for the line break before 'endstream'.
            for (var i = 0; i < newlineCount; i++)
            {
@@ -657,22 +656,29 @@
            return length;
        }

-        private static bool ReadStreamTokenStart(IInputBytes input, long tokenStart)
+        private static bool ReadStreamTokenStart(IInputBytes input, long tokenStart, byte[] buffer)
        {
            input.Seek(tokenStart);

-            for (var i = 0; i < OperatorToken.StartStream.Data.Length; i++)
+            var readCount = input.Read(buffer);
+
+            if (readCount < StartstreamBytes.Length
+                || !ByteArraysEqual(buffer.AsSpan(0, StartstreamBytes.Length), StartstreamBytes))
            {
-                if (!input.MoveNext() || input.CurrentByte != OperatorToken.StartStream.Data[i])
-                {
-                    input.Seek(tokenStart);
-                    return false;
-                }
+                input.Seek(tokenStart);
+                return false;
            }

+            input.Seek(tokenStart + StartstreamBytes.Length);
+
            return true;
        }

+        private static bool ByteArraysEqual(ReadOnlySpan<byte> array1, ReadOnlySpan<byte> array2)
+        {
+            return array1.SequenceEqual(array2);
+        }
+
        public bool TryReadToken<T>(out T token) where T : class, IToken
        {
            if (isDisposed)
@@ -712,7 +718,7 @@
            coreTokenScanner.DeregisterCustomTokenizer(tokenizer);
        }
-
+
        public ObjectToken? Get(IndirectReference reference)
        {
            if (isDisposed)
diff --git a/src/UglyToad.PdfPig/Util/CircularByteBuffer.cs b/src/UglyToad.PdfPig/Util/CircularByteBuffer.cs
new file mode 100644
index 00000000..a160a712
--- /dev/null
+++ b/src/UglyToad.PdfPig/Util/CircularByteBuffer.cs
@@ -0,0 +1,89 @@
+namespace UglyToad.PdfPig.Util;
+
+using System.Text;
+
+internal class CircularByteBuffer(int size)
+{
+    private readonly byte[] buffer = new byte[size];
+
+    private int start;
+    private int count;
+
+    public void Add(byte b)
+    {
+        var insertionPosition = (start + count) % buffer.Length;
+
+        buffer[insertionPosition] = b;
+        if (count < buffer.Length)
+        {
+            count++;
+        }
+        else
+        {
+            start = (start + 1) % buffer.Length;
+        }
+    }
+
+    public bool EndsWith(string s)
+    {
+        if (s.Length > count)
+        {
+            return false;
+        }
+
+        for (var i = 0; i < s.Length; i++)
+        {
+            var str = s[i];
+
+            var inBuffer = count - (s.Length - i);
+
+            var buff = buffer[IndexToBufferIndex(inBuffer)];
+
+            if (buff != str)
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    public bool IsCurrentlyEqual(string s)
+    {
+        if (s.Length > buffer.Length)
+        {
+            return false;
+        }
+
+        for (var i = 0; i < s.Length; i++)
+        {
+            var b = (byte)s[i];
+            var buff = buffer[IndexToBufferIndex(i)];
+
+            if (b != buff)
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    public ReadOnlySpan<byte> AsSpan()
+    {
+        Span<byte> tmp = new byte[count];
+        for (int i = 0; i < count; i++)
+        {
+            tmp[i] = buffer[IndexToBufferIndex(i)];
+        }
+
+        return tmp;
+    }
+
+    public override string ToString()
+    {
+        return Encoding.ASCII.GetString(AsSpan());
+    }
+
+    private int IndexToBufferIndex(int i) => (start + i) % buffer.Length;
+}
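
Usage sketch (not part of the diff above): a minimal, hypothetical example of how the new CircularByteBuffer can be driven to spot an "endstream" keyword in a raw byte sequence, in the same spirit as the scanner change. The EndstreamScanSketch class and FindEndstream method are illustrative names only and do not exist in this change; only CircularByteBuffer and its Add/EndsWith members come from the diff itself.

using UglyToad.PdfPig.Util;

internal static class EndstreamScanSketch
{
    // Hypothetical helper: returns the offset of the first byte of "endstream",
    // or -1 if the keyword never appears in the data.
    public static long FindEndstream(ReadOnlySpan<byte> data)
    {
        // Keep a rolling window of the last 9 bytes seen ("endstream".Length).
        var window = new CircularByteBuffer(9);

        for (var i = 0; i < data.Length; i++)
        {
            window.Add(data[i]);

            // EndsWith compares the tail of the window against the given ASCII string.
            if (window.EndsWith("endstream"))
            {
                // The keyword ends at index i, so it starts 8 bytes earlier.
                return i - 8;
            }
        }

        return -1;
    }
}

The scanner change itself is more selective than this sketch: it only starts tracking once it sees an 'e' byte, normalizes any whitespace to a single space, and matches "endstream " or "endobj " including the trailing separator, so both keywords can be recorded during a single pass over the stream data.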