diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
index 1e867d71..a9bf2a2c 100644
--- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
@@ -406,6 +406,52 @@ endobj";
             Assert.Equal(7, token.Number.ObjectNumber);
         }
 
+        [Fact]
+        public void ReadsStreamWithDoubleEndstreamSimple()
+        {
+            const string s =
+                """
+                250 0 obj
+                << /Filter /FlateDecode >>
+                stream
+                012endstream
+                endstream
+                endobj
+                """;
+
+            var scanner = GetScanner(s);
+
+            var tokens = ReadToEnd(scanner);
+        }
+
+        [Fact]
+        public void ReadsStreamWithDoubleEndstream()
+        {
+            const string s =
+                """
+                1974 0 obj
+                <<
+                /Filter /FlateDecode
+                /Length 1975 0 R
+                >>
+                stream
+                xœ]ÔÏnÚ@€ñ'ð;øØ"Œg !Ué…Cÿ¨´ ö:B*Æ2äÀÛw¿™MZõ'þ°½ë›]<ï>ïÆÓ­^|Ÿ/Ý>Ýêá4ösº^^ç.ÕÇôr«e[÷§îVÎü؝SµÈ7ïï×[:ïÆáRm6ÕâGþðz›ïõ‡Oýå˜>V‹osŸæÓøRøõ¼Ïçû×iúÎi¼ÕMµÝÖ}òƒ¾¦¯‡sª~ÛîϟŸn÷‡|Ïß+~Þ§T·~¾ŒÉt—>]§C—æÃø’ªM»ÜÖ›U³­ÒØÿ÷ÙJã–ãðïµ~†&msh ­Y„ –K‚4BK0‚yÈ¿rXVzš°Žà}$,âW@!á!¼œ@!áÑ2uBÂC=@!á¡þP(¤xðU
+                R< (¤xø°PHx(SW(4<”—S(4<´#@¡á¡ÌT¡Ð²><@¡á¡Œ¢PhxSW(4<”õ¡Phxè‘ …†Ç’£PhY|Q
+                …†GëÃB¡e}à¡Phx˜¿
+                †‡B¡áÑú°Phx´ÆÔ
+                +,ƒÂÂ#/× °²>3(¬xð.……‡¡nPXx˜_……‡ùC¡°²>x}ƒÂÂCx9ƒÂНoPXxˆ…š&ùPø!ÙÚ¯€ÂŠÿ•……‡ ¶jbky
+                y‡yÛJØlØßw±îužó曦ï\ìY§1½ï«Óeâ.ÿùz°gAendstream
+                endstream
+                endobj
+                """;
+
+            var scanner = GetScanner(s);
+
+            var tokens = ReadToEnd(scanner);
+        }
+
         [Fact]
         public void ReadsStringsWithMissingEndBracket()
         {
@@ -661,7 +707,6 @@ endobj";
             var tokens = ReadToEnd(strictScanner);
 
             Assert.Empty(tokens);
-
             var lenientScanner = GetScanner(input, useLenientParsing: true);
 
             tokens = ReadToEnd(lenientScanner);
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
index 4bfd802d..b876bbfd 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -377,17 +377,11 @@
 
             long streamDataStart = inputBytes.CurrentOffset;
 
-            PossibleStreamEndLocation? possibleEndLocation = null;
-
-
+            // Tracks whether the last non-whitespace content read was an end token ('endstream' or 'endobj') rather than stream data.
+            bool isEndData = false;
+            Stack<EndLoc> endLocations = new Stack<EndLoc>();
             while (inputBytes.MoveNext())
             {
-                if (length.HasValue && read == length)
-                {
-                    // TODO: read ahead and check we're at the end...
-                    // break;
-                }
-
                 // We are reading 'end' (possibly).
                 if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
                 {
@@ -401,44 +395,48 @@
 
                         endObjPosition = 0;
                         endStreamPosition++;
 
-                        // We've finished reading 'endstream', add it to the end tokens we've seen.
-                        if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
+                        if (endStreamPosition == streamPart.Length)
                         {
-                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
-
-                            possibleEndLocation = token;
-
-                            if (length.HasValue && read > length)
+                            // Token is at end of stream or is followed by whitespace
+                            if (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte))
                             {
-                                break;
-                            }
+                                var location = inputBytes.CurrentOffset - EndstreamBytes.Length;
+                                endLocations.Push(new EndLoc(true, location, !isEndData));
+                                isEndData = true;
 
-                            endStreamPosition = 0;
+                                if (length.HasValue && read > length)
+                                {
+                                    break;
+                                }
+
+                                endStreamPosition = 0;
+                                commonPartPosition = 0;
+                            }
                         }
                     }
                     else if (inputBytes.CurrentByte == objPart[endObjPosition])
                    {
                         // We are reading 'obj' after 'end'
-                        endStreamPosition = 0;
                         endObjPosition++;
 
                         // We have finished reading 'endobj'.
                         if (endObjPosition == objPart.Length)
                         {
+                            var hasPreviousEndToken = endLocations.Count > 0;
                             // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
-                            if (possibleEndLocation != null)
+                            if (hasPreviousEndToken)
                             {
-                                var lastEndToken = possibleEndLocation.Value;
+                                var lastEndTokenLocation = endLocations.Peek();
 
-                                inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
+                                var correction = lastEndTokenLocation.IsEndStream ? EndstreamBytes.Length : "endobj".Length;
+                                inputBytes.Seek(lastEndTokenLocation.Offset + correction + 1);
 
                                 break;
                             }
 
-                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
-
-                            possibleEndLocation = token;
+                            endLocations.Push(new EndLoc(false, inputBytes.CurrentOffset, !isEndData));
+                            isEndData = true;
 
                             if (read > length)
                             {
@@ -454,6 +452,7 @@
                         endStreamPosition = 0;
                         endObjPosition = 0;
                         commonPartPosition = 0;
+                        isEndData = false;
                     }
                 }
                 else
@@ -463,6 +462,11 @@
                     endStreamPosition = 0;
                     endObjPosition = 0;
                     commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
+
+                    if (commonPartPosition == 0 && !ReadHelper.IsWhitespace(inputBytes.CurrentByte))
+                    {
+                        isEndData = false;
+                    }
                 }
 
                 read++;
@@ -470,15 +474,22 @@
 
             long streamDataEnd = inputBytes.CurrentOffset + 1;
 
-            if (possibleEndLocation == null)
+            if (endLocations.Count == 0)
+            {
                 return false;
+            }
 
-            var lastEnd = possibleEndLocation;
+            // Read until the first endstream or obj token indicator preceded by data.
+            EndLoc endLoc;
+            do
+            {
+                endLoc = endLocations.Pop();
+            } while (!endLoc.HasDataPreceding && endLocations.Count > 0);
 
-            var dataLength = lastEnd.Value.Offset - startDataOffset;
+            var dataLength = endLoc.Offset - startDataOffset;
 
             // 3 characters, 'e', '\n' and possibly '\r'
-            inputBytes.Seek(lastEnd.Value.Offset - 3);
+            inputBytes.Seek(endLoc.Offset - 3);
             inputBytes.MoveNext();
 
             if (inputBytes.CurrentByte == '\r')
@@ -902,5 +913,21 @@
             inputBytes?.Dispose();
             isDisposed = true;
         }
+
+        private record EndLoc
+        {
+            public bool IsEndStream { get; }
+
+            public long Offset { get; }
+
+            public bool HasDataPreceding { get; }
+
+            public EndLoc(bool isEndStream, long offset, bool hasDataPreceding)
+            {
+                IsEndStream = isEndStream;
+                Offset = offset;
+                HasDataPreceding = hasDataPreceding;
+            }
+        }
     }
 }
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PossibleStreamEndLocation.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PossibleStreamEndLocation.cs
deleted file mode 100644
index 6a5bb7c3..00000000
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PossibleStreamEndLocation.cs
+++ /dev/null
@@ -1,35 +0,0 @@
-namespace UglyToad.PdfPig.Tokenization.Scanner
-{
-    using System;
-    using Tokens;
-
-    /// <summary>
-    /// Used internally by the <see cref="PdfTokenScanner"/> when reading streams to store any occurrences of 'endobj' or 'endstream' observed.
-    /// </summary>
-    internal readonly struct PossibleStreamEndLocation
-    {
-        /// <summary>
-        /// The offset at which the token started in the file.
-        /// </summary>
-        public long Offset { get; }
-
-        /// <summary>
-        /// The type, one of either <see cref="OperatorToken.EndObject"/> or <see cref="OperatorToken.EndStream"/>.
-        /// </summary>
-        public OperatorToken Type { get; }
-
-        /// <summary>
-        /// Create a new <see cref="PossibleStreamEndLocation"/>.
-        /// </summary>
-        public PossibleStreamEndLocation(long offset, OperatorToken type)
-        {
-            Offset = offset;
-            Type = type ?? throw new ArgumentNullException(nameof(type));
-        }
-
-        public override string ToString()
-        {
-            return $"{Offset}: {Type}";
-        }
-    }
-}
\ No newline at end of file
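
The idea the patch implements can be stated outside the scanner internals: while reading stream data, every occurrence of 'endstream' or 'endobj' is recorded together with a flag saying whether any non-whitespace data has appeared since the previous end token; once scanning stops, the most recent occurrence that was preceded by data is taken as the real end of the payload. The sketch below is illustrative only: the type and method names are assumptions, it scans a string rather than raw bytes, and it simplifies "preceded by data" to a check on the immediately preceding character instead of the patch's isEndData tracking. It is not PdfPig's API.

using System;
using System.Collections.Generic;
using System.Linq;

internal static class DoubleEndstreamSketch
{
    // Hypothetical illustration type; the patch's EndLoc record plays the same role.
    private readonly record struct EndCandidate(string Keyword, int Offset, bool HasDataPreceding);

    // Returns the offset at which the stream payload ends (the start of the chosen end keyword).
    public static int FindStreamDataEnd(string raw, int dataStart)
    {
        var candidates = new List<EndCandidate>();

        foreach (var keyword in new[] { "endstream", "endobj" })
        {
            for (var i = raw.IndexOf(keyword, dataStart, StringComparison.Ordinal);
                 i >= 0;
                 i = raw.IndexOf(keyword, i + keyword.Length, StringComparison.Ordinal))
            {
                // Simplification: the keyword counts as preceded by data when the character
                // immediately before it is not whitespace.
                var hasDataPreceding = i > dataStart && !char.IsWhiteSpace(raw[i - 1]);
                candidates.Add(new EndCandidate(keyword, i, hasDataPreceding));
            }
        }

        if (candidates.Count == 0)
        {
            throw new InvalidOperationException("No 'endstream' or 'endobj' found after the stream keyword.");
        }

        // Walk backwards from the last candidate, mirroring the stack Pop loop in the patch:
        // the most recent end token preceded by data wins.
        var ordered = candidates.OrderBy(c => c.Offset).ToList();
        for (var i = ordered.Count - 1; i >= 0; i--)
        {
            if (ordered[i].HasDataPreceding)
            {
                return ordered[i].Offset;
            }
        }

        // Nothing was preceded by data; fall back to the earliest candidate,
        // like exhausting the stack in the patch.
        return ordered[0].Offset;
    }
}

For an input shaped like the simple test case ("012endstream" followed by a bare "endstream" and "endobj"), only the first 'endstream' is preceded by data, so the payload ends right after "012", which appears to match how the stack-based resolution in the patch settles that input.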