fix for issue #670 with double endstream

When a stream contains two endstream declarations with no data between them, the first declared endstream should be obeyed.
2025-10-08 00:14:35 +08:00 · 2025-07-06 17:09:59 -05:00
parent daaac9350d
commit 7134032188
3 changed files with 103 additions and 66 deletions
--- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
@@ -406,6 +406,52 @@ endobj";
            Assert.Equal(7, token.Number.ObjectNumber);
        }

+        [Fact]
+        public void ReadsStreamWithDoubleEndstreamSimple()
+        {
+            const string s =
+                """
+                250 0 obj
+                << /Filter /FlateDecode >>
+                stream
+                012endstream
+                endstream
+                endobj
+                """;
+
+            var scanner = GetScanner(s);
+
+            var tokens = ReadToEnd(scanner);
+        }
+
+        [Fact]
+        public void ReadsStreamWithDoubleEndstream()
+        {
+            const string s =
+                """
+                1974 0 obj
+                <<
+                /Filter /FlateDecode
+                /Length 1975 0 R
+                >>
+                stream
+                xœ]ÔÏnÚ@€ñ'ð;øØ"Œg	!Ué…Cÿ¨´ ö:B*Æ2äÀÛw¿™MZõ'þ°½ë›]<ï>ïÆÓ^|Ÿ/Ý>Ýêá4ösº^^ç.ÕÇôr«e[÷§îVÎüØSµÈ7ïï×[:ïÆáRm6ÕâGþðz›ïõ‡Oýå˜>V‹osŸæÓøRøõ¼Ïçû×iúÎi¼ÕMµÝÖ}òƒ¾¦¯‡sª~ÛÃ®ÏŸŸn÷‡|Ïß+~Þ§T·~¾ŒÉt—>]§C—æÃø’ªM»ÜÖ›U³ÒØÿ÷ÙJã–ãðïµ~†&msh	Y„ –K‚4BK0‚yÈ¿rXVzÂš°Žà}$<zÐðDxò`þÐáAGÂ1‚:BÏða{B{$$BŠ°&
+                „!ÂSÒä¿ýCC€BÂ£e…PHx´x-Ã
+                R<˜º@!á!>,âW@!á!¼œ@!áÑ2uBÂC=@!á¡þP(¤xðU
+                R< (¤xø°PHx(SW(4<”—S(4<´#@¡á¡ÌT¡Ð²><@¡á¡Œ¢PhxSW(4<”õ¡Phxè‘ …†Ç’£PhY|Q
+                …†GëÃB¡e}à¡Phx˜¿
+                †‡B¡áÑú°Phx´ÆÔ
+                +,ƒÂÂ#/× °²>3(¬xð.……‡¡nPXx˜_……‡ùC¡°²>x}ƒÂÂCx9ƒÂŠ¯oPXxˆ…š&ùPø!ÙÚ¯€ÂŠÿ•……‡ ¶jbky
+                y‡yÛJØlØßw±îužóæ›¦ï\ìY§1½ï«Óeâ.ÿùz°gAendstream
+                endstream
+                endobj
+                """;
+
+            var scanner = GetScanner(s);
+
+            var tokens = ReadToEnd(scanner);
+        }
+
        [Fact]
        public void ReadsStringsWithMissingEndBracket()
        {
@@ -661,7 +707,6 @@ endobj";
            var tokens = ReadToEnd(strictScanner);
            Assert.Empty(tokens);

-
            var lenientScanner = GetScanner(input, useLenientParsing: true);
            tokens = ReadToEnd(lenientScanner);

--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -377,17 +377,11 @@

            long streamDataStart = inputBytes.CurrentOffset;

-            PossibleStreamEndLocation? possibleEndLocation = null;
-
-
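+            // isEndData is true while only whitespace or further end markers have been read since the last 'endstream'/'endobj', i.e. the next marker has no data preceding it.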
+            bool isEndData = false;
+            Stack<EndLoc> endLocations = new Stack<EndLoc>();
            while (inputBytes.MoveNext())
            {
-                if (length.HasValue && read == length)
-                {
-                    // TODO: read ahead and check we're at the end...
-                    // break;
-                }
-
                // We are reading 'end' (possibly).
                if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
                {
@@ -401,12 +395,14 @@
                        endObjPosition = 0;
                        endStreamPosition++;

-                        // We've finished reading 'endstream', add it to the end tokens we've seen.
-                        if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
+                        if (endStreamPosition == streamPart.Length)
                        {
-                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
-
-                            possibleEndLocation = token;
+                            // Token is at end of stream or is followed by whitespace
+                            if (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte))
+                            {
+                                var location = inputBytes.CurrentOffset - EndstreamBytes.Length;
+                                endLocations.Push(new EndLoc(true, location, !isEndData));
+                                isEndData = true;

                                if (length.HasValue && read > length)
                                {
@@ -414,31 +410,33 @@
                                }

                                endStreamPosition = 0;
+                                commonPartPosition = 0;
+                            }
                        }
                    }
                    else if (inputBytes.CurrentByte == objPart[endObjPosition])
                    {
                        // We are reading 'obj' after 'end'
-
                        endStreamPosition = 0;
                        endObjPosition++;

                        // We have finished reading 'endobj'.
                        if (endObjPosition == objPart.Length)
                        {
+                            var hasPreviousEndToken = endLocations.Count > 0;
                            // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
-                            if (possibleEndLocation != null)
+                            if (hasPreviousEndToken)
                            {
-                                var lastEndToken = possibleEndLocation.Value;
+                                var lastEndTokenLocation = endLocations.Peek();

-                                inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
+                                var correction = lastEndTokenLocation.IsEndStream ? EndstreamBytes.Length : "endobj".Length;
+                                inputBytes.Seek(lastEndTokenLocation.Offset + correction + 1);

                                break;
                            }

-                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
-
-                            possibleEndLocation = token;
+                            endLocations.Push(new EndLoc(false, inputBytes.CurrentOffset, !isEndData));
+                            isEndData = true;

                            if (read > length)
                            {
@@ -454,6 +452,7 @@
                        endStreamPosition = 0;
                        endObjPosition = 0;
                        commonPartPosition = 0;
+                        isEndData = false;
                    }
                }
                else
@@ -463,6 +462,11 @@
                    endStreamPosition = 0;
                    endObjPosition = 0;
                    commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
+
+                    if (commonPartPosition == 0 && !ReadHelper.IsWhitespace(inputBytes.CurrentByte))
+                    {
+                        isEndData = false;
+                    }
                }

                read++;
@@ -470,15 +474,22 @@

            long streamDataEnd = inputBytes.CurrentOffset + 1;

-            if (possibleEndLocation == null)
+            if (endLocations.Count == 0)
+            {
                return false;
+            }

-            var lastEnd = possibleEndLocation;
+            // Pop back through the recorded end markers to the most recent 'endstream'/'endobj' that was preceded by data; if none was, the earliest recorded marker is used.
+            EndLoc endLoc;
+            do
+            {
+                endLoc = endLocations.Pop();
+            } while (!endLoc.HasDataPreceding && endLocations.Count > 0);

-            var dataLength = lastEnd.Value.Offset - startDataOffset;
+            var dataLength = endLoc.Offset - startDataOffset;

            // 3 characters, 'e', '\n' and possibly '\r'
-            inputBytes.Seek(lastEnd.Value.Offset - 3);
+            inputBytes.Seek(endLoc.Offset - 3);
            inputBytes.MoveNext();

            if (inputBytes.CurrentByte == '\r')
@@ -902,5 +913,21 @@
            inputBytes?.Dispose();
            isDisposed = true;
        }
+
+        private record EndLoc
+        {
+            public bool IsEndStream { get; }
+
+            public long Offset { get; }
+
+            public bool HasDataPreceding { get; }
+
+            public EndLoc(bool isEndStream, long offset, bool hasDataPreceding)
+            {
+                IsEndStream = isEndStream;
+                Offset = offset;
+                HasDataPreceding = hasDataPreceding;
+            }
+        }
    }
 }
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PossibleStreamEndLocation.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PossibleStreamEndLocation.cs
@@ -1,35 +0,0 @@
-namespace UglyToad.PdfPig.Tokenization.Scanner
-{
-    using System;
-    using Tokens;
-
-    /// <summary>
-    /// Used internally by the <see cref="PdfTokenScanner"/> when reading streams to store any occurrences of 'endobj' or 'endstream' observed.
-    /// </summary>
-    internal readonly struct PossibleStreamEndLocation
-    {
-        /// <summary>
-        /// The offset at which the token started in the file.
-        /// </summary>
-        public long Offset { get; }
-
-        /// <summary>
-        /// The type, one of either <see cref="OperatorToken.EndObject"/> or <see cref="OperatorToken.EndStream"/>.
-        /// </summary>
-        public OperatorToken Type { get; }
-
-        /// <summary>
-        /// Create a new <see cref="PossibleStreamEndLocation"/>
-        /// </summary>
-        public PossibleStreamEndLocation(long offset, OperatorToken type)
-        {
-            Offset = offset;
-            Type = type ?? throw new ArgumentNullException(nameof(type));
-        }
-
-        public override string ToString()
-        {
-            return $"{Offset}: {Type}";
-        }
-    }
-}