From 0586713da3fdefbbdb260b31cd961371408aa244 Mon Sep 17 00:00:00 2001
From: EliotJones <elioty@hotmail.co.uk>
Date: Sat, 5 Jul 2025 15:46:08 -0500
Subject: [PATCH] skip comments in pdf object streams #926

the file provided in issue #926 contains the following syntax
in pdf object streams:

```
% 750 0 obj
<< >>
```

currently we read the comment token and skip the rest;
however, this producer is writing nonsense to the stream.
comment tokens are only valid outside streams in pdf files,
so we align with the behavior of pdfbox here by skipping the
entire line containing a comment inside a stream, which fixes
parsing this file.
---
 .../Scanner/CoreTokenScanner.cs               | 30 ++++++++++++++++++-
 .../Tokenization/Scanner/PdfTokenScanner.cs   |  6 +++-
 2 files changed, 34 insertions(+), 2 deletions(-)
diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
index 32040fac..303bfe16 100644
--- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
+++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
@@ -43,6 +43,14 @@
 
         private bool hasBytePreRead;
         private bool isInInlineImage;
+        /// <summary>
+        /// '%' only identifies comments outside of PDF streams and strings; inside these we should ignore it.
+        /// </summary>
+        /// <remarks>
+        /// PDFBox skips all of a line following a comment character inside streams, see:
+        /// https://github.com/apache/pdfbox/blob/0e1c42dace1c3a2631d5309f662de5628b80fda6/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java#L1319
+        /// </remarks>
+        private readonly bool isStream;
 
         /// <summary>
         /// Create a new <see cref="CoreTokenScanner"/> from the input.
@@ -52,7 +60,8 @@
             bool usePdfDocEncoding,
             ScannerScope scope = ScannerScope.None,
             IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null,
-            bool useLenientParsing = false)
+            bool useLenientParsing = false,
+            bool isStream = false)
         {
             this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
             this.usePdfDocEncoding = usePdfDocEncoding;
@@ -62,6 +71,7 @@
             this.scope = scope;
             this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
             this.useLenientParsing = useLenientParsing;
+            this.isStream = isStream;
         }
 
         /// <inheritdoc />
@@ -94,6 +104,7 @@
         {
             var endAngleBracesRead = 0;
 
+            bool isSkippingLine = false;
             bool isSkippingSymbol = false;
             while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext())
             {
@@ -101,6 +112,17 @@
                 var currentByte = inputBytes.CurrentByte;
                 var c = (char) currentByte;
 
+                if (isSkippingLine)
+                {
+                    if (ReadHelper.IsEndOfLine(c))
+                    {
+                        isSkippingLine = false;
+                        continue;
+                    }
+
+                    continue;
+                }
+
                 ITokenizer tokenizer = null;
                 foreach (var customTokenizer in customTokenizers)
                 {
@@ -119,6 +141,12 @@
                         continue;
                     }
 
+                    if (currentByte == (byte)'%' && isStream)
+                    {
+                        isSkippingLine = true;
+                        continue;
+                    }
+
                     // If we failed to read the symbol for whatever reason we pass over it.
                     if (isSkippingSymbol && c != '>')
                     {
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
index 16f05707..4bfd802d 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -846,7 +846,11 @@
             // Read the N integers
             var bytes = new MemoryInputBytes(stream.Decode(filterProvider, this));
 
-            var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
+            var scanner = new CoreTokenScanner(
+                bytes,
+                true,
+                useLenientParsing: parsingOptions.UseLenientParsing,
+                isStream: true);
 
             var objects = new List<(long, long)>();