Support streams whose stream operator is followed by only a carriage return. Handle comments in arrays and dictionaries.

* While the PDF specification says that stream data should begin after a newline following the stream operator, some files have only a carriage return following the stream operator. * Since comment tokens may appear inside an array or dictionary, we ignore them when they occur there, because they would otherwise break interpretation of the dictionary or array contents.
2025-09-22 12:09:50 +08:00 · 2019-12-20 14:04:58 +00:00
parent 3e6fa4b694
commit 3084a9aab6
3 changed files with 19 additions and 1 deletions
--- a/src/UglyToad.PdfPig/Tokenization/ArrayTokenizer.cs
+++ b/src/UglyToad.PdfPig/Tokenization/ArrayTokenizer.cs
@@ -26,6 +26,11 @@
            while (!CurrentByteEndsCurrentArray(inputBytes, previousToken) && scanner.MoveNext())
            {
                previousToken = scanner.CurrentToken;
                if (scanner.CurrentToken is CommentToken)
                {
                    continue;
                }
                contents.Add(scanner.CurrentToken);
            }
--- a/src/UglyToad.PdfPig/Tokenization/DictionaryTokenizer.cs
+++ b/src/UglyToad.PdfPig/Tokenization/DictionaryTokenizer.cs
@@ -48,6 +48,11 @@
            while (coreScanner.MoveNext())
            {
                if (coreScanner.CurrentToken is CommentToken)
                {
                    continue;
                }
                tokens.Add(coreScanner.CurrentToken);
            }
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -229,14 +229,22 @@
                return false;
            }
            // While the specification demands a \n we have seen files with \r only in the wild.
            var hadWhiteSpace = false;
            if (inputBytes.CurrentByte == '\r')
            {
                hadWhiteSpace = true;
                inputBytes.MoveNext();
            }
            if (inputBytes.CurrentByte != '\n')
            {
-                return false;
+                if (!hadWhiteSpace)
                {
                    return false;
                }
                inputBytes.Seek(inputBytes.CurrentOffset - 1);
            }
            // Store where we started reading the first byte of data.