From 3084a9aab6a18d6cf98ef5f14400cf9fab813011 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Fri, 20 Dec 2019 14:04:58 +0000 Subject: [PATCH] Support streams containing only carriage returns. Handle comments in arrays and dictionaries * While the PDF specification says stream data should begin after a newline following the stream operator, some files have only a carriage return following the stream operator. * Since comment tokens may appear inside an array or dictionary, we ignore them if they occur there because they would otherwise break interpretation of the dictionary or array contents. --- src/UglyToad.PdfPig/Tokenization/ArrayTokenizer.cs | 5 +++++ .../Tokenization/DictionaryTokenizer.cs | 5 +++++ .../Tokenization/Scanner/PdfTokenScanner.cs | 10 +++++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig/Tokenization/ArrayTokenizer.cs b/src/UglyToad.PdfPig/Tokenization/ArrayTokenizer.cs index 31e8436a..b07fccce 100644 --- a/src/UglyToad.PdfPig/Tokenization/ArrayTokenizer.cs +++ b/src/UglyToad.PdfPig/Tokenization/ArrayTokenizer.cs @@ -26,6 +26,11 @@ while (!CurrentByteEndsCurrentArray(inputBytes, previousToken) && scanner.MoveNext()) { previousToken = scanner.CurrentToken; + + if (scanner.CurrentToken is CommentToken) + { + continue; + } contents.Add(scanner.CurrentToken); } diff --git a/src/UglyToad.PdfPig/Tokenization/DictionaryTokenizer.cs b/src/UglyToad.PdfPig/Tokenization/DictionaryTokenizer.cs index 68498ec9..b64b855f 100644 --- a/src/UglyToad.PdfPig/Tokenization/DictionaryTokenizer.cs +++ b/src/UglyToad.PdfPig/Tokenization/DictionaryTokenizer.cs @@ -48,6 +48,11 @@ while (coreScanner.MoveNext()) { + if (coreScanner.CurrentToken is CommentToken) + { + continue; + } + tokens.Add(coreScanner.CurrentToken); } diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index 28e8f575..06bce197 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ 
b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -229,14 +229,22 @@ return false; } + // While the specification demands a \n we have seen files with \r only in the wild. + var hadWhiteSpace = false; if (inputBytes.CurrentByte == '\r') { + hadWhiteSpace = true; inputBytes.MoveNext(); } if (inputBytes.CurrentByte != '\n') { - return false; + if (!hadWhiteSpace) + { + return false; + } + + inputBytes.Seek(inputBytes.CurrentOffset - 1); } // Store where we started reading the first byte of data.