diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs index 32040fac..303bfe16 100644 --- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs +++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs @@ -43,6 +43,14 @@ private bool hasBytePreRead; private bool isInInlineImage; + /// + /// '%' only identifies comments outside of PDF streams and strings. When this flag is set, the scanner discards a '%' together with the remainder of its line (up to and including the first end-of-line character). + /// + /// + /// PDFBox skips all of a line following a comment character inside streams, see: + /// https://github.com/apache/pdfbox/blob/0e1c42dace1c3a2631d5309f662de5628b80fda6/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java#L1319 + /// + private readonly bool isStream; /// /// Create a new from the input. @@ -52,7 +60,8 @@ bool usePdfDocEncoding, ScannerScope scope = ScannerScope.None, IReadOnlyDictionary> namedDictionaryRequiredKeys = null, - bool useLenientParsing = false) + bool useLenientParsing = false, + bool isStream = false) { this.inputBytes = inputBytes ??
throw new ArgumentNullException(nameof(inputBytes)); this.usePdfDocEncoding = usePdfDocEncoding; @@ -62,6 +71,7 @@ this.scope = scope; this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys; this.useLenientParsing = useLenientParsing; + this.isStream = isStream; } /// @@ -94,6 +104,7 @@ { var endAngleBracesRead = 0; + bool isSkippingLine = false; bool isSkippingSymbol = false; while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext()) { @@ -101,6 +112,17 @@ var currentByte = inputBytes.CurrentByte; var c = (char) currentByte; + if (isSkippingLine) + { + if (ReadHelper.IsEndOfLine(c)) + { + isSkippingLine = false; + continue; + } + + continue; + } + ITokenizer tokenizer = null; foreach (var customTokenizer in customTokenizers) { @@ -119,6 +141,12 @@ continue; } + if (currentByte == (byte)'%' && isStream) + { + isSkippingLine = true; + continue; + } + // If we failed to read the symbol for whatever reason we pass over it. if (isSkippingSymbol && c != '>') { diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index 16f05707..4bfd802d 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -846,7 +846,11 @@ // Read the N integers var bytes = new MemoryInputBytes(stream.Decode(filterProvider, this)); - var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing); + var scanner = new CoreTokenScanner( + bytes, + true, + useLenientParsing: parsingOptions.UseLenientParsing, + isStream: true); var objects = new List<(long, long)>();