diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
index 32040fac..303bfe16 100644
--- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
+++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
@@ -43,6 +43,14 @@
private bool hasBytePreRead;
private bool isInInlineImage;
+ ///
+ /// '%' only identifies comments outside of PDF streams and strings; inside a stream (this flag set) the '%' and the remainder of its line are skipped.
+ ///
+ ///
+ /// PDFBox skips all of a line following a comment character inside streams, see:
+ /// https://github.com/apache/pdfbox/blob/0e1c42dace1c3a2631d5309f662de5628b80fda6/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java#L1319
+ ///
+ private readonly bool isStream;
///
/// Create a new from the input.
@@ -52,7 +60,8 @@
bool usePdfDocEncoding,
ScannerScope scope = ScannerScope.None,
IReadOnlyDictionary> namedDictionaryRequiredKeys = null,
- bool useLenientParsing = false)
+ bool useLenientParsing = false,
+ bool isStream = false)
{
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
this.usePdfDocEncoding = usePdfDocEncoding;
@@ -62,6 +71,7 @@
this.scope = scope;
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
this.useLenientParsing = useLenientParsing;
+ this.isStream = isStream;
}
///
@@ -94,6 +104,7 @@
{
var endAngleBracesRead = 0;
+ bool isSkippingLine = false;
bool isSkippingSymbol = false;
while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext())
{
@@ -101,6 +112,17 @@
var currentByte = inputBytes.CurrentByte;
var c = (char) currentByte;
+ if (isSkippingLine)
+ {
+ if (ReadHelper.IsEndOfLine(c))
+ {
+ isSkippingLine = false;
+ continue;
+ }
+
+ continue;
+ }
+
ITokenizer tokenizer = null;
foreach (var customTokenizer in customTokenizers)
{
@@ -119,6 +141,12 @@
continue;
}
+ if (currentByte == (byte)'%' && isStream)
+ {
+ isSkippingLine = true;
+ continue;
+ }
+
// If we failed to read the symbol for whatever reason we pass over it.
if (isSkippingSymbol && c != '>')
{
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
index 16f05707..4bfd802d 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -846,7 +846,11 @@
// Read the N integers
var bytes = new MemoryInputBytes(stream.Decode(filterProvider, this));
- var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
+ var scanner = new CoreTokenScanner(
+ bytes,
+ true,
+ useLenientParsing: parsingOptions.UseLenientParsing,
+ isStream: true);
var objects = new List<(long, long)>();