diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
index 7fb9d8db..1e867d71 100644
--- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
@@ -528,12 +528,207 @@ endobj";
             Assert.NotNull(dictionaryToken);
         }
 
-        private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null)
+        // Object 1 has no 'endobj': the header of object 2 follows its dictionary directly, yet two objects must be read.
+        [Fact]
+        public void ReadsDictionaryWithoutEndObjBeforeNextObject()
+        {
+            const string input = @"1 0 obj
+<>
+2 0 obj
+<>
+endobj";
+
+            var scanner = GetScanner(input);
+
+            var tokens = ReadToEnd(scanner);
+
+            Assert.Equal(2, tokens.Count);
+
+            var dictionaryToken = Assert.IsType(tokens[0].Data);
+            var typeValue = dictionaryToken.Data["Type"];
+            Assert.IsType(typeValue);
+
+            dictionaryToken = tokens[1].Data as DictionaryToken;
+            Assert.NotNull(dictionaryToken);
+            typeValue = dictionaryToken.Data["Length"];
+            Assert.IsType(typeValue);
+        }
+
+        // Object 1 ends with 'endstream' but has no 'endobj' before object 2 starts; two objects must still be read.
+        [Fact]
+        public void ReadsStreamWithoutEndObjBeforeNextObject()
+        {
+            const string input = @"1 0 obj
+<>
+stream
+aaaa
+endstream
+2 0 obj
+<>
+endobj";
+
+            var scanner = GetScanner(input);
+
+            var tokens = ReadToEnd(scanner);
+
+            Assert.Equal(2, tokens.Count);
+
+            Assert.IsType(tokens[0].Data);
+
+            var dictionaryToken = Assert.IsType(tokens[1].Data);
+            var typeValue = dictionaryToken.Data["Length"];
+            Assert.IsType(typeValue);
+        }
+
+        // 'endstream' is followed directly by 'xref' or 'startxref' (no 'endobj'); exactly one object must be read.
+        [Theory]
+        [InlineData("startxref")]
+        [InlineData("xref")]
+        public void ReadsStreamWithoutEndObjBeforeToken(string token)
+        {
+            string input = @$"1 0 obj
+<>
+stream
+aaaa
+endstream
+{token}";
+
+            var scanner = GetScanner(input);
+
+            var tokens = ReadToEnd(scanner);
+
+            Assert.Single(tokens);
+
+            Assert.IsType(tokens[0].Data);
+        }
+
+        // The dictionary is followed directly by 'xref' or 'startxref' (no 'endobj'); exactly one object must be read.
+        [Theory]
+        [InlineData("startxref")]
+        [InlineData("xref")]
+        public void ReadsDictionaryWithoutEndObjBeforeToken(string token)
+        {
+            string input = @$"1 0 obj
+<>
+{token}";
+
+            var scanner = GetScanner(input);
+
+            var tokens = ReadToEnd(scanner);
+
+            Assert.Single(tokens);
+
+            var dictionaryToken = Assert.IsType(tokens[0].Data);
+            var typeValue = dictionaryToken.Data["Type"];
+            Assert.IsType(typeValue);
+        }
+
+        // Object 1's stream data runs straight into 'endobj' with no 'endstream'; two objects must still be read.
+        [Fact]
+        public void ReadsStreamWithoutEndStreamBeforeEndObj()
+        {
+            const string input = @"1 0 obj
+<>
+stream
+aaaa
+endobj
+2 0 obj
+<>
+endobj";
+
+            var scanner = GetScanner(input);
+
+            var tokens = ReadToEnd(scanner);
+
+            Assert.Equal(2, tokens.Count);
+
+            Assert.IsType(tokens[0].Data);
+
+            var dictionaryToken = Assert.IsType(tokens[1].Data);
+            var lengthValue = dictionaryToken.Data["Length"];
+            Assert.IsType(lengthValue);
+        }
+
+        // Extra content is written directly in front of 'endobj': strict parsing reads nothing, lenient parsing reads both objects.
+        [Theory]
+        [InlineData(">>")]
+        [InlineData("randomstring")]
+        public void ReadsIndirectObjectsDictionaryWithContentBeforeEndObj(string addedContent)
+        {
+            string input = @$"1 0 obj
+<>
+{addedContent}endobj
+2 0 obj
+<>
+endobj";
+
+            var strictScanner = GetScanner(input);
+
+            var tokens = ReadToEnd(strictScanner);
+            Assert.Empty(tokens);
+
+
+            var lenientScanner = GetScanner(input, useLenientParsing: true);
+            tokens = ReadToEnd(lenientScanner);
+
+            Assert.Equal(2, tokens.Count);
+
+            var dictionaryToken = Assert.IsType(tokens[0].Data);
+            var typeValue = dictionaryToken.Data["Type"];
+            Assert.IsType(typeValue);
+
+            dictionaryToken = Assert.IsType(tokens[1].Data);
+            var lengthValue = dictionaryToken.Data["Length"];
+            Assert.IsType(lengthValue);
+        }
+
+        // Extra content is written directly in front of 'stream': both modes read two objects, but strict parsing reports 'endstream' as object 1's data.
+        [Theory]
+        [InlineData(">>")]
+        [InlineData("randomstring")]
+        public void ReadsIndirectObjectsStreamWithAddedContentBeforeStream(string addedContent)
+        {
+            string input = @$"1 0 obj
+<>
+{addedContent}stream
+aaaa
+endstream
+endobj
+2 0 obj
+<>
+endobj";
+
+            var strictScanner = GetScanner(input);
+
+            var tokens = ReadToEnd(strictScanner);
+            Assert.Equal(2, tokens.Count);
+            // this is linked to the parsing choosing the last token parsed in obj.
+            // It can probably be challenged against taking the first one.
+            var operatorToken = Assert.IsType(tokens[0].Data);
+            Assert.Equal("endstream", operatorToken.Data);
+
+            var dictionaryToken = Assert.IsType(tokens[1].Data);
+            var lengthValue = dictionaryToken.Data["Length"];
+            Assert.IsType(lengthValue);
+
+            var lenientScanner = GetScanner(input, useLenientParsing: true);
+            tokens = ReadToEnd(lenientScanner);
+
+            Assert.Equal(2, tokens.Count);
+
+            Assert.IsType(tokens[0].Data);
+
+            dictionaryToken = Assert.IsType(tokens[1].Data);
+            lengthValue = dictionaryToken.Data["Length"];
+            Assert.IsType(lengthValue);
+        }
+
+        private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null, bool useLenientParsing = false)
         {
             var input = StringBytesTestConverter.Convert(s, false);
 
             return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
-                new TestFilterProvider(), NoOpEncryptionHandler.Instance, ParsingOptions.LenientParsingOff);
+                new TestFilterProvider(), NoOpEncryptionHandler.Instance, useLenientParsing ? new ParsingOptions() : ParsingOptions.LenientParsingOff);
         }
 
         private static IReadOnlyList ReadToEnd(PdfTokenScanner scanner)
diff --git a/src/UglyToad.PdfPig.Tokens/OperatorToken.cs b/src/UglyToad.PdfPig.Tokens/OperatorToken.cs
index 35d62c88..ce664915 100644
--- a/src/UglyToad.PdfPig.Tokens/OperatorToken.cs
+++ b/src/UglyToad.PdfPig.Tokens/OperatorToken.cs
@@ -117,6 +117,11 @@
         ///
         public static readonly OperatorToken Xref = new OperatorToken("xref");
 
+        ///
+        /// Cross reference section offset.
+        ///
+        public static readonly OperatorToken StartXref = new OperatorToken("startxref");
+
         ///
         public string Data { get; }
 
@@ -163,6 +168,7 @@
                 "Tf" => Tf,
                 "W*" => WStar,
                 "xref" => Xref,
+                "startxref" => StartXref,
                 _ => new OperatorToken(data.ToString())
             };
         }
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
index af8db945..929dbbd9 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -135,7 +135,7 @@
             var readStream = false;
 
             // Read all tokens between obj and endobj.
-            while (coreTokenScanner.MoveNext() && !Equals(coreTokenScanner.CurrentToken, OperatorToken.EndObject))
+            while (coreTokenScanner.MoveNext() && !IsToken(coreTokenScanner, OperatorToken.EndObject, out _))
             {
                 if (coreTokenScanner.CurrentToken is CommentToken)
                 {
@@ -171,7 +171,35 @@
                     return false;
                 }
 
-                if (ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.StartStream))
+                if (IsToken(coreTokenScanner, OperatorToken.Xref, out _) || IsToken(coreTokenScanner, OperatorToken.StartXref, out _))
+                {
+                    if (readStream && readTokens[0] is StreamToken streamRead)
+                    {
+                        readTokens.Clear();
+                        readTokens.Add(streamRead);
+                        coreTokenScanner.Seek(previousTokenPositions[2]);
+                        break;
+                    }
+
+                    if (readTokens.Count == 1)
+                    {
+                        // 'xref'/'startxref' was encountered after exactly one token of this object was read: 'endobj' is missing, so emit the object from that token.
+                        var actualReference = new IndirectReference(objectNumber.Int, generation.Int);
+                        var actualToken = encryptionHandler.Decrypt(actualReference, readTokens[0]);
+
+                        CurrentToken = new ObjectToken(startPosition, actualReference, actualToken);
+                        readTokens.Clear();
+                        coreTokenScanner.Seek(previousTokenPositions[2]);
+
+                        return true;
+                    }
+
+                    // This should never happen.
+                    Debug.Assert(false, "Encountered an 'xref' or 'startxref' operator before the end of the previous object.");
+                    return false;
+                }
+
+                if (IsToken(coreTokenScanner, OperatorToken.StartStream, out var actualStartStreamPosition))
                 {
                     var streamIdentifier = new IndirectReference(objectNumber.Long, generation.Int);
 
@@ -185,7 +213,7 @@
                     callingObject = streamIdentifier;
 
                     // Read stream: special case.
-                    if (TryReadStream(coreTokenScanner.CurrentTokenStart, getLengthFromFile, out var stream))
+                    if (TryReadStream(actualStartStreamPosition.Value, getLengthFromFile, out var stream))
                     {
                         readTokens.Clear();
                         readTokens.Add(stream);
@@ -212,7 +240,7 @@
                 previousTokenPositions[2] = coreTokenScanner.CurrentTokenStart;
             }
 
-            if (!readStream && !ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.EndObject))
+            if (!readStream && !IsToken(coreTokenScanner, OperatorToken.EndObject, out _))
             {
                 readTokens.Clear();
                 return false;
@@ -263,6 +291,39 @@
             return true;
         }
 
+        /// <summary>
+        /// Checks whether the scanner's current token is the expected operator.
+        /// </summary>
+        /// <remarks>
+        /// A strict match requires the current token to be the very same instance as <paramref name="token"/>.
+        /// With lenient parsing enabled, an operator whose text only ends with the expected operator text also
+        /// matches (for example extra characters written directly in front of <c>endobj</c>); the reported start
+        /// offset then skips those leading extra characters.
+        /// </remarks>
+        /// <param name="scanner">The scanner whose current token is inspected.</param>
+        /// <param name="token">The operator to look for.</param>
+        /// <param name="actualTokenStart">Offset where the expected operator text starts, or <c>null</c> when there is no match.</param>
+        /// <returns><c>true</c> if the current token matches <paramref name="token"/>; otherwise <c>false</c>.</returns>
+        private bool IsToken(CoreTokenScanner scanner, OperatorToken token, [NotNullWhen(true)] out long? actualTokenStart)
+        {
+            if (ReferenceEquals(scanner.CurrentToken, token))
+            {
+                actualTokenStart = scanner.CurrentTokenStart;
+                return true;
+            }
+
+            // Operator text is matched as exact keyword characters, so compare ordinally: the default
+            // EndsWith(string) overload is culture-sensitive and its result can vary with the current culture.
+            if (parsingOptions.UseLenientParsing && scanner.CurrentToken is OperatorToken opToken && opToken.Data.EndsWith(token.Data, System.StringComparison.Ordinal))
+            {
+                actualTokenStart = scanner.CurrentTokenStart + opToken.Data.Length - token.Data.Length;
+                return true;
+            }
+
+            actualTokenStart = null;
+            return false;
+        }
+
         private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNullWhen(true)] out StreamToken? stream)
         {
             stream = null;