From f415c3116ec88ad8845d82472d2b981922712fb4 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Wed, 26 Feb 2020 14:03:46 +0000 Subject: [PATCH] if a cross reference offset is inside the xref table, we ignore the error. Previously we checked that the offset was not inside the table (the correct thing to check); however, this is only a special case of the more general issue (cross reference offsets are wrong). We move handling for this into the pdf token scanner: if we attempt to read an object at an offset and it fails, we brute force the entire file to find the correct offsets. We also needed to add handling to make sure we don't attempt to use stream length tokens while we're brute-forcing, since we can't look up indirect references for the length. --- .../CrossReferenceTableParserTests.cs | 7 ++-- .../CrossReferenceTableParser.cs | 11 ++---- .../Tokenization/Scanner/PdfTokenScanner.cs | 37 +++++++++++++------ 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs index 3494cef1..d0e5efb1 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs @@ -227,7 +227,7 @@ trailer } [Fact] - public void EntryPointingAtOffsetInTableThrows() + public void EntryPointingAtOffsetInTableDoesNotThrow() { var input = GetReader(@"xref 0 2 @@ -236,9 +236,10 @@ trailer trailer <<>>"); - Action action = () => parser.Parse(input, 0, false); + var result = parser.Parse(input, 0, false); - Assert.Throws(action); + var offset = Assert.Single(result.ObjectOffsets); + Assert.Equal(10, offset.Value); } [Fact] diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceTableParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceTableParser.cs index cbe108a3..2d553854 100644 --- 
a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceTableParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceTableParser.cs @@ -69,7 +69,7 @@ readingLine = false; - count = ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition); + count = ProcessTokens(tokens, builder, isLenientParsing, count, ref definition); tokens.Clear(); @@ -94,7 +94,7 @@ if (tokens.Count > 0) { - ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition); + ProcessTokens(tokens, builder, isLenientParsing, count, ref definition); } scanner.DeregisterCustomTokenizer(tokenizer); @@ -105,7 +105,7 @@ return builder.Build(); } - private static int ProcessTokens(List tokens, ISeekableTokenScanner scanner, CrossReferenceTablePartBuilder builder, bool isLenientParsing, + private static int ProcessTokens(List tokens, CrossReferenceTablePartBuilder builder, bool isLenientParsing, int objectCount, ref TableSubsectionDefinition definition) { string GetErrorMessage() @@ -161,11 +161,6 @@ if (tokens[0] is NumericToken offset && tokens[1] is NumericToken generationNumber) { - if (offset.Long >= builder.Offset && offset.Long <= scanner.CurrentPosition) - { - throw new PdfDocumentFormatException($"Object offset {offset} is within its own cross-reference table for object {definition.FirstNumber + objectCount}"); - } - builder.Add(definition.FirstNumber + objectCount, generationNumber.Int, offset.Long); return objectCount + 1; diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index eeec8dc4..0a747bb2 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -29,6 +29,7 @@ private IEncryptionHandler encryptionHandler; private bool isDisposed; + private bool isBruteForcing; /// /// Stores tokens encountered between obj - endobj markers for each call. 
@@ -153,7 +154,7 @@ var streamIdentifier = new IndirectReference(objectNumber.Long, generation.Int); // Prevent an infinite loop where a stream's length references the stream or the stream's offset. - var getLengthFromFile = !(callingObject.HasValue && callingObject.Value.Equals(streamIdentifier)); + var getLengthFromFile = !isBruteForcing && !(callingObject.HasValue && callingObject.Value.Equals(streamIdentifier)); var outerCallingObject = callingObject; @@ -673,7 +674,7 @@ if (!MoveNext()) { - throw new PdfDocumentFormatException($"Could not parse the object with reference: {reference}."); + return BruteForceFileToFindReference(reference); } var found = (ObjectToken)CurrentToken; @@ -683,20 +684,34 @@ return found; } - // Brute force read the entire file - Seek(0); + return BruteForceFileToFindReference(reference); + } - while (MoveNext()) + private ObjectToken BruteForceFileToFindReference(IndirectReference reference) + { + try { - objectLocationProvider.Cache((ObjectToken)CurrentToken, true); - } + // Brute force read the entire file + isBruteForcing = true; - if (!objectLocationProvider.TryGetCached(reference, out objectToken)) + Seek(0); + + while (MoveNext()) + { + objectLocationProvider.Cache((ObjectToken)CurrentToken, true); + } + + if (!objectLocationProvider.TryGetCached(reference, out var objectToken)) + { + throw new PdfDocumentFormatException($"Could not locate object with reference: {reference} despite a full document search."); + } + + return objectToken; + } + finally { - throw new PdfDocumentFormatException($"Could not locate object with reference: {reference} despite a full document search."); + isBruteForcing = false; } - - return objectToken; } private ObjectToken GetObjectFromStream(IndirectReference reference, long offset)