From 41eddca0bf66b79d002dbe0deaa47658bbc0cf19 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Sun, 23 Jun 2019 12:05:21 +0100 Subject: [PATCH] handle incorrect xref offsets #34 previously if the cross reference did not exist at exactly the provided offset we'd immediately throw, now we assume we can read a few more tokens to find the xref table or stream start. this won't work in the case where the provided offset is past the start of the table or nowhere near the table but in those cases there's not much we can do. there's some more work to do to provide a fallback xref parser which finds the xref tables and streams using a brute-force scan of the whole document. --- .../FileStructure/CrossReferenceParser.cs | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs index b1716978..f41b3ec7 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs @@ -45,8 +45,10 @@ var prevSet = new HashSet(); long previousCrossReferenceLocation = crossReferenceLocation; + var missedAttempts = 0; + // Parse all cross reference tables and streams. - while (previousCrossReferenceLocation > 0) + while (previousCrossReferenceLocation > 0 && missedAttempts < 100) { log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}."); @@ -57,6 +59,7 @@ if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref") { + missedAttempts = 0; log.Debug("Element was cross reference table."); CrossReferenceTablePart tablePart = crossReferenceTableParser.Parse(tokenScanner, @@ -132,6 +135,8 @@ { log.Debug("Element was cross reference stream."); + missedAttempts = 0; + // Unread the numeric token. tokenScanner.Seek(previousCrossReferenceLocation); @@ -153,10 +158,14 @@ } else { - log.Debug("Element was invalid."); + log.Debug($"The cross reference found at this location ({previousCrossReferenceLocation}) was not a table or stream. " + + $"Found token ({tokenScanner.CurrentToken}) ending at {tokenScanner.CurrentPosition} instead. Seeking next token."); - throw new PdfDocumentFormatException("The cross reference found at this location was not a " + - $"table or a stream: Location - {previousCrossReferenceLocation}, {tokenScanner.CurrentPosition}."); + previousCrossReferenceLocation = tokenScanner.CurrentPosition; + + missedAttempts++; + + continue; } if (prevSet.Contains(previousCrossReferenceLocation)) @@ -167,6 +176,12 @@ prevSet.Add(previousCrossReferenceLocation); } + if (missedAttempts == 100) + { + // TODO: scan the document to find the correct token. + throw new PdfDocumentFormatException("The cross reference was not found."); + } + var resolved = table.Build(crossReferenceLocation, log); // check the offsets of all referenced objects