handle incorrect xref offsets #34

Previously, if the cross reference did not exist at exactly the provided offset, we would immediately throw. Now we assume we can read a few more tokens (giving up after 100 consecutive tokens that are not the start of a table or stream) to find the start of the xref table or stream. This won't work when the provided offset is past the start of the table or nowhere near the table, but in those cases there's not much we can do. There is some more work to do to provide a fallback xref parser that finds the xref tables and streams using a brute-force scan of the whole document.
This commit is contained in:
Eliot Jones
2019-06-23 12:05:21 +01:00
parent 0c1b50fcc4
commit 41eddca0bf

View File

@@ -45,8 +45,10 @@
var prevSet = new HashSet<long>(); var prevSet = new HashSet<long>();
long previousCrossReferenceLocation = crossReferenceLocation; long previousCrossReferenceLocation = crossReferenceLocation;
var missedAttempts = 0;
// Parse all cross reference tables and streams. // Parse all cross reference tables and streams.
while (previousCrossReferenceLocation > 0) while (previousCrossReferenceLocation > 0 && missedAttempts < 100)
{ {
log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}."); log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}.");
@@ -57,6 +59,7 @@
if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref") if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
{ {
missedAttempts = 0;
log.Debug("Element was cross reference table."); log.Debug("Element was cross reference table.");
CrossReferenceTablePart tablePart = crossReferenceTableParser.Parse(tokenScanner, CrossReferenceTablePart tablePart = crossReferenceTableParser.Parse(tokenScanner,
@@ -132,6 +135,8 @@
{ {
log.Debug("Element was cross reference stream."); log.Debug("Element was cross reference stream.");
missedAttempts = 0;
// Unread the numeric token. // Unread the numeric token.
tokenScanner.Seek(previousCrossReferenceLocation); tokenScanner.Seek(previousCrossReferenceLocation);
@@ -153,10 +158,14 @@
} }
else else
{ {
log.Debug("Element was invalid."); log.Debug($"The cross reference found at this location ({previousCrossReferenceLocation}) was not a table or stream. " +
$"Found token ({tokenScanner.CurrentToken}) ending at {tokenScanner.CurrentPosition} instead. Seeking next token.");
throw new PdfDocumentFormatException("The cross reference found at this location was not a " + previousCrossReferenceLocation = tokenScanner.CurrentPosition;
$"table or a stream: Location - {previousCrossReferenceLocation}, {tokenScanner.CurrentPosition}.");
missedAttempts++;
continue;
} }
if (prevSet.Contains(previousCrossReferenceLocation)) if (prevSet.Contains(previousCrossReferenceLocation))
@@ -167,6 +176,12 @@
prevSet.Add(previousCrossReferenceLocation); prevSet.Add(previousCrossReferenceLocation);
} }
if (missedAttempts == 100)
{
// TODO: scan the document to find the correct token.
throw new PdfDocumentFormatException("The cross reference was not found.");
}
var resolved = table.Build(crossReferenceLocation, log); var resolved = table.Build(crossReferenceLocation, log);
// check the offsets of all referenced objects // check the offsets of all referenced objects