Special-case handling for faulty xref offsets caused by missing whitespace between the %%EOF marker and the following object number

This commit is contained in:
Eliot Jones
2019-06-14 20:40:24 +01:00
parent 4c716fcbd6
commit 98424b32aa
2 changed files with 51 additions and 29 deletions

View File

@@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text.RegularExpressions;
using Encryption;
using Exceptions;
using Filters;
@@ -26,6 +27,8 @@
internal class PdfTokenScanner : IPdfTokenScanner
{
private static readonly Regex EndsWithNumberRegex = new Regex(@"(?<=^[^\s\d]+)\d+$");
private readonly IInputBytes inputBytes;
private readonly IObjectLocationProvider objectLocationProvider;
private readonly IFilterProvider filterProvider;
@@ -96,7 +99,26 @@
if (objectNumber == null || generation == null)
{
return false;
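// Handle the case where the scanner reads most of an object header correctly but the first token contains too much,
// specifically "%%EOF1 0 obj" where scanning starts from 'F': the trailing digits of that operator token are the
// object number, so recover them and move the start position to where those digits begin.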
if (generation != null && previousTokens[0] is OperatorToken op)
{
var match = EndsWithNumberRegex.Match(op.Data);
if (match.Success && int.TryParse(match.Value, out var number))
{
startPosition = previousTokenPositions[0] + match.Index;
objectNumber = new NumericToken(number);
}
else
{
return false;
}
}
else
{
return false;
}
}
// Read all tokens between obj and endobj.