Add special-case handling for faulty xref offsets where whitespace is missing between the %%EOF marker and the following object number (e.g. "%%EOF1 0 obj")

This commit is contained in:
Eliot Jones
2019-06-14 20:40:24 +01:00
parent 4c716fcbd6
commit 98424b32aa
2 changed files with 51 additions and 29 deletions

View File

@@ -1,39 +1,39 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using System;
using System.Diagnostics;
using System.IO;
using Xunit;
//using System;
//using System.Diagnostics;
//using System.IO;
//using Xunit;
/// <summary>
/// A class for testing files which are not checked in to source control.
/// </summary>
public class LocalTests
{
[Fact]
public void Tests()
{
var files = Directory.GetFiles(@"C:\temp\pdfs");
//[Fact]
//public void Tests()
//{
// var files = Directory.GetFiles(@"C:\temp\pdfs");
foreach (var file in files)
{
try
{
using (var document = PdfDocument.Open(file, new ParsingOptions { UseLenientParsing = false }))
{
for (var i = 1; i <= document.NumberOfPages; i++)
{
var page = document.GetPage(i);
var text = page.Text;
Trace.WriteLine(text);
}
}
}
catch (Exception ex)
{
throw new InvalidOperationException($"Error parsing: {Path.GetFileName(file)}.", ex);
}
}
}
// foreach (var file in files)
// {
// try
// {
// using (var document = PdfDocument.Open(file, new ParsingOptions { UseLenientParsing = false }))
// {
// for (var i = 1; i <= document.NumberOfPages; i++)
// {
// var page = document.GetPage(i);
// var text = page.Text;
// Trace.WriteLine(text);
// }
// }
// }
// catch (Exception ex)
// {
// throw new InvalidOperationException($"Error parsing: {Path.GetFileName(file)}.", ex);
// }
// }
//}
}
}

View File

@@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text.RegularExpressions;
using Encryption;
using Exceptions;
using Filters;
@@ -26,6 +27,8 @@
internal class PdfTokenScanner : IPdfTokenScanner
{
private static readonly Regex EndsWithNumberRegex = new Regex(@"(?<=^[^\s\d]+)\d+$");
private readonly IInputBytes inputBytes;
private readonly IObjectLocationProvider objectLocationProvider;
private readonly IFilterProvider filterProvider;
@@ -96,7 +99,26 @@
if (objectNumber == null || generation == null)
{
return false;
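// Handle the case where the generation number and 'obj' were read correctly but the first token has absorbed
// trailing characters of the preceding marker, e.g. '%%EOF1 0 obj' where scanning starts from 'F' and yields 'F1'.
// If that token ends in digits, treat those digits as the object number and move the start position to them.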
if (generation != null && previousTokens[0] is OperatorToken op)
{
var match = EndsWithNumberRegex.Match(op.Data);
if (match.Success && int.TryParse(match.Value, out var number))
{
startPosition = previousTokenPositions[0] + match.Index;
objectNumber = new NumericToken(number);
}
else
{
return false;
}
}
else
{
return false;
}
}
// Read all tokens between obj and endobj.