Support reading files with missing white space after xref in lenient mode (#906)

Support missing white space after xref

---------

Co-authored-by: Arnaud TAMAILLON <arnaud.tamaillon@younited-credit.fr>
This commit is contained in:
Arnaud TAMAILLON
2024-09-09 08:09:04 +02:00
committed by GitHub
parent 09bddba778
commit 4845f43696
5 changed files with 65 additions and 7 deletions

View File

@@ -0,0 +1,13 @@
namespace UglyToad.PdfPig.Tests.Integration
{
    /// <summary>
    /// Integration tests that open whole documents to exercise cross reference parsing.
    /// </summary>
    public class CrossReferenceParserTests
    {
        /// <summary>
        /// Opens the "xref-with-no-whitespace.pdf" test document with the default
        /// options and checks that all three of its pages are reported.
        /// </summary>
        [Fact]
        public void CanReadDocumentWithMissingWhitespaceAfterXRef()
        {
            var documentPath = IntegrationHelpers.GetSpecificTestDocumentPath("xref-with-no-whitespace.pdf");

            using (var pdf = PdfDocument.Open(documentPath))
            {
                Assert.Equal(3, pdf.NumberOfPages);
            }
        }
    }
}

View File

@@ -300,18 +300,42 @@ trailer
trailer trailer
<< >>"; << >>";
// Strict parsing // Strict parsing
var input = StringBytesTestConverter.Scanner(data); var input = GetReader(data);
var act = () => CrossReferenceTableParser.Parse(input.scanner, 0, false); var act = () => CrossReferenceTableParser.Parse(input, 0, false);
var ex = Assert.Throws<PdfDocumentFormatException>(act); var ex = Assert.Throws<PdfDocumentFormatException>(act);
Assert.Equal("Found a line with 2 unexpected entries in the cross reference table: 127, 0.", ex.Message); Assert.Equal("Found a line with 2 unexpected entries in the cross reference table: 127, 0.", ex.Message);
// Lenient Parsing // Lenient Parsing
input = StringBytesTestConverter.Scanner(data); input = GetReader(data);
var result = CrossReferenceTableParser.Parse(input.scanner, 0, true); var result = CrossReferenceTableParser.Parse(input, 0, true);
Assert.Equal(6, result.ObjectOffsets.Count); Assert.Equal(6, result.ObjectOffsets.Count);
} }
[Fact]
public void ParsesMissingWhitespaceAfterXref()
{
    // The "xref" keyword is fused with the first number of the subsection header ("15 2").
    var data = @"xref15 2
0000000190 00000 n
0000000250 00032 n
trailer
<<>>";

    // Strict parsing: the fused token is reported as an unexpected operator.
    var strictScanner = GetReader(data);
    var strictParse = () => CrossReferenceTableParser.Parse(strictScanner, 0, false);
    var failure = Assert.Throws<PdfDocumentFormatException>(strictParse);
    Assert.Equal("Unexpected operator in xref position: xref15.", failure.Message);

    // Lenient parsing: a fresh scanner over the same data yields both entries.
    var lenientScanner = GetReader(data);
    var table = CrossReferenceTableParser.Parse(lenientScanner, 0, true);
    Assert.Equal(2, table.ObjectOffsets.Count);
}
private static CoreTokenScanner GetReader(string input) private static CoreTokenScanner GetReader(string input)
{ {
return StringBytesTestConverter.Scanner(input).scanner; return StringBytesTestConverter.Scanner(input).scanner;

View File

@@ -59,7 +59,7 @@
tokenScanner.MoveNext(); tokenScanner.MoveNext();
if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref") if (CrossReferenceTableParser.IsCrossReferenceMarker(tokenScanner, isLenientParsing))
{ {
missedAttempts = 0; missedAttempts = 0;
log.Debug("Element was cross reference table."); log.Debug("Element was cross reference table.");

View File

@@ -13,7 +13,7 @@
{ {
private const string InUseEntry = "n"; private const string InUseEntry = "n";
private const string FreeEntry = "f"; private const string FreeEntry = "f";
public static CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long offset, bool isLenientParsing) public static CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long offset, bool isLenientParsing)
{ {
var builder = new CrossReferenceTablePartBuilder var builder = new CrossReferenceTablePartBuilder
@@ -31,10 +31,22 @@
if (scanner.CurrentToken is OperatorToken operatorToken) if (scanner.CurrentToken is OperatorToken operatorToken)
{ {
if (operatorToken.Data == "xref") if (operatorToken.Data == OperatorToken.Xref.Data)
{ {
scanner.MoveNext(); scanner.MoveNext();
} }
else if (isLenientParsing)
{
if (operatorToken.Data.StartsWith(OperatorToken.Xref.Data))
{
scanner.Seek(scanner.CurrentPosition - operatorToken.Data.Length + OperatorToken.Xref.Data.Length);
scanner.MoveNext();
}
else
{
throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}.");
}
}
else else
{ {
throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}."); throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}.");
@@ -106,6 +118,15 @@
return builder.Build(); return builder.Build();
} }
/// <summary>
/// Determines whether the scanner's current token marks the start of a cross reference table.
/// </summary>
/// <param name="scanner">The scanner whose <c>CurrentToken</c> is inspected; it is not advanced or moved.</param>
/// <param name="isLenientParsing">
/// When <see langword="true"/>, an operator token consisting of the xref keyword immediately
/// followed by an integer (for example <c>xref15</c>) is also accepted.
/// </param>
/// <returns>
/// <see langword="true"/> if the current token is an operator token equal to the xref keyword, or,
/// in lenient mode, the xref keyword directly followed by text that parses as an integer;
/// otherwise <see langword="false"/>.
/// </returns>
public static bool IsCrossReferenceMarker(ISeekableTokenScanner scanner, bool isLenientParsing)
{
    if (!(scanner.CurrentToken is OperatorToken operatorToken))
    {
        return false;
    }

    var marker = OperatorToken.Xref.Data;
    var data = operatorToken.Data;

    if (data == marker)
    {
        return true;
    }

    // The keyword is file syntax, not natural language text: compare ordinally so the result
    // does not depend on the culture of the thread doing the parsing.
    if (!isLenientParsing || !data.StartsWith(marker, System.StringComparison.Ordinal))
    {
        return false;
    }

    // Whatever follows the keyword must be a number; parse it culture-invariantly for the same reason.
    return int.TryParse(
        data.Substring(marker.Length),
        System.Globalization.NumberStyles.Integer,
        System.Globalization.CultureInfo.InvariantCulture,
        out _);
}
private static int ProcessTokens(ReadOnlySpan<IToken> tokens, CrossReferenceTablePartBuilder builder, bool isLenientParsing, private static int ProcessTokens(ReadOnlySpan<IToken> tokens, CrossReferenceTablePartBuilder builder, bool isLenientParsing,
int objectCount, ref TableSubsectionDefinition definition) int objectCount, ref TableSubsectionDefinition definition)
{ {