diff --git a/src/UglyToad.PdfPig.Tests/Integration/CrossReferenceParserTests.cs b/src/UglyToad.PdfPig.Tests/Integration/CrossReferenceParserTests.cs new file mode 100644 index 00000000..beab121b --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/CrossReferenceParserTests.cs @@ -0,0 +1,13 @@ +namespace UglyToad.PdfPig.Tests.Integration +{ + public class CrossReferenceParserTests + { + [Fact] + public void CanReadDocumentWithMissingWhitespaceAfterXRef() + { + string path = IntegrationHelpers.GetSpecificTestDocumentPath("xref-with-no-whitespace.pdf"); + using var document = PdfDocument.Open(path); + Assert.Equal(3, document.NumberOfPages); + } + } +} diff --git a/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/xref-with-no-whitespace.pdf b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/xref-with-no-whitespace.pdf new file mode 100644 index 00000000..3ed57b96 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/xref-with-no-whitespace.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs index 16de78de..0f0e8a79 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs @@ -300,18 +300,42 @@ trailer trailer << >>"; // Strict parsing - var input = StringBytesTestConverter.Scanner(data); - var act = () => CrossReferenceTableParser.Parse(input.scanner, 0, false); + var input = GetReader(data); + var act = () => CrossReferenceTableParser.Parse(input, 0, false); var ex = Assert.Throws(act); Assert.Equal("Found a line with 2 unexpected entries in the cross reference table: 127, 0.", ex.Message); // Lenient Parsing - input = StringBytesTestConverter.Scanner(data); - var result = CrossReferenceTableParser.Parse(input.scanner, 0, true); + input = GetReader(data); + var result = CrossReferenceTableParser.Parse(input, 0, true); Assert.Equal(6, result.ObjectOffsets.Count); } + [Fact] + public void ParsesMissingWhitespaceAfterXref() + { + var data = @"xref15 2 +0000000190 00000 n +0000000250 00032 n + +trailer +<<>>"; + var input = GetReader(data); + + // Strict parsing + var act = () => CrossReferenceTableParser.Parse(input, 0, false); + + var ex = Assert.Throws(act); + Assert.Equal("Unexpected operator in xref position: xref15.", ex.Message); + + // Lenient Parsing + input = GetReader(data); + var result = CrossReferenceTableParser.Parse(input, 0, true); + + Assert.Equal(2, result.ObjectOffsets.Count); + } + private static CoreTokenScanner GetReader(string input) { return StringBytesTestConverter.Scanner(input).scanner; diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs index e58b6e1c..f5cac5c0 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs @@ -59,7 +59,7 @@ tokenScanner.MoveNext(); - if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref") + if (CrossReferenceTableParser.IsCrossReferenceMarker(tokenScanner, isLenientParsing)) { missedAttempts = 0; log.Debug("Element was cross reference table."); diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceTableParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceTableParser.cs index 074ba9a0..053bd004 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceTableParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceTableParser.cs @@ -13,7 +13,7 @@ { private const string InUseEntry = "n"; private const string FreeEntry = "f"; - + public static CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long offset, bool isLenientParsing) { var builder = new CrossReferenceTablePartBuilder @@ -31,10 +31,22 @@ if (scanner.CurrentToken is OperatorToken operatorToken) { - if (operatorToken.Data == "xref") + if (operatorToken.Data == OperatorToken.Xref.Data) { scanner.MoveNext(); } + else if (isLenientParsing) + { + if (operatorToken.Data.StartsWith(OperatorToken.Xref.Data)) + { + scanner.Seek(scanner.CurrentPosition - operatorToken.Data.Length + OperatorToken.Xref.Data.Length); + scanner.MoveNext(); + } + else + { + throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}."); + } + } else { throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}."); @@ -106,6 +118,15 @@ return builder.Build(); } + public static bool IsCrossReferenceMarker(ISeekableTokenScanner scanner, bool isLenientParsing) + { + return (scanner.CurrentToken is OperatorToken operatorToken + && (operatorToken.Data == OperatorToken.Xref.Data + || (isLenientParsing + && operatorToken.Data.StartsWith(OperatorToken.Xref.Data) + && int.TryParse(operatorToken.Data.Substring(OperatorToken.Xref.Data.Length), out _)))); + } + private static int ProcessTokens(ReadOnlySpan tokens, CrossReferenceTablePartBuilder builder, bool isLenientParsing, int objectCount, ref TableSubsectionDefinition definition) {