From 2a68670896030de8d0cf08748dacf0fd3306cf7a Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Sun, 24 Apr 2022 12:37:26 -0400 Subject: [PATCH] #443 handle case where file version comment token included in string by tokenization instead just brute force the raw content --- .../Parser/Parts/BaseFontRangeParserTests.cs | 6 +- .../Parser/Parts/FileHeaderParserTests.cs | 52 ++++++++--- .../CrossReferenceTableParserTests.cs | 4 +- .../StringBytesTestConverter.cs | 7 +- .../Parser/FileStructure/FileHeaderParser.cs | 92 +++++++++++++++---- .../Parser/PdfDocumentFactory.cs | 2 +- 6 files changed, 122 insertions(+), 41 deletions(-) diff --git a/src/UglyToad.PdfPig.Tests/Fonts/Parser/Parts/BaseFontRangeParserTests.cs b/src/UglyToad.PdfPig.Tests/Fonts/Parser/Parts/BaseFontRangeParserTests.cs index 5b7683c7..2941fb4b 100644 --- a/src/UglyToad.PdfPig.Tests/Fonts/Parser/Parts/BaseFontRangeParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Fonts/Parser/Parts/BaseFontRangeParserTests.cs @@ -16,7 +16,7 @@ var builder = new CharacterMapBuilder(); - parser.Parse(new NumericToken(1), input, builder); + parser.Parse(new NumericToken(1), input.scanner, builder); Assert.Equal(2, builder.BaseFontCharacterMap.Count); @@ -31,7 +31,7 @@ var builder = new CharacterMapBuilder(); - parser.Parse(new NumericToken(1), input, builder); + parser.Parse(new NumericToken(1), input.scanner, builder); Assert.Equal(7, builder.BaseFontCharacterMap.Count); @@ -47,7 +47,7 @@ var builder = new CharacterMapBuilder(); - parser.Parse(new NumericToken(2), input, builder); + parser.Parse(new NumericToken(2), input.scanner, builder); Assert.Equal(6, builder.BaseFontCharacterMap.Count); diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs index a292e9bd..7974a358 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs @@ -5,6 +5,8 @@ using PdfPig.Core; using PdfPig.Parser.FileStructure; using PdfPig.Tokenization.Scanner; + using PdfPig.Tokens; + using System.Linq; using Xunit; public class FileHeaderParserTests @@ -13,7 +15,7 @@ [Fact] public void NullScannerThrows() { - Action action = () => FileHeaderParser.Parse(null, false, log); + Action action = () => FileHeaderParser.Parse(null, null, false, log); Assert.Throws(action); } @@ -31,7 +33,7 @@ var scanner = StringBytesTestConverter.Scanner(input); - var result = FileHeaderParser.Parse(scanner, false, log); + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); Assert.Equal(format, result.VersionString); Assert.Equal(0, result.OffsetInFile); @@ -46,7 +48,7 @@ var scanner = StringBytesTestConverter.Scanner(input); - var result = FileHeaderParser.Parse(scanner, false, log); + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); Assert.Equal(1.2m, result.Version); Assert.Equal(TestEnvironment.IsUnixPlatform ? 7 : 9, result.OffsetInFile); @@ -57,7 +59,7 @@ { var scanner = StringBytesTestConverter.Scanner(string.Empty); - Action action = () => FileHeaderParser.Parse(scanner, false, log); + Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); Assert.Throws(action); } @@ -68,7 +70,7 @@ var scanner = StringBytesTestConverter.Scanner(@"one %PDF-1.2"); - var result = FileHeaderParser.Parse(scanner, false, log); + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); Assert.Equal(1.2m, result.Version); Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile); @@ -80,7 +82,7 @@ var scanner = StringBytesTestConverter.Scanner(@"one %PDF-1.7"); - var result = FileHeaderParser.Parse(scanner, true, log); + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); Assert.Equal(1.7m, result.Version); Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile); @@ -92,7 +94,7 @@ var scanner = StringBytesTestConverter.Scanner(@"one two three %PDF-1.6"); - var result = FileHeaderParser.Parse(scanner, true, log); + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); Assert.Equal(1.6m, result.Version); Assert.Equal(TestEnvironment.IsUnixPlatform ? 14 : 15, result.OffsetInFile); @@ -103,7 +105,7 @@ three %PDF-1.6"); { var scanner = StringBytesTestConverter.Scanner(@"one two"); - Action action = () => FileHeaderParser.Parse(scanner, true, log); + Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); Assert.Throws(action); } @@ -113,7 +115,7 @@ three %PDF-1.6"); { var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69"); - Action action = () => FileHeaderParser.Parse(scanner, false, log); + Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); Assert.Throws(action); } @@ -123,7 +125,7 @@ three %PDF-1.6"); { var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69"); - var result = FileHeaderParser.Parse(scanner, true, log); + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); Assert.Equal(1.4m, result.Version); } @@ -133,9 +135,9 @@ three %PDF-1.6"); { var scanner = StringBytesTestConverter.Scanner(@"%FDF-1.6"); - var result = FileHeaderParser.Parse(scanner, false, log); + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); - Assert.Equal(0, scanner.CurrentPosition); + Assert.Equal(0, scanner.scanner.CurrentPosition); Assert.Equal(0, result.OffsetInFile); } @@ -144,11 +146,33 @@ three %PDF-1.6"); { var input = OtherEncodings.StringAsLatin1Bytes("%PDF-1.7\r\n%âãÏÓ\r\n1 0 obj\r\n<>\r\nendobj"); - var scanner = new CoreTokenScanner(new ByteArrayInputBytes(input), ScannerScope.None); + var bytes = new ByteArrayInputBytes(input); - var result = FileHeaderParser.Parse(scanner, false, log); + var scanner = new CoreTokenScanner(bytes, ScannerScope.None); + + var result = FileHeaderParser.Parse(scanner, bytes, false, log); Assert.Equal(1.7m, result.Version); } + + [Fact] + public void Issue443() + { + const string hex = + @"00 0F 4A 43 42 31 33 36 36 31 32 32 37 2E 70 64 66 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 50 44 46 20 43 41 52 4F 01 00 FF FF FF FF 00 00 00 00 00 04 DF 28 00 00 00 00 AF 51 7E 82 AF 52 D7 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 81 81 03 0D 00 00 25 50 44 46 2D 31 2E 31 0A 25 E2 E3 CF D3 0D 0A 31 20 30 20 6F 62 6A"; + + var bytes = hex.Split(' ', StringSplitOptions.RemoveEmptyEntries).Select(x => HexToken.Convert(x[0], x[1])); + + var str = OtherEncodings.BytesAsLatin1String(bytes.ToArray()); + + var scanner = StringBytesTestConverter.Scanner(str); + + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); + + Assert.Equal(0, scanner.scanner.CurrentPosition); + Assert.Equal(129, result.OffsetInFile); + Assert.Equal(1.1m, result.Version); + Assert.Equal("PDF-1.1", result.VersionString); + } } } diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs index 112e0cfb..fc9b6f40 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileStructure/CrossReferenceTableParserTests.cs @@ -25,7 +25,7 @@ trailer << >>"); - var result = CrossReferenceTableParser.Parse(input, 4, false); + var result = CrossReferenceTableParser.Parse(input.scanner, 4, false); Assert.Equal(4, result.ObjectOffsets.Count); } @@ -288,7 +288,7 @@ trailer private static CoreTokenScanner GetReader(string input) { - return StringBytesTestConverter.Scanner(input); + return StringBytesTestConverter.Scanner(input).scanner; } } } diff --git a/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs b/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs index fc1386c9..4fa77915 100644 --- a/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs +++ b/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs @@ -31,11 +31,12 @@ public IInputBytes Bytes { get; set; } } - internal static CoreTokenScanner Scanner(string s) + internal static (CoreTokenScanner scanner, IInputBytes bytes) Scanner(string s) { - var result = new CoreTokenScanner(new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s))); + var inputBytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s)); + var result = new CoreTokenScanner(inputBytes); - return result; + return (result, inputBytes); } } } diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs index 3bde26dd..69273537 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs @@ -29,40 +29,41 @@ internal static class FileHeaderParser { [NotNull] - public static HeaderVersion Parse([NotNull]ISeekableTokenScanner scanner, bool isLenientParsing, ILog log) + public static HeaderVersion Parse([NotNull] ISeekableTokenScanner scanner, IInputBytes inputBytes, bool isLenientParsing, ILog log) { if (scanner == null) { throw new ArgumentNullException(nameof(scanner)); } - // Read the first token - if (!scanner.MoveNext()) - { - throw new PdfDocumentFormatException($"Could not read the first token in the document at position {scanner.CurrentPosition}."); - } + var startPosition = scanner.CurrentPosition; - var comment = scanner.CurrentToken as CommentToken; - - const int junkTokensTolerance = 25; + const int junkTokensTolerance = 30; var attempts = 0; - while (comment == null) + CommentToken comment; + do { - if (attempts == junkTokensTolerance) + if (attempts == junkTokensTolerance || !scanner.MoveNext()) { - throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document."); - } + if (!TryBruteForceVersionLocation(startPosition, inputBytes, out var version)) + { + throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document."); + } - if (!scanner.MoveNext()) - { - throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document."); + scanner.Seek(startPosition); + return version; } comment = scanner.CurrentToken as CommentToken; attempts++; - } + } while (comment == null); + return GetHeaderVersionAndResetScanner(comment, scanner, isLenientParsing, log); + } + + private static HeaderVersion GetHeaderVersionAndResetScanner(CommentToken comment, ISeekableTokenScanner scanner, bool isLenientParsing, ILog log) + { if (comment.Data.IndexOf("PDF-1.", StringComparison.OrdinalIgnoreCase) != 0 && comment.Data.IndexOf("FDF-1.", StringComparison.OrdinalIgnoreCase) != 0) { return HandleMissingVersion(comment, isLenientParsing, log); @@ -70,7 +71,7 @@ const int toDecimalStartLength = 4; - if (!decimal.TryParse(comment.Data.Substring(toDecimalStartLength), + if (!decimal.TryParse(comment.Data.Substring(toDecimalStartLength), NumberStyles.Number, CultureInfo.InvariantCulture, out var version)) @@ -90,6 +91,61 @@ return result; } + private static bool TryBruteForceVersionLocation(long startPosition, IInputBytes inputBytes, out HeaderVersion headerVersion) + { + headerVersion = null; + + inputBytes.Seek(startPosition); + + // %PDF-x.y or %FDF-x.y + const int versionLength = 8; + const int bufferLength = 64; + + // Slide a window of bufferLength bytes across the file allowing for the fact the version could get split by + // the window (so always ensure an overlap of versionLength bytes between the end of the previous and start of the next buffer). + var buffer = new byte[bufferLength]; + + var currentOffset = startPosition; + int readLength; + do + { + readLength = inputBytes.Read(buffer, bufferLength); + + var content = OtherEncodings.BytesAsLatin1String(buffer); + + var pdfIndex = content.IndexOf("%PDF-", StringComparison.OrdinalIgnoreCase); + var fdfIndex = content.IndexOf("%FDF-", StringComparison.OrdinalIgnoreCase); + var actualIndex = pdfIndex >= 0 ? pdfIndex : fdfIndex; + + if (actualIndex >= 0 && content.Length - actualIndex >= versionLength) + { + var numberPart = content.Substring(actualIndex + 5, 3); + if (decimal.TryParse( + numberPart, + NumberStyles.Number, + CultureInfo.InvariantCulture, + out var version)) + { + var afterCommentSymbolIndex = actualIndex + 1; + + headerVersion = new HeaderVersion( + version, + content.Substring(afterCommentSymbolIndex, versionLength - 1), + currentOffset + afterCommentSymbolIndex); + + inputBytes.Seek(startPosition); + + return true; + } + } + + currentOffset += readLength - versionLength; + inputBytes.Seek(currentOffset); + } while (readLength == bufferLength); + + return false; + } + private static HeaderVersion HandleMissingVersion(CommentToken comment, bool isLenientParsing, ILog log) { if (isLenientParsing) diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs index 4d34c735..d2cc5c02 100644 --- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs @@ -97,7 +97,7 @@ var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider); var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser); - var version = FileHeaderParser.Parse(scanner, isLenientParsing, log); + var version = FileHeaderParser.Parse(scanner, inputBytes, isLenientParsing, log); var crossReferenceOffset = FileTrailerParser.GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing) + version.OffsetInFile;