#443 handle the case where tokenization swallows the file version comment token into a string

Instead, just brute-force search the raw content for the version header.
This commit is contained in:
Eliot Jones
2022-04-24 12:37:26 -04:00
parent 801a395ba4
commit 2a68670896
6 changed files with 122 additions and 41 deletions

View File

@@ -16,7 +16,7 @@
var builder = new CharacterMapBuilder();
parser.Parse(new NumericToken(1), input, builder);
parser.Parse(new NumericToken(1), input.scanner, builder);
Assert.Equal(2, builder.BaseFontCharacterMap.Count);
@@ -31,7 +31,7 @@
var builder = new CharacterMapBuilder();
parser.Parse(new NumericToken(1), input, builder);
parser.Parse(new NumericToken(1), input.scanner, builder);
Assert.Equal(7, builder.BaseFontCharacterMap.Count);
@@ -47,7 +47,7 @@
var builder = new CharacterMapBuilder();
parser.Parse(new NumericToken(2), input, builder);
parser.Parse(new NumericToken(2), input.scanner, builder);
Assert.Equal(6, builder.BaseFontCharacterMap.Count);

View File

@@ -5,6 +5,8 @@
using PdfPig.Core;
using PdfPig.Parser.FileStructure;
using PdfPig.Tokenization.Scanner;
using PdfPig.Tokens;
using System.Linq;
using Xunit;
public class FileHeaderParserTests
@@ -13,7 +15,7 @@
[Fact]
public void NullScannerThrows()
{
Action action = () => FileHeaderParser.Parse(null, false, log);
Action action = () => FileHeaderParser.Parse(null, null, false, log);
Assert.Throws<ArgumentNullException>(action);
}
@@ -31,7 +33,7 @@
var scanner = StringBytesTestConverter.Scanner(input);
var result = FileHeaderParser.Parse(scanner, false, log);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Equal(format, result.VersionString);
Assert.Equal(0, result.OffsetInFile);
@@ -46,7 +48,7 @@
var scanner = StringBytesTestConverter.Scanner(input);
var result = FileHeaderParser.Parse(scanner, false, log);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Equal(1.2m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 7 : 9, result.OffsetInFile);
@@ -57,7 +59,7 @@
{
var scanner = StringBytesTestConverter.Scanner(string.Empty);
Action action = () => FileHeaderParser.Parse(scanner, false, log);
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Throws<PdfDocumentFormatException>(action);
}
@@ -68,7 +70,7 @@
var scanner = StringBytesTestConverter.Scanner(@"one
%PDF-1.2");
var result = FileHeaderParser.Parse(scanner, false, log);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Equal(1.2m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
@@ -80,7 +82,7 @@
var scanner = StringBytesTestConverter.Scanner(@"one
%PDF-1.7");
var result = FileHeaderParser.Parse(scanner, true, log);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
Assert.Equal(1.7m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
@@ -92,7 +94,7 @@
var scanner = StringBytesTestConverter.Scanner(@"one two
three %PDF-1.6");
var result = FileHeaderParser.Parse(scanner, true, log);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
Assert.Equal(1.6m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 14 : 15, result.OffsetInFile);
@@ -103,7 +105,7 @@ three %PDF-1.6");
{
var scanner = StringBytesTestConverter.Scanner(@"one two");
Action action = () => FileHeaderParser.Parse(scanner, true, log);
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
Assert.Throws<PdfDocumentFormatException>(action);
}
@@ -113,7 +115,7 @@ three %PDF-1.6");
{
var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
Action action = () => FileHeaderParser.Parse(scanner, false, log);
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Throws<PdfDocumentFormatException>(action);
}
@@ -123,7 +125,7 @@ three %PDF-1.6");
{
var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
var result = FileHeaderParser.Parse(scanner, true, log);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
Assert.Equal(1.4m, result.Version);
}
@@ -133,9 +135,9 @@ three %PDF-1.6");
{
var scanner = StringBytesTestConverter.Scanner(@"%FDF-1.6");
var result = FileHeaderParser.Parse(scanner, false, log);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Equal(0, scanner.CurrentPosition);
Assert.Equal(0, scanner.scanner.CurrentPosition);
Assert.Equal(0, result.OffsetInFile);
}
@@ -144,11 +146,33 @@ three %PDF-1.6");
{
var input = OtherEncodings.StringAsLatin1Bytes("%PDF-1.7\r\n%âãÏÓ\r\n1 0 obj\r\n<</Lang(en-US)>>\r\nendobj");
var scanner = new CoreTokenScanner(new ByteArrayInputBytes(input), ScannerScope.None);
var bytes = new ByteArrayInputBytes(input);
var result = FileHeaderParser.Parse(scanner, false, log);
var scanner = new CoreTokenScanner(bytes, ScannerScope.None);
var result = FileHeaderParser.Parse(scanner, bytes, false, log);
Assert.Equal(1.7m, result.Version);
}
/// <summary>
/// Regression test for issue 443: the <c>%PDF-1.1</c> header is preceded by binary junk, and the parser
/// must still report the version, the version string and the header offset, leaving the scanner at position 0.
/// </summary>
[Fact]
public void Issue443()
{
    const string hex =
        @"00 0F 4A 43 42 31 33 36 36 31 32 32 37 2E 70 64 66 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 50 44 46 20 43 41 52 4F 01 00 FF FF FF FF 00 00 00 00 00 04 DF 28 00 00 00 00 AF 51 7E 82 AF 52 D7 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 81 81 03 0D 00 00 25 50 44 46 2D 31 2E 31 0A 25 E2 E3 CF D3 0D 0A 31 20 30 20 6F 62 6A";

    // Decode each two-character hex pair into its byte, then round-trip through Latin-1 so the
    // test helper receives exactly these bytes.
    var rawBytes = hex
        .Split(' ', StringSplitOptions.RemoveEmptyEntries)
        .Select(pair => HexToken.Convert(pair[0], pair[1]))
        .ToArray();
    var text = OtherEncodings.BytesAsLatin1String(rawBytes);

    var (tokenScanner, inputBytes) = StringBytesTestConverter.Scanner(text);

    var header = FileHeaderParser.Parse(tokenScanner, inputBytes, false, log);

    // The scanner must be back at the position it started from once the header has been located.
    Assert.Equal(0, tokenScanner.CurrentPosition);
    Assert.Equal(129, header.OffsetInFile);
    Assert.Equal(1.1m, header.Version);
    Assert.Equal("PDF-1.1", header.VersionString);
}
}
}

View File

@@ -25,7 +25,7 @@
trailer
<< >>");
var result = CrossReferenceTableParser.Parse(input, 4, false);
var result = CrossReferenceTableParser.Parse(input.scanner, 4, false);
Assert.Equal(4, result.ObjectOffsets.Count);
}
@@ -288,7 +288,7 @@ trailer
private static CoreTokenScanner GetReader(string input)
{
return StringBytesTestConverter.Scanner(input);
return StringBytesTestConverter.Scanner(input).scanner;
}
}
}

View File

@@ -31,11 +31,12 @@
public IInputBytes Bytes { get; set; }
}
internal static CoreTokenScanner Scanner(string s)
internal static (CoreTokenScanner scanner, IInputBytes bytes) Scanner(string s)
{
var result = new CoreTokenScanner(new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s)));
var inputBytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s));
var result = new CoreTokenScanner(inputBytes);
return result;
return (result, inputBytes);
}
}
}

View File

@@ -29,40 +29,41 @@
internal static class FileHeaderParser
{
[NotNull]
public static HeaderVersion Parse([NotNull]ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
public static HeaderVersion Parse([NotNull] ISeekableTokenScanner scanner, IInputBytes inputBytes, bool isLenientParsing, ILog log)
{
if (scanner == null)
{
throw new ArgumentNullException(nameof(scanner));
}
// Read the first token
if (!scanner.MoveNext())
{
throw new PdfDocumentFormatException($"Could not read the first token in the document at position {scanner.CurrentPosition}.");
}
var startPosition = scanner.CurrentPosition;
var comment = scanner.CurrentToken as CommentToken;
const int junkTokensTolerance = 25;
const int junkTokensTolerance = 30;
var attempts = 0;
while (comment == null)
CommentToken comment;
do
{
if (attempts == junkTokensTolerance)
if (attempts == junkTokensTolerance || !scanner.MoveNext())
{
throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
}
if (!TryBruteForceVersionLocation(startPosition, inputBytes, out var version))
{
throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
}
if (!scanner.MoveNext())
{
throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
scanner.Seek(startPosition);
return version;
}
comment = scanner.CurrentToken as CommentToken;
attempts++;
}
} while (comment == null);
return GetHeaderVersionAndResetScanner(comment, scanner, isLenientParsing, log);
}
private static HeaderVersion GetHeaderVersionAndResetScanner(CommentToken comment, ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
{
if (comment.Data.IndexOf("PDF-1.", StringComparison.OrdinalIgnoreCase) != 0 && comment.Data.IndexOf("FDF-1.", StringComparison.OrdinalIgnoreCase) != 0)
{
return HandleMissingVersion(comment, isLenientParsing, log);
@@ -70,7 +71,7 @@
const int toDecimalStartLength = 4;
if (!decimal.TryParse(comment.Data.Substring(toDecimalStartLength),
if (!decimal.TryParse(comment.Data.Substring(toDecimalStartLength),
NumberStyles.Number,
CultureInfo.InvariantCulture,
out var version))
@@ -90,6 +91,61 @@
return result;
}
/// <summary>
/// Searches the raw bytes from <paramref name="startPosition"/> onwards for a <c>%PDF-x.y</c> or
/// <c>%FDF-x.y</c> header, without relying on tokenization.
/// </summary>
/// <param name="startPosition">Offset to start searching from; the input is seeked back here before returning.</param>
/// <param name="inputBytes">The raw document bytes. A null value is treated as "no header found".</param>
/// <param name="headerVersion">
/// On success, the parsed version, the version text without the leading <c>%</c>, and the offset of the
/// character following the <c>%</c>. Null on failure.
/// </param>
/// <returns><c>true</c> if a header with a parseable version number was found, otherwise <c>false</c>.</returns>
private static bool TryBruteForceVersionLocation(long startPosition, IInputBytes inputBytes, out HeaderVersion headerVersion)
{
    headerVersion = null;

    if (inputBytes == null)
    {
        return false;
    }

    inputBytes.Seek(startPosition);

    // %PDF-x.y or %FDF-x.y
    const int versionLength = 8;
    const int bufferLength = 64;

    // Slide a window of bufferLength bytes across the file allowing for the fact the version could get split by
    // the window (so always ensure an overlap of versionLength bytes between the end of the previous and start of the next buffer).
    var buffer = new byte[bufferLength];
    var currentOffset = startPosition;

    while (true)
    {
        var readLength = inputBytes.Read(buffer, bufferLength);
        if (readLength <= 0)
        {
            break;
        }

        var content = OtherEncodings.BytesAsLatin1String(buffer);
        if (readLength < content.Length)
        {
            // A short read leaves bytes from the previous window in the tail of the buffer; drop them so
            // stale data can never contribute to a match. Latin-1 maps one byte to one character, so the
            // character count equals the byte count.
            content = content.Substring(0, readLength);
        }

        var pdfIndex = content.IndexOf("%PDF-", StringComparison.OrdinalIgnoreCase);
        var fdfIndex = content.IndexOf("%FDF-", StringComparison.OrdinalIgnoreCase);
        var actualIndex = pdfIndex >= 0 ? pdfIndex : fdfIndex;

        // Only accept a match whose full "%PDF-x.y" text lies inside this window; a match cut off by the
        // window end is picked up again by the next, overlapping, window.
        if (actualIndex >= 0 && content.Length - actualIndex >= versionLength)
        {
            var numberPart = content.Substring(actualIndex + 5, 3);
            if (decimal.TryParse(
                    numberPart,
                    NumberStyles.Number,
                    CultureInfo.InvariantCulture,
                    out var version))
            {
                var afterCommentSymbolIndex = actualIndex + 1;

                headerVersion = new HeaderVersion(
                    version,
                    content.Substring(afterCommentSymbolIndex, versionLength - 1),
                    currentOffset + afterCommentSymbolIndex);

                inputBytes.Seek(startPosition);
                return true;
            }
        }

        if (readLength < bufferLength)
        {
            // Short read: this was the final window. Stop here rather than computing the next offset,
            // which could be negative when fewer than versionLength bytes were read.
            break;
        }

        currentOffset += bufferLength - versionLength;
        inputBytes.Seek(currentOffset);
    }

    // Leave the input where we found it on failure as well as on success.
    inputBytes.Seek(startPosition);
    return false;
}
private static HeaderVersion HandleMissingVersion(CommentToken comment, bool isLenientParsing, ILog log)
{
if (isLenientParsing)

View File

@@ -97,7 +97,7 @@
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser);
var version = FileHeaderParser.Parse(scanner, isLenientParsing, log);
var version = FileHeaderParser.Parse(scanner, inputBytes, isLenientParsing, log);
var crossReferenceOffset = FileTrailerParser.GetFirstCrossReferenceOffset(inputBytes, scanner,
isLenientParsing) + version.OffsetInFile;