mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-11-28 09:28:25 +08:00
#443 handle the case where tokenization swallows the file version comment into a string token
in that case, fall back to a brute-force search of the raw content for the version header
This commit is contained in:
@@ -16,7 +16,7 @@
|
|||||||
|
|
||||||
var builder = new CharacterMapBuilder();
|
var builder = new CharacterMapBuilder();
|
||||||
|
|
||||||
parser.Parse(new NumericToken(1), input, builder);
|
parser.Parse(new NumericToken(1), input.scanner, builder);
|
||||||
|
|
||||||
Assert.Equal(2, builder.BaseFontCharacterMap.Count);
|
Assert.Equal(2, builder.BaseFontCharacterMap.Count);
|
||||||
|
|
||||||
@@ -31,7 +31,7 @@
|
|||||||
|
|
||||||
var builder = new CharacterMapBuilder();
|
var builder = new CharacterMapBuilder();
|
||||||
|
|
||||||
parser.Parse(new NumericToken(1), input, builder);
|
parser.Parse(new NumericToken(1), input.scanner, builder);
|
||||||
|
|
||||||
Assert.Equal(7, builder.BaseFontCharacterMap.Count);
|
Assert.Equal(7, builder.BaseFontCharacterMap.Count);
|
||||||
|
|
||||||
@@ -47,7 +47,7 @@
|
|||||||
|
|
||||||
var builder = new CharacterMapBuilder();
|
var builder = new CharacterMapBuilder();
|
||||||
|
|
||||||
parser.Parse(new NumericToken(2), input, builder);
|
parser.Parse(new NumericToken(2), input.scanner, builder);
|
||||||
|
|
||||||
Assert.Equal(6, builder.BaseFontCharacterMap.Count);
|
Assert.Equal(6, builder.BaseFontCharacterMap.Count);
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,8 @@
|
|||||||
using PdfPig.Core;
|
using PdfPig.Core;
|
||||||
using PdfPig.Parser.FileStructure;
|
using PdfPig.Parser.FileStructure;
|
||||||
using PdfPig.Tokenization.Scanner;
|
using PdfPig.Tokenization.Scanner;
|
||||||
|
using PdfPig.Tokens;
|
||||||
|
using System.Linq;
|
||||||
using Xunit;
|
using Xunit;
|
||||||
|
|
||||||
public class FileHeaderParserTests
|
public class FileHeaderParserTests
|
||||||
@@ -13,7 +15,7 @@
|
|||||||
[Fact]
|
[Fact]
|
||||||
public void NullScannerThrows()
|
public void NullScannerThrows()
|
||||||
{
|
{
|
||||||
Action action = () => FileHeaderParser.Parse(null, false, log);
|
Action action = () => FileHeaderParser.Parse(null, null, false, log);
|
||||||
|
|
||||||
Assert.Throws<ArgumentNullException>(action);
|
Assert.Throws<ArgumentNullException>(action);
|
||||||
}
|
}
|
||||||
@@ -31,7 +33,7 @@
|
|||||||
|
|
||||||
var scanner = StringBytesTestConverter.Scanner(input);
|
var scanner = StringBytesTestConverter.Scanner(input);
|
||||||
|
|
||||||
var result = FileHeaderParser.Parse(scanner, false, log);
|
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||||
|
|
||||||
Assert.Equal(format, result.VersionString);
|
Assert.Equal(format, result.VersionString);
|
||||||
Assert.Equal(0, result.OffsetInFile);
|
Assert.Equal(0, result.OffsetInFile);
|
||||||
@@ -46,7 +48,7 @@
|
|||||||
|
|
||||||
var scanner = StringBytesTestConverter.Scanner(input);
|
var scanner = StringBytesTestConverter.Scanner(input);
|
||||||
|
|
||||||
var result = FileHeaderParser.Parse(scanner, false, log);
|
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||||
|
|
||||||
Assert.Equal(1.2m, result.Version);
|
Assert.Equal(1.2m, result.Version);
|
||||||
Assert.Equal(TestEnvironment.IsUnixPlatform ? 7 : 9, result.OffsetInFile);
|
Assert.Equal(TestEnvironment.IsUnixPlatform ? 7 : 9, result.OffsetInFile);
|
||||||
@@ -57,7 +59,7 @@
|
|||||||
{
|
{
|
||||||
var scanner = StringBytesTestConverter.Scanner(string.Empty);
|
var scanner = StringBytesTestConverter.Scanner(string.Empty);
|
||||||
|
|
||||||
Action action = () => FileHeaderParser.Parse(scanner, false, log);
|
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||||
|
|
||||||
Assert.Throws<PdfDocumentFormatException>(action);
|
Assert.Throws<PdfDocumentFormatException>(action);
|
||||||
}
|
}
|
||||||
@@ -68,7 +70,7 @@
|
|||||||
var scanner = StringBytesTestConverter.Scanner(@"one
|
var scanner = StringBytesTestConverter.Scanner(@"one
|
||||||
%PDF-1.2");
|
%PDF-1.2");
|
||||||
|
|
||||||
var result = FileHeaderParser.Parse(scanner, false, log);
|
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||||
|
|
||||||
Assert.Equal(1.2m, result.Version);
|
Assert.Equal(1.2m, result.Version);
|
||||||
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
|
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
|
||||||
@@ -80,7 +82,7 @@
|
|||||||
var scanner = StringBytesTestConverter.Scanner(@"one
|
var scanner = StringBytesTestConverter.Scanner(@"one
|
||||||
%PDF-1.7");
|
%PDF-1.7");
|
||||||
|
|
||||||
var result = FileHeaderParser.Parse(scanner, true, log);
|
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||||
|
|
||||||
Assert.Equal(1.7m, result.Version);
|
Assert.Equal(1.7m, result.Version);
|
||||||
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
|
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
|
||||||
@@ -92,7 +94,7 @@
|
|||||||
var scanner = StringBytesTestConverter.Scanner(@"one two
|
var scanner = StringBytesTestConverter.Scanner(@"one two
|
||||||
three %PDF-1.6");
|
three %PDF-1.6");
|
||||||
|
|
||||||
var result = FileHeaderParser.Parse(scanner, true, log);
|
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||||
|
|
||||||
Assert.Equal(1.6m, result.Version);
|
Assert.Equal(1.6m, result.Version);
|
||||||
Assert.Equal(TestEnvironment.IsUnixPlatform ? 14 : 15, result.OffsetInFile);
|
Assert.Equal(TestEnvironment.IsUnixPlatform ? 14 : 15, result.OffsetInFile);
|
||||||
@@ -103,7 +105,7 @@ three %PDF-1.6");
|
|||||||
{
|
{
|
||||||
var scanner = StringBytesTestConverter.Scanner(@"one two");
|
var scanner = StringBytesTestConverter.Scanner(@"one two");
|
||||||
|
|
||||||
Action action = () => FileHeaderParser.Parse(scanner, true, log);
|
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||||
|
|
||||||
Assert.Throws<PdfDocumentFormatException>(action);
|
Assert.Throws<PdfDocumentFormatException>(action);
|
||||||
}
|
}
|
||||||
@@ -113,7 +115,7 @@ three %PDF-1.6");
|
|||||||
{
|
{
|
||||||
var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
|
var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
|
||||||
|
|
||||||
Action action = () => FileHeaderParser.Parse(scanner, false, log);
|
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||||
|
|
||||||
Assert.Throws<PdfDocumentFormatException>(action);
|
Assert.Throws<PdfDocumentFormatException>(action);
|
||||||
}
|
}
|
||||||
@@ -123,7 +125,7 @@ three %PDF-1.6");
|
|||||||
{
|
{
|
||||||
var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
|
var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
|
||||||
|
|
||||||
var result = FileHeaderParser.Parse(scanner, true, log);
|
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||||
|
|
||||||
Assert.Equal(1.4m, result.Version);
|
Assert.Equal(1.4m, result.Version);
|
||||||
}
|
}
|
||||||
@@ -133,9 +135,9 @@ three %PDF-1.6");
|
|||||||
{
|
{
|
||||||
var scanner = StringBytesTestConverter.Scanner(@"%FDF-1.6");
|
var scanner = StringBytesTestConverter.Scanner(@"%FDF-1.6");
|
||||||
|
|
||||||
var result = FileHeaderParser.Parse(scanner, false, log);
|
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||||
|
|
||||||
Assert.Equal(0, scanner.CurrentPosition);
|
Assert.Equal(0, scanner.scanner.CurrentPosition);
|
||||||
Assert.Equal(0, result.OffsetInFile);
|
Assert.Equal(0, result.OffsetInFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -144,11 +146,33 @@ three %PDF-1.6");
|
|||||||
{
|
{
|
||||||
var input = OtherEncodings.StringAsLatin1Bytes("%PDF-1.7\r\n%âãÏÓ\r\n1 0 obj\r\n<</Lang(en-US)>>\r\nendobj");
|
var input = OtherEncodings.StringAsLatin1Bytes("%PDF-1.7\r\n%âãÏÓ\r\n1 0 obj\r\n<</Lang(en-US)>>\r\nendobj");
|
||||||
|
|
||||||
var scanner = new CoreTokenScanner(new ByteArrayInputBytes(input), ScannerScope.None);
|
var bytes = new ByteArrayInputBytes(input);
|
||||||
|
|
||||||
var result = FileHeaderParser.Parse(scanner, false, log);
|
var scanner = new CoreTokenScanner(bytes, ScannerScope.None);
|
||||||
|
|
||||||
|
var result = FileHeaderParser.Parse(scanner, bytes, false, log);
|
||||||
|
|
||||||
Assert.Equal(1.7m, result.Version);
|
Assert.Equal(1.7m, result.Version);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Regression test for issue 443: the input has a block of leading bytes before the
/// "%PDF-1.1" header (bytes 25 50 44 46 2D 31 2E 31 in the dump below). Per the change
/// description, tokenization absorbs the version comment into a string for this input,
/// so the parser is expected to locate the header from the raw bytes instead.
/// </summary>
[Fact]
public void Issue443()
{
    const string hex =
        @"00 0F 4A 43 42 31 33 36 36 31 32 32 37 2E 70 64 66 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 50 44 46 20 43 41 52 4F 01 00 FF FF FF FF 00 00 00 00 00 04 DF 28 00 00 00 00 AF 51 7E 82 AF 52 D7 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 81 81 03 0D 00 00 25 50 44 46 2D 31 2E 31 0A 25 E2 E3 CF D3 0D 0A 31 20 30 20 6F 62 6A";

    // Decode each two-character hex pair into one byte, materialized once.
    var rawBytes = hex
        .Split(' ', StringSplitOptions.RemoveEmptyEntries)
        .Select(pair => HexToken.Convert(pair[0], pair[1]))
        .ToArray();

    var text = OtherEncodings.BytesAsLatin1String(rawBytes);

    var (tokenScanner, inputBytes) = StringBytesTestConverter.Scanner(text);

    var header = FileHeaderParser.Parse(tokenScanner, inputBytes, false, log);

    // The scanner must be left where it started; the header is reported just after the '%'.
    Assert.Equal(0, tokenScanner.CurrentPosition);
    Assert.Equal(129, header.OffsetInFile);
    Assert.Equal(1.1m, header.Version);
    Assert.Equal("PDF-1.1", header.VersionString);
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -25,7 +25,7 @@
|
|||||||
trailer
|
trailer
|
||||||
<< >>");
|
<< >>");
|
||||||
|
|
||||||
var result = CrossReferenceTableParser.Parse(input, 4, false);
|
var result = CrossReferenceTableParser.Parse(input.scanner, 4, false);
|
||||||
|
|
||||||
Assert.Equal(4, result.ObjectOffsets.Count);
|
Assert.Equal(4, result.ObjectOffsets.Count);
|
||||||
}
|
}
|
||||||
@@ -288,7 +288,7 @@ trailer
|
|||||||
|
|
||||||
private static CoreTokenScanner GetReader(string input)
|
private static CoreTokenScanner GetReader(string input)
|
||||||
{
|
{
|
||||||
return StringBytesTestConverter.Scanner(input);
|
return StringBytesTestConverter.Scanner(input).scanner;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,11 +31,12 @@
|
|||||||
public IInputBytes Bytes { get; set; }
|
public IInputBytes Bytes { get; set; }
|
||||||
}
|
}
|
||||||
|
|
||||||
internal static CoreTokenScanner Scanner(string s)
|
internal static (CoreTokenScanner scanner, IInputBytes bytes) Scanner(string s)
|
||||||
{
|
{
|
||||||
var result = new CoreTokenScanner(new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s)));
|
var inputBytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s));
|
||||||
|
var result = new CoreTokenScanner(inputBytes);
|
||||||
|
|
||||||
return result;
|
return (result, inputBytes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,40 +29,41 @@
|
|||||||
internal static class FileHeaderParser
|
internal static class FileHeaderParser
|
||||||
{
|
{
|
||||||
[NotNull]
|
[NotNull]
|
||||||
public static HeaderVersion Parse([NotNull]ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
|
public static HeaderVersion Parse([NotNull] ISeekableTokenScanner scanner, IInputBytes inputBytes, bool isLenientParsing, ILog log)
|
||||||
{
|
{
|
||||||
if (scanner == null)
|
if (scanner == null)
|
||||||
{
|
{
|
||||||
throw new ArgumentNullException(nameof(scanner));
|
throw new ArgumentNullException(nameof(scanner));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read the first token
|
var startPosition = scanner.CurrentPosition;
|
||||||
if (!scanner.MoveNext())
|
|
||||||
{
|
|
||||||
throw new PdfDocumentFormatException($"Could not read the first token in the document at position {scanner.CurrentPosition}.");
|
|
||||||
}
|
|
||||||
|
|
||||||
var comment = scanner.CurrentToken as CommentToken;
|
const int junkTokensTolerance = 30;
|
||||||
|
|
||||||
const int junkTokensTolerance = 25;
|
|
||||||
var attempts = 0;
|
var attempts = 0;
|
||||||
while (comment == null)
|
CommentToken comment;
|
||||||
|
do
|
||||||
{
|
{
|
||||||
if (attempts == junkTokensTolerance)
|
if (attempts == junkTokensTolerance || !scanner.MoveNext())
|
||||||
{
|
{
|
||||||
throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
|
if (!TryBruteForceVersionLocation(startPosition, inputBytes, out var version))
|
||||||
}
|
{
|
||||||
|
throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
|
||||||
|
}
|
||||||
|
|
||||||
if (!scanner.MoveNext())
|
scanner.Seek(startPosition);
|
||||||
{
|
return version;
|
||||||
throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
comment = scanner.CurrentToken as CommentToken;
|
comment = scanner.CurrentToken as CommentToken;
|
||||||
|
|
||||||
attempts++;
|
attempts++;
|
||||||
}
|
} while (comment == null);
|
||||||
|
|
||||||
|
return GetHeaderVersionAndResetScanner(comment, scanner, isLenientParsing, log);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static HeaderVersion GetHeaderVersionAndResetScanner(CommentToken comment, ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
|
||||||
|
{
|
||||||
if (comment.Data.IndexOf("PDF-1.", StringComparison.OrdinalIgnoreCase) != 0 && comment.Data.IndexOf("FDF-1.", StringComparison.OrdinalIgnoreCase) != 0)
|
if (comment.Data.IndexOf("PDF-1.", StringComparison.OrdinalIgnoreCase) != 0 && comment.Data.IndexOf("FDF-1.", StringComparison.OrdinalIgnoreCase) != 0)
|
||||||
{
|
{
|
||||||
return HandleMissingVersion(comment, isLenientParsing, log);
|
return HandleMissingVersion(comment, isLenientParsing, log);
|
||||||
@@ -70,7 +71,7 @@
|
|||||||
|
|
||||||
const int toDecimalStartLength = 4;
|
const int toDecimalStartLength = 4;
|
||||||
|
|
||||||
if (!decimal.TryParse(comment.Data.Substring(toDecimalStartLength),
|
if (!decimal.TryParse(comment.Data.Substring(toDecimalStartLength),
|
||||||
NumberStyles.Number,
|
NumberStyles.Number,
|
||||||
CultureInfo.InvariantCulture,
|
CultureInfo.InvariantCulture,
|
||||||
out var version))
|
out var version))
|
||||||
@@ -90,6 +91,61 @@
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Fallback used when the tokenizer could not produce the version comment: scans the raw
/// bytes from <paramref name="startPosition"/> for a "%PDF-x.y" or "%FDF-x.y" marker.
/// </summary>
/// <param name="startPosition">Offset to begin searching from; the input is returned to this offset before the method exits.</param>
/// <param name="inputBytes">The raw document bytes. A null value is treated as "not found".</param>
/// <param name="headerVersion">
/// On success, the parsed version, the version text without the leading '%', and the
/// offset of the character following the '%'. Null on failure.
/// </param>
/// <returns><c>true</c> if a marker with a parsable version number was found; otherwise <c>false</c>.</returns>
private static bool TryBruteForceVersionLocation(long startPosition, IInputBytes inputBytes, out HeaderVersion headerVersion)
{
    headerVersion = null;

    if (inputBytes == null)
    {
        return false;
    }

    // %PDF-x.y or %FDF-x.y
    const int versionLength = 8;
    // Length of the "%PDF-" / "%FDF-" prefix preceding the version number.
    const int markerLength = 5;
    const int bufferLength = 64;

    // Slide a window of bufferLength bytes across the file allowing for the fact the version could get split by
    // the window (so always ensure an overlap of versionLength bytes between the end of the previous and start of the next buffer).
    var buffer = new byte[bufferLength];
    var currentOffset = startPosition;

    inputBytes.Seek(startPosition);

    while (true)
    {
        var readLength = inputBytes.Read(buffer, bufferLength);

        if (readLength >= versionLength)
        {
            // Only the first readLength bytes belong to this window; anything after them is left over
            // from the previous read and must not take part in matching.
            // Assumes the Latin-1 conversion yields exactly one character per byte.
            var content = OtherEncodings.BytesAsLatin1String(buffer).Substring(0, readLength);

            // Try every candidate position in order so that an unparsable marker does not hide a
            // valid one later in the same window.
            for (var index = 0; index <= content.Length - versionLength; index++)
            {
                if (content[index] != '%')
                {
                    continue;
                }

                var isMarker =
                    string.Compare(content, index, "%PDF-", 0, markerLength, StringComparison.OrdinalIgnoreCase) == 0
                    || string.Compare(content, index, "%FDF-", 0, markerLength, StringComparison.OrdinalIgnoreCase) == 0;

                if (!isMarker)
                {
                    continue;
                }

                var numberPart = content.Substring(index + markerLength, versionLength - markerLength);

                if (!decimal.TryParse(
                        numberPart,
                        NumberStyles.Number,
                        CultureInfo.InvariantCulture,
                        out var version))
                {
                    continue;
                }

                var afterCommentSymbolIndex = index + 1;

                headerVersion = new HeaderVersion(
                    version,
                    content.Substring(afterCommentSymbolIndex, versionLength - 1),
                    currentOffset + afterCommentSymbolIndex);

                inputBytes.Seek(startPosition);

                return true;
            }
        }

        // A short read means the end of the input was reached; stop before computing the next
        // window so the offset can never move backwards past the start position.
        if (readLength < bufferLength)
        {
            break;
        }

        currentOffset += bufferLength - versionLength;
        inputBytes.Seek(currentOffset);
    }

    inputBytes.Seek(startPosition);

    return false;
}
|
||||||
|
|
||||||
private static HeaderVersion HandleMissingVersion(CommentToken comment, bool isLenientParsing, ILog log)
|
private static HeaderVersion HandleMissingVersion(CommentToken comment, bool isLenientParsing, ILog log)
|
||||||
{
|
{
|
||||||
if (isLenientParsing)
|
if (isLenientParsing)
|
||||||
|
|||||||
@@ -97,7 +97,7 @@
|
|||||||
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
|
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
|
||||||
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser);
|
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser);
|
||||||
|
|
||||||
var version = FileHeaderParser.Parse(scanner, isLenientParsing, log);
|
var version = FileHeaderParser.Parse(scanner, inputBytes, isLenientParsing, log);
|
||||||
|
|
||||||
var crossReferenceOffset = FileTrailerParser.GetFirstCrossReferenceOffset(inputBytes, scanner,
|
var crossReferenceOffset = FileTrailerParser.GetFirstCrossReferenceOffset(inputBytes, scanner,
|
||||||
isLenientParsing) + version.OffsetInFile;
|
isLenientParsing) + version.OffsetInFile;
|
||||||
|
|||||||
Reference in New Issue
Block a user