Allow lenient parsing in DictionaryTokenizer and fix #791

This commit is contained in:
BobLd 2024-03-10 14:02:39 +00:00
parent 250362e015
commit acfe8b5fdd
8 changed files with 80 additions and 17 deletions

View File

@ -1,5 +1,6 @@
namespace UglyToad.PdfPig.Tests.Integration namespace UglyToad.PdfPig.Tests.Integration
{ {
using PdfPig.Core;
using PdfPig.Tokens; using PdfPig.Tokens;
using Xunit; using Xunit;
@ -33,5 +34,53 @@
Assert.Equal("Another Property Value", ((StringToken)valueToken2).Data); Assert.Equal("Another Property Value", ((StringToken)valueToken2).Data);
} }
} }
[Fact]
public void CanReadInvalidDocumentInformation()
{
    // The document information dictionary of the test file contains stray text
    // ("Collaborative Neural Rendering ...") at a position where a name key is expected:
    /*
        <<
        /Producer (pdfTeX-1.40.21)
        Collaborative Neural Rendering Using Anime Character Sheets /Author()/Title()/Subject()/Creator(LaTeX with hyperref)/Keywords()
        /CreationDate (D:20230418010134Z)
        /ModDate (D:20230418010134Z)
        /Trapped /False
        /PTEX.Fullbanner (This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2)
        >>
    */
    var documentPath = IntegrationHelpers.GetSpecificTestDocumentPath("invalid-pdf-structure-pdfminer-entire-doc.pdf");

    // Lenient parsing on: the document opens and the well-formed entries can be read.
    using (var pdf = PdfDocument.Open(documentPath))
    {
        var info = pdf.Information;

        Assert.Equal("LaTeX with hyperref", info.Creator);
        Assert.Equal("", info.Keywords);
        Assert.Equal("pdfTeX-1.40.21", info.Producer);
        Assert.Equal("", info.Subject);
        Assert.Equal("", info.Title);
        Assert.Equal("", info.Author);
        Assert.Equal("D:20230418010134Z", info.CreationDate);
        Assert.Equal("D:20230418010134Z", info.ModifiedDate);

        var rawDictionary = info.DocumentInformationDictionary;

        // Entries without a dedicated property are checked through the raw dictionary.
        var trappedKey = NameToken.Create("Trapped");
        Assert.True(rawDictionary.TryGet(trappedKey, out var trappedValue));
        var trappedName = Assert.IsType<NameToken>(trappedValue);
        Assert.Equal("False", trappedName.Data);

        var bannerKey = NameToken.Create("PTEX.Fullbanner");
        Assert.True(rawDictionary.TryGet(bannerKey, out var bannerValue));
        var bannerString = Assert.IsType<StringToken>(bannerValue);
        Assert.Equal("This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2", bannerString.Data);
    }

    // Lenient parsing off: opening the same file must fail on the first non-name key.
    var exception = Assert.Throws<PdfDocumentFormatException>(() => PdfDocument.Open(documentPath, ParsingOptions.LenientParsingOff));
    Assert.Equal("Expected name as dictionary key, instead got: Collaborative", exception.Message);
}
} }
} }

View File

@ -499,7 +499,7 @@ endobj";
var input = StringBytesTestConverter.Convert(s, false); var input = StringBytesTestConverter.Convert(s, false);
return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(), return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
new TestFilterProvider(), NoOpEncryptionHandler.Instance); new TestFilterProvider(), NoOpEncryptionHandler.Instance, ParsingOptions.LenientParsingOff);
} }
private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner) private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)

View File

@ -9,6 +9,7 @@
{ {
private readonly bool usePdfDocEncoding; private readonly bool usePdfDocEncoding;
private readonly IReadOnlyList<NameToken> requiredKeys; private readonly IReadOnlyList<NameToken> requiredKeys;
private readonly bool useLenientParsing;
public bool ReadsNextByte { get; } = false; public bool ReadsNextByte { get; } = false;
@ -22,10 +23,12 @@
/// Can be provided to recover from errors with missing dictionary end symbols if the /// Can be provided to recover from errors with missing dictionary end symbols if the
/// set of keys expected in the dictionary are known. /// set of keys expected in the dictionary are known.
/// </param> /// </param>
public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null) /// <param name="useLenientParsing">Whether to use lenient parsing.</param>
public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null, bool useLenientParsing = false)
{ {
this.usePdfDocEncoding = usePdfDocEncoding; this.usePdfDocEncoding = usePdfDocEncoding;
this.requiredKeys = requiredKeys; this.requiredKeys = requiredKeys;
this.useLenientParsing = useLenientParsing;
} }
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
@ -80,7 +83,7 @@
return false; return false;
} }
var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary); var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary, useLenientParsing: useLenientParsing);
var tokens = new List<IToken>(); var tokens = new List<IToken>();
@ -96,7 +99,7 @@
// Has enough key/values for each required key // Has enough key/values for each required key
if (useRequiredKeys && tokens.Count >= requiredKeys.Count * 2) if (useRequiredKeys && tokens.Count >= requiredKeys.Count * 2)
{ {
var proposedDictionary = ConvertToDictionary(tokens); var proposedDictionary = ConvertToDictionary(tokens, useLenientParsing);
var isAcceptable = true; var isAcceptable = true;
foreach (var key in requiredKeys) foreach (var key in requiredKeys)
@ -118,15 +121,14 @@
} }
} }
var dictionary = ConvertToDictionary(tokens); var dictionary = ConvertToDictionary(tokens, useLenientParsing);
token = new DictionaryToken(dictionary); token = new DictionaryToken(dictionary);
return true; return true;
} }
private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens) private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens, bool useLenientParsing)
{ {
var result = new Dictionary<NameToken, IToken>(); var result = new Dictionary<NameToken, IToken>();
@ -143,6 +145,13 @@
continue; continue;
} }
if (useLenientParsing)
{
// TODO - Log warning
System.Diagnostics.Debug.WriteLine($"Expected name as dictionary key, instead got: " + token);
continue;
}
throw new PdfDocumentFormatException($"Expected name as dictionary key, instead got: " + token); throw new PdfDocumentFormatException($"Expected name as dictionary key, instead got: " + token);
} }

View File

@ -27,7 +27,8 @@
private readonly IInputBytes inputBytes; private readonly IInputBytes inputBytes;
private readonly bool usePdfDocEncoding; private readonly bool usePdfDocEncoding;
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>(); private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
private readonly bool useLenientParsing;
/// <summary> /// <summary>
/// The offset in the input data at which the <see cref="CurrentToken"/> starts. /// The offset in the input data at which the <see cref="CurrentToken"/> starts.
/// </summary> /// </summary>
@ -52,15 +53,17 @@
IInputBytes inputBytes, IInputBytes inputBytes,
bool usePdfDocEncoding, bool usePdfDocEncoding,
ScannerScope scope = ScannerScope.None, ScannerScope scope = ScannerScope.None,
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null) IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null,
bool useLenientParsing = false)
{ {
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes)); this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
this.usePdfDocEncoding = usePdfDocEncoding; this.usePdfDocEncoding = usePdfDocEncoding;
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding); this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding); this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding);
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding); this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, useLenientParsing: useLenientParsing);
this.scope = scope; this.scope = scope;
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys; this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
this.useLenientParsing = useLenientParsing;
} }
/// <inheritdoc /> /// <inheritdoc />
@ -140,7 +143,7 @@
&& CurrentToken is NameToken name && CurrentToken is NameToken name
&& namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys)) && namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys))
{ {
tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys); tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys, useLenientParsing);
} }
} }
else else

View File

@ -28,7 +28,7 @@
IInputBytes inputBytes, IInputBytes inputBytes,
ILog log) ILog log)
{ {
var scanner = new CoreTokenScanner(inputBytes, false); var scanner = new CoreTokenScanner(inputBytes, false, useLenientParsing: useLenientParsing);
var precedingTokens = new List<IToken>(); var precedingTokens = new List<IToken>();
var graphicsStateOperations = new List<IGraphicsStateOperation>(); var graphicsStateOperations = new List<IGraphicsStateOperation>();

View File

@ -75,7 +75,7 @@
SkipMissingFonts = false SkipMissingFonts = false
}; };
var tokenScanner = new CoreTokenScanner(inputBytes, true); var tokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: options.UseLenientParsing);
var passwords = new List<string>(); var passwords = new List<string>();
@ -115,7 +115,7 @@
// We're ok with this since our intent is to lazily load the cross reference table. // We're ok with this since our intent is to lazily load the cross reference table.
// ReSharper disable once AccessToModifiedClosure // ReSharper disable once AccessToModifiedClosure
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes); var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance); var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance, parsingOptions);
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider); var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
var crossReferenceParser = new CrossReferenceParser(parsingOptions.Logger, xrefValidator, crossReferenceStreamParser); var crossReferenceParser = new CrossReferenceParser(parsingOptions.Logger, xrefValidator, crossReferenceStreamParser);

View File

@ -25,6 +25,7 @@
private readonly IObjectLocationProvider objectLocationProvider; private readonly IObjectLocationProvider objectLocationProvider;
private readonly ILookupFilterProvider filterProvider; private readonly ILookupFilterProvider filterProvider;
private readonly CoreTokenScanner coreTokenScanner; private readonly CoreTokenScanner coreTokenScanner;
private readonly ParsingOptions parsingOptions;
private IEncryptionHandler encryptionHandler; private IEncryptionHandler encryptionHandler;
private bool isDisposed; private bool isDisposed;
@ -52,13 +53,14 @@
public long Length => coreTokenScanner.Length; public long Length => coreTokenScanner.Length;
public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider, ILookupFilterProvider filterProvider, public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider, ILookupFilterProvider filterProvider,
IEncryptionHandler encryptionHandler) IEncryptionHandler encryptionHandler, ParsingOptions parsingOptions)
{ {
this.inputBytes = inputBytes; this.inputBytes = inputBytes;
this.objectLocationProvider = objectLocationProvider; this.objectLocationProvider = objectLocationProvider;
this.filterProvider = filterProvider; this.filterProvider = filterProvider;
this.encryptionHandler = encryptionHandler; this.encryptionHandler = encryptionHandler;
coreTokenScanner = new CoreTokenScanner(inputBytes, true); this.parsingOptions = parsingOptions;
coreTokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
} }
public void UpdateEncryptionHandler(IEncryptionHandler newHandler) public void UpdateEncryptionHandler(IEncryptionHandler newHandler)
@ -815,7 +817,7 @@
// Read the N integers // Read the N integers
var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider, this)); var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider, this));
var scanner = new CoreTokenScanner(bytes, true); var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
var objects = new List<Tuple<long, long>>(); var objects = new List<Tuple<long, long>>();