Allow lenient parsing in DictionaryTokenizer and fix #791

This commit is contained in:
BobLd 2024-03-10 14:02:39 +00:00
parent 250362e015
commit acfe8b5fdd
8 changed files with 80 additions and 17 deletions

View File

@ -1,5 +1,6 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using PdfPig.Core;
using PdfPig.Tokens;
using Xunit;
@ -33,5 +34,53 @@
Assert.Equal("Another Property Value", ((StringToken)valueToken2).Data);
}
}
[Fact]
public void CanReadInvalidDocumentInformation()
{
    var path = IntegrationHelpers.GetSpecificTestDocumentPath("invalid-pdf-structure-pdfminer-entire-doc.pdf");

    /*
        The document's Info dictionary is malformed: a stray run of plain text
        appears between entries, so a bare word shows up where a name key is expected.
        <<
        /Producer (pdfTeX-1.40.21)
        Collaborative Neural Rendering Using Anime Character Sheets /Author()/Title()/Subject()/Creator(LaTeX with hyperref)/Keywords()
        /CreationDate (D:20230418010134Z)
        /ModDate (D:20230418010134Z)
        /Trapped /False
        /PTEX.Fullbanner (This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2)
        >>
    */

    // With lenient parsing (the default) the invalid key is skipped and the
    // remaining entries are still recovered.
    using (var pdf = PdfDocument.Open(path))
    {
        var info = pdf.Information;

        Assert.Equal("LaTeX with hyperref", info.Creator);
        Assert.Equal("", info.Keywords);
        Assert.Equal("pdfTeX-1.40.21", info.Producer);
        Assert.Equal("", info.Subject);
        Assert.Equal("", info.Title);
        Assert.Equal("", info.Author);
        Assert.Equal("D:20230418010134Z", info.CreationDate);
        Assert.Equal("D:20230418010134Z", info.ModifiedDate);

        var dict = info.DocumentInformationDictionary;

        // /Trapped survives as a name token.
        Assert.True(dict.TryGet(NameToken.Create("Trapped"), out var trappedToken));
        var trapped = Assert.IsType<NameToken>(trappedToken);
        Assert.Equal("False", trapped.Data);

        // /PTEX.Fullbanner survives as a string token.
        Assert.True(dict.TryGet(NameToken.Create("PTEX.Fullbanner"), out var bannerToken));
        var banner = Assert.IsType<StringToken>(bannerToken);
        Assert.Equal("This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2", banner.Data);
    }

    // With lenient parsing disabled the malformed key is a hard error.
    var ex = Assert.Throws<PdfDocumentFormatException>(() => PdfDocument.Open(path, ParsingOptions.LenientParsingOff));
    Assert.Equal("Expected name as dictionary key, instead got: Collaborative", ex.Message);
}
}
}

View File

@ -499,7 +499,7 @@ endobj";
var input = StringBytesTestConverter.Convert(s, false);
return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
new TestFilterProvider(), NoOpEncryptionHandler.Instance);
new TestFilterProvider(), NoOpEncryptionHandler.Instance, ParsingOptions.LenientParsingOff);
}
private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)

View File

@ -9,6 +9,7 @@
{
private readonly bool usePdfDocEncoding;
private readonly IReadOnlyList<NameToken> requiredKeys;
private readonly bool useLenientParsing;
public bool ReadsNextByte { get; } = false;
@ -22,10 +23,12 @@
/// Can be provided to recover from errors with missing dictionary end symbols if the
/// set of keys expected in the dictionary are known.
/// </param>
public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null)
/// <param name="useLenientParsing">Whether to use lenient parsing.</param>
public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null, bool useLenientParsing = false)
{
// Whether string tokens inside the dictionary are decoded using PdfDocEncoding.
this.usePdfDocEncoding = usePdfDocEncoding;
// Optional recovery hint: keys expected in the dictionary, used to detect a
// complete dictionary when the end symbol is missing. May be null.
this.requiredKeys = requiredKeys;
// When true, an invalid (non-name) dictionary key is skipped instead of
// raising a PdfDocumentFormatException.
this.useLenientParsing = useLenientParsing;
}
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
@ -80,7 +83,7 @@
return false;
}
var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary);
var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary, useLenientParsing: useLenientParsing);
var tokens = new List<IToken>();
@ -96,7 +99,7 @@
// Has enough key/values for each required key
if (useRequiredKeys && tokens.Count >= requiredKeys.Count * 2)
{
var proposedDictionary = ConvertToDictionary(tokens);
var proposedDictionary = ConvertToDictionary(tokens, useLenientParsing);
var isAcceptable = true;
foreach (var key in requiredKeys)
@ -118,15 +121,14 @@
}
}
var dictionary = ConvertToDictionary(tokens);
var dictionary = ConvertToDictionary(tokens, useLenientParsing);
token = new DictionaryToken(dictionary);
return true;
}
private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens)
private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens, bool useLenientParsing)
{
var result = new Dictionary<NameToken, IToken>();
@ -143,6 +145,13 @@
continue;
}
if (useLenientParsing)
{
// TODO - Log warning
System.Diagnostics.Debug.WriteLine($"Expected name as dictionary key, instead got: " + token);
continue;
}
throw new PdfDocumentFormatException($"Expected name as dictionary key, instead got: " + token);
}

View File

@ -27,6 +27,7 @@
private readonly IInputBytes inputBytes;
private readonly bool usePdfDocEncoding;
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
private readonly bool useLenientParsing;
/// <summary>
/// The offset in the input data at which the <see cref="CurrentToken"/> starts.
@ -52,15 +53,17 @@
IInputBytes inputBytes,
bool usePdfDocEncoding,
ScannerScope scope = ScannerScope.None,
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null)
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null,
bool useLenientParsing = false)
{
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
this.usePdfDocEncoding = usePdfDocEncoding;
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding);
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding);
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, useLenientParsing: useLenientParsing);
this.scope = scope;
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
this.useLenientParsing = useLenientParsing;
}
/// <inheritdoc />
@ -140,7 +143,7 @@
&& CurrentToken is NameToken name
&& namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys))
{
tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys);
tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys, useLenientParsing);
}
}
else

View File

@ -28,7 +28,7 @@
IInputBytes inputBytes,
ILog log)
{
var scanner = new CoreTokenScanner(inputBytes, false);
var scanner = new CoreTokenScanner(inputBytes, false, useLenientParsing: useLenientParsing);
var precedingTokens = new List<IToken>();
var graphicsStateOperations = new List<IGraphicsStateOperation>();

View File

@ -75,7 +75,7 @@
SkipMissingFonts = false
};
var tokenScanner = new CoreTokenScanner(inputBytes, true);
var tokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: options.UseLenientParsing);
var passwords = new List<string>();
@ -115,7 +115,7 @@
// We're ok with this since our intent is to lazily load the cross reference table.
// ReSharper disable once AccessToModifiedClosure
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance, parsingOptions);
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
var crossReferenceParser = new CrossReferenceParser(parsingOptions.Logger, xrefValidator, crossReferenceStreamParser);

View File

@ -25,6 +25,7 @@
private readonly IObjectLocationProvider objectLocationProvider;
private readonly ILookupFilterProvider filterProvider;
private readonly CoreTokenScanner coreTokenScanner;
private readonly ParsingOptions parsingOptions;
private IEncryptionHandler encryptionHandler;
private bool isDisposed;
@ -52,13 +53,14 @@
public long Length => coreTokenScanner.Length;
public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider, ILookupFilterProvider filterProvider,
IEncryptionHandler encryptionHandler)
IEncryptionHandler encryptionHandler, ParsingOptions parsingOptions)
{
this.inputBytes = inputBytes;
this.objectLocationProvider = objectLocationProvider;
this.filterProvider = filterProvider;
this.encryptionHandler = encryptionHandler;
coreTokenScanner = new CoreTokenScanner(inputBytes, true);
this.parsingOptions = parsingOptions;
coreTokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
}
public void UpdateEncryptionHandler(IEncryptionHandler newHandler)
@ -815,7 +817,7 @@
// Read the N integers
var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider, this));
var scanner = new CoreTokenScanner(bytes, true);
var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
var objects = new List<Tuple<long, long>>();