mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-07-15 13:57:56 +08:00
Allow lenient parsing in DictionaryTokenizer and fix #791
This commit is contained in:
parent
250362e015
commit
acfe8b5fdd
@ -1,5 +1,6 @@
|
||||
namespace UglyToad.PdfPig.Tests.Integration
|
||||
{
|
||||
using PdfPig.Core;
|
||||
using PdfPig.Tokens;
|
||||
using Xunit;
|
||||
|
||||
@ -33,5 +34,53 @@
|
||||
Assert.Equal("Another Property Value", ((StringToken)valueToken2).Data);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
public void CanReadInvalidDocumentInformation()
{
    // The information dictionary of the test document looks like this. Note the second
    // line, which begins with plain text where a name key is expected:
    //
    // <<
    // /Producer (pdfTeX-1.40.21)
    // Collaborative Neural Rendering Using Anime Character Sheets /Author()/Title()/Subject()/Creator(LaTeX with hyperref)/Keywords()
    // /CreationDate (D:20230418010134Z)
    // /ModDate (D:20230418010134Z)
    // /Trapped /False
    // /PTEX.Fullbanner (This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2)
    // >>
    var documentPath = IntegrationHelpers.GetSpecificTestDocumentPath("invalid-pdf-structure-pdfminer-entire-doc.pdf");

    // Opened without explicit options: the test expects lenient parsing here, so the
    // document must load and the well-formed entries must still be readable.
    using (var pdf = PdfDocument.Open(documentPath))
    {
        var info = pdf.Information;

        Assert.Equal("LaTeX with hyperref", info.Creator);
        Assert.Equal(string.Empty, info.Keywords);
        Assert.Equal("pdfTeX-1.40.21", info.Producer);
        Assert.Equal(string.Empty, info.Subject);
        Assert.Equal(string.Empty, info.Title);
        Assert.Equal(string.Empty, info.Author);
        Assert.Equal("D:20230418010134Z", info.CreationDate);
        Assert.Equal("D:20230418010134Z", info.ModifiedDate);

        var rawDictionary = info.DocumentInformationDictionary;

        // Entries without a dedicated property are checked through the raw dictionary.
        Assert.True(rawDictionary.TryGet(NameToken.Create("Trapped"), out var trappedToken));
        var trapped = Assert.IsType<NameToken>(trappedToken);
        Assert.Equal("False", trapped.Data);

        Assert.True(rawDictionary.TryGet(NameToken.Create("PTEX.Fullbanner"), out var bannerToken));
        var banner = Assert.IsType<StringToken>(bannerToken);
        Assert.Equal("This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2", banner.Data);
    }

    // With lenient parsing switched off, opening the same file must fail on the stray text.
    var strictFailure = Assert.Throws<PdfDocumentFormatException>(
        () => PdfDocument.Open(documentPath, ParsingOptions.LenientParsingOff));

    Assert.Equal("Expected name as dictionary key, instead got: Collaborative", strictFailure.Message);
}
|
||||
}
|
||||
}
|
||||
|
Binary file not shown.
@ -499,7 +499,7 @@ endobj";
|
||||
var input = StringBytesTestConverter.Convert(s, false);
|
||||
|
||||
return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
|
||||
new TestFilterProvider(), NoOpEncryptionHandler.Instance);
|
||||
new TestFilterProvider(), NoOpEncryptionHandler.Instance, ParsingOptions.LenientParsingOff);
|
||||
}
|
||||
|
||||
private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)
|
||||
|
@ -9,6 +9,7 @@
|
||||
{
|
||||
private readonly bool usePdfDocEncoding;
|
||||
private readonly IReadOnlyList<NameToken> requiredKeys;
|
||||
private readonly bool useLenientParsing;
|
||||
|
||||
public bool ReadsNextByte { get; } = false;
|
||||
|
||||
@ -22,10 +23,12 @@
|
||||
/// Can be provided to recover from errors with missing dictionary end symbols if the
|
||||
/// set of keys expected in the dictionary are known.
|
||||
/// </param>
|
||||
public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null)
|
||||
/// <param name="useLenientParsing">Whether to use lenient parsing.</param>
|
||||
public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null, bool useLenientParsing = false)
|
||||
{
|
||||
this.usePdfDocEncoding = usePdfDocEncoding;
|
||||
this.requiredKeys = requiredKeys;
|
||||
this.useLenientParsing = useLenientParsing;
|
||||
}
|
||||
|
||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
||||
@ -80,7 +83,7 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary);
|
||||
var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary, useLenientParsing: useLenientParsing);
|
||||
|
||||
var tokens = new List<IToken>();
|
||||
|
||||
@ -96,7 +99,7 @@
|
||||
// Has enough key/values for each required key
|
||||
if (useRequiredKeys && tokens.Count >= requiredKeys.Count * 2)
|
||||
{
|
||||
var proposedDictionary = ConvertToDictionary(tokens);
|
||||
var proposedDictionary = ConvertToDictionary(tokens, useLenientParsing);
|
||||
|
||||
var isAcceptable = true;
|
||||
foreach (var key in requiredKeys)
|
||||
@ -118,15 +121,14 @@
|
||||
}
|
||||
}
|
||||
|
||||
var dictionary = ConvertToDictionary(tokens);
|
||||
var dictionary = ConvertToDictionary(tokens, useLenientParsing);
|
||||
|
||||
token = new DictionaryToken(dictionary);
|
||||
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens)
|
||||
private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens, bool useLenientParsing)
|
||||
{
|
||||
var result = new Dictionary<NameToken, IToken>();
|
||||
|
||||
@ -143,6 +145,13 @@
|
||||
continue;
|
||||
}
|
||||
|
||||
if (useLenientParsing)
|
||||
{
|
||||
// TODO - Log warning
|
||||
System.Diagnostics.Debug.WriteLine($"Expected name as dictionary key, instead got: " + token);
|
||||
continue;
|
||||
}
|
||||
|
||||
throw new PdfDocumentFormatException($"Expected name as dictionary key, instead got: " + token);
|
||||
}
|
||||
|
||||
|
@ -27,7 +27,8 @@
|
||||
private readonly IInputBytes inputBytes;
|
||||
private readonly bool usePdfDocEncoding;
|
||||
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
|
||||
|
||||
private readonly bool useLenientParsing;
|
||||
|
||||
/// <summary>
|
||||
/// The offset in the input data at which the <see cref="CurrentToken"/> starts.
|
||||
/// </summary>
|
||||
@ -52,15 +53,17 @@
|
||||
IInputBytes inputBytes,
|
||||
bool usePdfDocEncoding,
|
||||
ScannerScope scope = ScannerScope.None,
|
||||
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null)
|
||||
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null,
|
||||
bool useLenientParsing = false)
|
||||
{
|
||||
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
|
||||
this.usePdfDocEncoding = usePdfDocEncoding;
|
||||
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
|
||||
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding);
|
||||
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding);
|
||||
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, useLenientParsing: useLenientParsing);
|
||||
this.scope = scope;
|
||||
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
|
||||
this.useLenientParsing = useLenientParsing;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
@ -140,7 +143,7 @@
|
||||
&& CurrentToken is NameToken name
|
||||
&& namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys))
|
||||
{
|
||||
tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys);
|
||||
tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys, useLenientParsing);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -28,7 +28,7 @@
|
||||
IInputBytes inputBytes,
|
||||
ILog log)
|
||||
{
|
||||
var scanner = new CoreTokenScanner(inputBytes, false);
|
||||
var scanner = new CoreTokenScanner(inputBytes, false, useLenientParsing: useLenientParsing);
|
||||
|
||||
var precedingTokens = new List<IToken>();
|
||||
var graphicsStateOperations = new List<IGraphicsStateOperation>();
|
||||
|
@ -75,7 +75,7 @@
|
||||
SkipMissingFonts = false
|
||||
};
|
||||
|
||||
var tokenScanner = new CoreTokenScanner(inputBytes, true);
|
||||
var tokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: options.UseLenientParsing);
|
||||
|
||||
var passwords = new List<string>();
|
||||
|
||||
@ -115,7 +115,7 @@
|
||||
// We're ok with this since our intent is to lazily load the cross reference table.
|
||||
// ReSharper disable once AccessToModifiedClosure
|
||||
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes);
|
||||
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance);
|
||||
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance, parsingOptions);
|
||||
|
||||
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
|
||||
var crossReferenceParser = new CrossReferenceParser(parsingOptions.Logger, xrefValidator, crossReferenceStreamParser);
|
||||
|
@ -25,6 +25,7 @@
|
||||
private readonly IObjectLocationProvider objectLocationProvider;
|
||||
private readonly ILookupFilterProvider filterProvider;
|
||||
private readonly CoreTokenScanner coreTokenScanner;
|
||||
private readonly ParsingOptions parsingOptions;
|
||||
|
||||
private IEncryptionHandler encryptionHandler;
|
||||
private bool isDisposed;
|
||||
@ -52,13 +53,14 @@
|
||||
public long Length => coreTokenScanner.Length;
|
||||
|
||||
public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider, ILookupFilterProvider filterProvider,
|
||||
IEncryptionHandler encryptionHandler)
|
||||
IEncryptionHandler encryptionHandler, ParsingOptions parsingOptions)
|
||||
{
|
||||
this.inputBytes = inputBytes;
|
||||
this.objectLocationProvider = objectLocationProvider;
|
||||
this.filterProvider = filterProvider;
|
||||
this.encryptionHandler = encryptionHandler;
|
||||
coreTokenScanner = new CoreTokenScanner(inputBytes, true);
|
||||
this.parsingOptions = parsingOptions;
|
||||
coreTokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
|
||||
}
|
||||
|
||||
public void UpdateEncryptionHandler(IEncryptionHandler newHandler)
|
||||
@ -815,7 +817,7 @@
|
||||
// Read the N integers
|
||||
var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider, this));
|
||||
|
||||
var scanner = new CoreTokenScanner(bytes, true);
|
||||
var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
|
||||
|
||||
var objects = new List<Tuple<long, long>>();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user