mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-07-16 23:19:43 +08:00
Allow lenient parsing in DictionaryTokenizer and fix #791
This commit is contained in:
parent
250362e015
commit
acfe8b5fdd
@ -1,5 +1,6 @@
|
|||||||
namespace UglyToad.PdfPig.Tests.Integration
|
namespace UglyToad.PdfPig.Tests.Integration
|
||||||
{
|
{
|
||||||
|
using PdfPig.Core;
|
||||||
using PdfPig.Tokens;
|
using PdfPig.Tokens;
|
||||||
using Xunit;
|
using Xunit;
|
||||||
|
|
||||||
@ -33,5 +34,53 @@
|
|||||||
Assert.Equal("Another Property Value", ((StringToken)valueToken2).Data);
|
Assert.Equal("Another Property Value", ((StringToken)valueToken2).Data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void CanReadInvalidDocumentInformation()
|
||||||
|
{
|
||||||
|
var path = IntegrationHelpers.GetSpecificTestDocumentPath("invalid-pdf-structure-pdfminer-entire-doc.pdf");
|
||||||
|
|
||||||
|
/*
|
||||||
|
<<
|
||||||
|
/Producer (pdfTeX-1.40.21)
|
||||||
|
Collaborative Neural Rendering Using Anime Character Sheets /Author()/Title()/Subject()/Creator(LaTeX with hyperref)/Keywords()
|
||||||
|
/CreationDate (D:20230418010134Z)
|
||||||
|
/ModDate (D:20230418010134Z)
|
||||||
|
/Trapped /False
|
||||||
|
/PTEX.Fullbanner (This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2)
|
||||||
|
>>
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Lenient Parsing On -> can process
|
||||||
|
using (var document = PdfDocument.Open(path))
|
||||||
|
{
|
||||||
|
var information = document.Information;
|
||||||
|
|
||||||
|
Assert.Equal("LaTeX with hyperref", information.Creator);
|
||||||
|
Assert.Equal("", information.Keywords);
|
||||||
|
Assert.Equal("pdfTeX-1.40.21", information.Producer);
|
||||||
|
Assert.Equal("", information.Subject);
|
||||||
|
Assert.Equal("", information.Title);
|
||||||
|
Assert.Equal("", information.Author);
|
||||||
|
Assert.Equal("D:20230418010134Z", information.CreationDate);
|
||||||
|
Assert.Equal("D:20230418010134Z", information.ModifiedDate);
|
||||||
|
|
||||||
|
var infoDictionary = information.DocumentInformationDictionary;
|
||||||
|
|
||||||
|
var nameToken = NameToken.Create("Trapped");
|
||||||
|
Assert.True(infoDictionary.TryGet(nameToken, out var valueToken));
|
||||||
|
Assert.IsType<NameToken>(valueToken);
|
||||||
|
Assert.Equal("False", ((NameToken)valueToken).Data);
|
||||||
|
|
||||||
|
nameToken = NameToken.Create("PTEX.Fullbanner");
|
||||||
|
Assert.True(infoDictionary.TryGet(nameToken, out var valueToken2));
|
||||||
|
Assert.IsType<StringToken>(valueToken2);
|
||||||
|
Assert.Equal("This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2", ((StringToken)valueToken2).Data);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lenient Parsing Off -> throws
|
||||||
|
var ex = Assert.Throws<PdfDocumentFormatException>(() => PdfDocument.Open(path, ParsingOptions.LenientParsingOff));
|
||||||
|
Assert.Equal("Expected name as dictionary key, instead got: Collaborative", ex.Message);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Binary file not shown.
@ -499,7 +499,7 @@ endobj";
|
|||||||
var input = StringBytesTestConverter.Convert(s, false);
|
var input = StringBytesTestConverter.Convert(s, false);
|
||||||
|
|
||||||
return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
|
return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
|
||||||
new TestFilterProvider(), NoOpEncryptionHandler.Instance);
|
new TestFilterProvider(), NoOpEncryptionHandler.Instance, ParsingOptions.LenientParsingOff);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)
|
private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
{
|
{
|
||||||
private readonly bool usePdfDocEncoding;
|
private readonly bool usePdfDocEncoding;
|
||||||
private readonly IReadOnlyList<NameToken> requiredKeys;
|
private readonly IReadOnlyList<NameToken> requiredKeys;
|
||||||
|
private readonly bool useLenientParsing;
|
||||||
|
|
||||||
public bool ReadsNextByte { get; } = false;
|
public bool ReadsNextByte { get; } = false;
|
||||||
|
|
||||||
@ -22,10 +23,12 @@
|
|||||||
/// Can be provided to recover from errors with missing dictionary end symbols if the
|
/// Can be provided to recover from errors with missing dictionary end symbols if the
|
||||||
/// set of keys expected in the dictionary are known.
|
/// set of keys expected in the dictionary are known.
|
||||||
/// </param>
|
/// </param>
|
||||||
public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null)
|
/// <param name="useLenientParsing">Whether to use lenient parsing.</param>
|
||||||
|
public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null, bool useLenientParsing = false)
|
||||||
{
|
{
|
||||||
this.usePdfDocEncoding = usePdfDocEncoding;
|
this.usePdfDocEncoding = usePdfDocEncoding;
|
||||||
this.requiredKeys = requiredKeys;
|
this.requiredKeys = requiredKeys;
|
||||||
|
this.useLenientParsing = useLenientParsing;
|
||||||
}
|
}
|
||||||
|
|
||||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
||||||
@ -80,7 +83,7 @@
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary);
|
var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary, useLenientParsing: useLenientParsing);
|
||||||
|
|
||||||
var tokens = new List<IToken>();
|
var tokens = new List<IToken>();
|
||||||
|
|
||||||
@ -96,7 +99,7 @@
|
|||||||
// Has enough key/values for each required key
|
// Has enough key/values for each required key
|
||||||
if (useRequiredKeys && tokens.Count >= requiredKeys.Count * 2)
|
if (useRequiredKeys && tokens.Count >= requiredKeys.Count * 2)
|
||||||
{
|
{
|
||||||
var proposedDictionary = ConvertToDictionary(tokens);
|
var proposedDictionary = ConvertToDictionary(tokens, useLenientParsing);
|
||||||
|
|
||||||
var isAcceptable = true;
|
var isAcceptable = true;
|
||||||
foreach (var key in requiredKeys)
|
foreach (var key in requiredKeys)
|
||||||
@ -118,15 +121,14 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var dictionary = ConvertToDictionary(tokens);
|
var dictionary = ConvertToDictionary(tokens, useLenientParsing);
|
||||||
|
|
||||||
token = new DictionaryToken(dictionary);
|
token = new DictionaryToken(dictionary);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens)
|
private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens, bool useLenientParsing)
|
||||||
{
|
{
|
||||||
var result = new Dictionary<NameToken, IToken>();
|
var result = new Dictionary<NameToken, IToken>();
|
||||||
|
|
||||||
@ -143,6 +145,13 @@
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (useLenientParsing)
|
||||||
|
{
|
||||||
|
// TODO - Log warning
|
||||||
|
System.Diagnostics.Debug.WriteLine($"Expected name as dictionary key, instead got: " + token);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
throw new PdfDocumentFormatException($"Expected name as dictionary key, instead got: " + token);
|
throw new PdfDocumentFormatException($"Expected name as dictionary key, instead got: " + token);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -27,7 +27,8 @@
|
|||||||
private readonly IInputBytes inputBytes;
|
private readonly IInputBytes inputBytes;
|
||||||
private readonly bool usePdfDocEncoding;
|
private readonly bool usePdfDocEncoding;
|
||||||
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
|
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
|
||||||
|
private readonly bool useLenientParsing;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The offset in the input data at which the <see cref="CurrentToken"/> starts.
|
/// The offset in the input data at which the <see cref="CurrentToken"/> starts.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@ -52,15 +53,17 @@
|
|||||||
IInputBytes inputBytes,
|
IInputBytes inputBytes,
|
||||||
bool usePdfDocEncoding,
|
bool usePdfDocEncoding,
|
||||||
ScannerScope scope = ScannerScope.None,
|
ScannerScope scope = ScannerScope.None,
|
||||||
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null)
|
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null,
|
||||||
|
bool useLenientParsing = false)
|
||||||
{
|
{
|
||||||
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
|
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
|
||||||
this.usePdfDocEncoding = usePdfDocEncoding;
|
this.usePdfDocEncoding = usePdfDocEncoding;
|
||||||
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
|
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
|
||||||
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding);
|
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding);
|
||||||
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding);
|
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, useLenientParsing: useLenientParsing);
|
||||||
this.scope = scope;
|
this.scope = scope;
|
||||||
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
|
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
|
||||||
|
this.useLenientParsing = useLenientParsing;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
@ -140,7 +143,7 @@
|
|||||||
&& CurrentToken is NameToken name
|
&& CurrentToken is NameToken name
|
||||||
&& namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys))
|
&& namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys))
|
||||||
{
|
{
|
||||||
tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys);
|
tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys, useLenientParsing);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -28,7 +28,7 @@
|
|||||||
IInputBytes inputBytes,
|
IInputBytes inputBytes,
|
||||||
ILog log)
|
ILog log)
|
||||||
{
|
{
|
||||||
var scanner = new CoreTokenScanner(inputBytes, false);
|
var scanner = new CoreTokenScanner(inputBytes, false, useLenientParsing: useLenientParsing);
|
||||||
|
|
||||||
var precedingTokens = new List<IToken>();
|
var precedingTokens = new List<IToken>();
|
||||||
var graphicsStateOperations = new List<IGraphicsStateOperation>();
|
var graphicsStateOperations = new List<IGraphicsStateOperation>();
|
||||||
|
@ -75,7 +75,7 @@
|
|||||||
SkipMissingFonts = false
|
SkipMissingFonts = false
|
||||||
};
|
};
|
||||||
|
|
||||||
var tokenScanner = new CoreTokenScanner(inputBytes, true);
|
var tokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: options.UseLenientParsing);
|
||||||
|
|
||||||
var passwords = new List<string>();
|
var passwords = new List<string>();
|
||||||
|
|
||||||
@ -115,7 +115,7 @@
|
|||||||
// We're ok with this since our intent is to lazily load the cross reference table.
|
// We're ok with this since our intent is to lazily load the cross reference table.
|
||||||
// ReSharper disable once AccessToModifiedClosure
|
// ReSharper disable once AccessToModifiedClosure
|
||||||
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes);
|
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes);
|
||||||
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance);
|
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance, parsingOptions);
|
||||||
|
|
||||||
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
|
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
|
||||||
var crossReferenceParser = new CrossReferenceParser(parsingOptions.Logger, xrefValidator, crossReferenceStreamParser);
|
var crossReferenceParser = new CrossReferenceParser(parsingOptions.Logger, xrefValidator, crossReferenceStreamParser);
|
||||||
|
@ -25,6 +25,7 @@
|
|||||||
private readonly IObjectLocationProvider objectLocationProvider;
|
private readonly IObjectLocationProvider objectLocationProvider;
|
||||||
private readonly ILookupFilterProvider filterProvider;
|
private readonly ILookupFilterProvider filterProvider;
|
||||||
private readonly CoreTokenScanner coreTokenScanner;
|
private readonly CoreTokenScanner coreTokenScanner;
|
||||||
|
private readonly ParsingOptions parsingOptions;
|
||||||
|
|
||||||
private IEncryptionHandler encryptionHandler;
|
private IEncryptionHandler encryptionHandler;
|
||||||
private bool isDisposed;
|
private bool isDisposed;
|
||||||
@ -52,13 +53,14 @@
|
|||||||
public long Length => coreTokenScanner.Length;
|
public long Length => coreTokenScanner.Length;
|
||||||
|
|
||||||
public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider, ILookupFilterProvider filterProvider,
|
public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider, ILookupFilterProvider filterProvider,
|
||||||
IEncryptionHandler encryptionHandler)
|
IEncryptionHandler encryptionHandler, ParsingOptions parsingOptions)
|
||||||
{
|
{
|
||||||
this.inputBytes = inputBytes;
|
this.inputBytes = inputBytes;
|
||||||
this.objectLocationProvider = objectLocationProvider;
|
this.objectLocationProvider = objectLocationProvider;
|
||||||
this.filterProvider = filterProvider;
|
this.filterProvider = filterProvider;
|
||||||
this.encryptionHandler = encryptionHandler;
|
this.encryptionHandler = encryptionHandler;
|
||||||
coreTokenScanner = new CoreTokenScanner(inputBytes, true);
|
this.parsingOptions = parsingOptions;
|
||||||
|
coreTokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void UpdateEncryptionHandler(IEncryptionHandler newHandler)
|
public void UpdateEncryptionHandler(IEncryptionHandler newHandler)
|
||||||
@ -815,7 +817,7 @@
|
|||||||
// Read the N integers
|
// Read the N integers
|
||||||
var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider, this));
|
var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider, this));
|
||||||
|
|
||||||
var scanner = new CoreTokenScanner(bytes, true);
|
var scanner = new CoreTokenScanner(bytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
|
||||||
|
|
||||||
var objects = new List<Tuple<long, long>>();
|
var objects = new List<Tuple<long, long>>();
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user