mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-11-24 16:53:20 +08:00
add intelligent error recovery for known dictionaries #511
If we're parsing a known dictionary (e.g. all keys are required and there are no additional optional keys) and parsing by looking for the dictionary end token fails with a format exception, we provide the possibility to recover. Recovery works by assuming a dictionary end token once all required keys have been consumed.
This commit is contained in:
@@ -41,6 +41,20 @@ CMapName currentdict /CMap defineresource pop
|
||||
end
|
||||
end";
|
||||
|
||||
// CMap fixture in which the /CIDSystemInfo dictionary is opened with "<<" but never closed with ">>".
private const string CmapMissingDictionaryEndToken = @"
|
||||
/CIDInit /ProcSet findresource
|
||||
begin 12 dict
|
||||
begin begincmap
|
||||
/CIDSystemInfo <<
|
||||
/Registry (F2+0) /Ordering (F2) /Supplement 0
|
||||
/CMapName /F2+0 def
|
||||
/CMapType 2 def
|
||||
1 begincodespacerange <020D> <020D> endcodespacerange
|
||||
1 beginbfchar
|
||||
<020D> <03A9>
|
||||
endcmap CMapName currentdict /CMap defineresource pop end end
|
||||
endbfchar";
|
||||
|
||||
private readonly CMapParser cMapParser = new CMapParser();
|
||||
|
||||
[Fact]
|
||||
@@ -58,6 +72,18 @@ end";
|
||||
Assert.Equal(2, cmap.Type);
|
||||
}
|
||||
|
||||
[Fact]
public void CanParseCidSystemInfoAndOtherInformationWhenMissingDictionaryClose()
{
    // Fixture: a CMap whose CIDSystemInfo dictionary has no closing token.
    var bytes = StringBytesTestConverter.Convert(CmapMissingDictionaryEndToken, false).Bytes;

    var parsed = cMapParser.Parse(bytes);

    // The three CIDSystemInfo entries must still be read from the unterminated dictionary.
    Assert.Equal("F2+0", parsed.Info.Registry);
    Assert.Equal("F2", parsed.Info.Ordering);
    Assert.Equal(0, parsed.Info.Supplement);
}
|
||||
|
||||
[Fact]
|
||||
public void CanParseCodespaceRange()
|
||||
{
|
||||
|
||||
@@ -7,9 +7,45 @@
|
||||
|
||||
internal class DictionaryTokenizer : ITokenizer
|
||||
{
|
||||
private readonly IReadOnlyList<NameToken> requiredKeys;
|
||||
|
||||
public bool ReadsNextByte { get; } = false;
|
||||
|
||||
/// <summary>
/// Initializes a <see cref="DictionaryTokenizer"/>, optionally with the keys the dictionary is known to contain.
/// </summary>
/// <param name="requiredKeys">
/// The set of keys expected in the dictionary, if known. When supplied, the tokenizer can recover from
/// a missing dictionary end symbol; when <see langword="null"/>, no such recovery is attempted.
/// </param>
public DictionaryTokenizer(IReadOnlyList<NameToken> requiredKeys = null)
    => this.requiredKeys = requiredKeys;
|
||||
|
||||
/// <summary>
/// Tokenizes a dictionary, first by looking for the dictionary end and, if that fails with a
/// <see cref="PdfDocumentFormatException"/> and required keys were supplied, a second time
/// using the required keys to infer where the dictionary ends.
/// </summary>
/// <param name="currentByte">The byte the scanner is currently positioned on.</param>
/// <param name="inputBytes">The input to read from; rewound to its entry offset before the second attempt.</param>
/// <param name="token">The token produced by whichever attempt completed.</param>
/// <returns>The result of whichever internal tokenization attempt completed.</returns>
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
    // Remember the entry position so the recovery pass can replay the same bytes.
    var entryOffset = inputBytes.CurrentOffset;

    try
    {
        return TryTokenizeInternal(currentByte, inputBytes, false, out token);
    }
    catch (PdfDocumentFormatException) when (requiredKeys != null)
    {
        // Only reached when an inferred end can be attempted; without required keys
        // the filter does not match and the exception propagates to the caller.
    }

    inputBytes.Seek(entryOffset);

    return TryTokenizeInternal(currentByte, inputBytes, true, out token);
}
|
||||
|
||||
private bool TryTokenizeInternal(byte currentByte, IInputBytes inputBytes, bool useRequiredKeys, out IToken token)
|
||||
{
|
||||
token = null;
|
||||
|
||||
@@ -51,6 +87,30 @@
|
||||
}
|
||||
|
||||
tokens.Add(coreScanner.CurrentToken);
|
||||
|
||||
// Has enough key/values for each required key
|
||||
if (useRequiredKeys && tokens.Count >= requiredKeys.Count * 2)
|
||||
{
|
||||
var proposedDictionary = ConvertToDictionary(tokens);
|
||||
|
||||
var isAcceptable = true;
|
||||
foreach (var key in requiredKeys)
|
||||
{
|
||||
if (!proposedDictionary.TryGetValue(key, out var tok) || tok == null)
|
||||
{
|
||||
isAcceptable = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If each required key has a value and we're here because parsing broke previously then return
|
||||
// this dictionary.
|
||||
if (isAcceptable)
|
||||
{
|
||||
token = new DictionaryToken(proposedDictionary);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var dictionary = ConvertToDictionary(tokens);
|
||||
@@ -58,6 +118,7 @@
|
||||
token = new DictionaryToken(dictionary);
|
||||
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens)
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
private readonly StringTokenizer StringTokenizer = new StringTokenizer();
|
||||
|
||||
private readonly ScannerScope scope;
|
||||
private readonly IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys;
|
||||
private readonly IInputBytes inputBytes;
|
||||
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
|
||||
|
||||
@@ -46,10 +47,14 @@
|
||||
/// <summary>
/// Create a new <see cref="CoreTokenScanner"/> from the input.
/// </summary>
/// <param name="inputBytes">The bytes to scan. Must not be <see langword="null"/>.</param>
/// <param name="scope">The scope stored on the scanner. Defaults to <see cref="ScannerScope.None"/>.</param>
/// <param name="namedDictionaryRequiredKeys">
/// Optional lookup from a name to the keys required in the dictionary that follows that name.
/// When a dictionary starts directly after a matching name token, a <see cref="DictionaryTokenizer"/>
/// is created with those required keys so a missing dictionary end can be recovered from.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="inputBytes"/> is <see langword="null"/>.</exception>
public CoreTokenScanner(
    IInputBytes inputBytes,
    ScannerScope scope = ScannerScope.None,
    IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null)
{
    this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
    this.scope = scope;
    this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
}
|
||||
|
||||
/// <inheritdoc />
|
||||
@@ -124,6 +129,13 @@
|
||||
{
|
||||
isSkippingSymbol = true;
|
||||
tokenizer = DictionaryTokenizer;
|
||||
|
||||
if (namedDictionaryRequiredKeys != null
|
||||
&& CurrentToken is NameToken name
|
||||
&& namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys))
|
||||
{
|
||||
tokenizer = new DictionaryTokenizer(requiredKeys);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
using Cmap;
|
||||
using Core;
|
||||
using Parts;
|
||||
using System.Collections.Generic;
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
|
||||
@@ -20,7 +21,11 @@
|
||||
|
||||
public CMap Parse(IInputBytes inputBytes)
|
||||
{
|
||||
var scanner = new CoreTokenScanner(inputBytes);
|
||||
var scanner = new CoreTokenScanner(inputBytes,
|
||||
namedDictionaryRequiredKeys: new Dictionary<NameToken, IReadOnlyList<NameToken>>
|
||||
{
|
||||
{ NameToken.CidSystemInfo, new[] { NameToken.Registry, NameToken.Ordering, NameToken.Supplement } }
|
||||
});
|
||||
|
||||
var builder = new CharacterMapBuilder();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user