add intelligent error recovery for known dictionaries #511

If we're parsing a known dictionary (e.g. one where all keys are required
and there are no additional optional keys) and we encounter an error, we
provide the possibility to recover by assuming a dictionary end token once
all required tokens have been consumed, in the case where parsing by looking
for the dictionary end token failed due to a format exception.
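
For context, callers opt in by telling the scanner which keys a named dictionary must contain. A minimal sketch of that wiring (the same registration the CMapParser change at the end of this diff makes; inputBytes stands for any IInputBytes over the CMap data):

    // Register the full key set for /CIDSystemInfo so the dictionary tokenizer can
    // stop once Registry, Ordering and Supplement have all been read, even though
    // the closing ">>" never appears in the input.
    var scanner = new CoreTokenScanner(
        inputBytes,
        namedDictionaryRequiredKeys: new Dictionary<NameToken, IReadOnlyList<NameToken>>
        {
            { NameToken.CidSystemInfo, new[] { NameToken.Registry, NameToken.Ordering, NameToken.Supplement } }
        });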
Author: Eliot Jones
Date: 2023-05-21 14:58:39 +01:00
parent 903218854c
commit fc2f7b9325
4 changed files with 107 additions and 3 deletions


@@ -41,6 +41,20 @@ CMapName currentdict /CMap defineresource pop
end
end";
private const string CmapMissingDictionaryEndToken = @"
/CIDInit /ProcSet findresource
begin 12 dict
begin begincmap
/CIDSystemInfo <<
/Registry (F2+0) /Ordering (F2) /Supplement 0
/CMapName /F2+0 def
/CMapType 2 def
1 begincodespacerange <020D> <020D> endcodespacerange
1 beginbfchar
<020D> <03A9>
endcmap CMapName currentdict /CMap defineresource pop end end
endbfchar";
private readonly CMapParser cMapParser = new CMapParser();
[Fact]
@@ -58,6 +72,18 @@ end";
Assert.Equal(2, cmap.Type);
}
[Fact]
public void CanParseCidSystemInfoAndOtherInformationWhenMissingDictionaryClose()
{
var input = StringBytesTestConverter.Convert(CmapMissingDictionaryEndToken, false);
var cmap = cMapParser.Parse(input.Bytes);
Assert.Equal("F2+0", cmap.Info.Registry);
Assert.Equal("F2", cmap.Info.Ordering);
Assert.Equal(0, cmap.Info.Supplement);
}
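
Before this change the unterminated /CIDSystemInfo dictionary in the input above made parsing fail with a format exception; with the required keys registered the same bytes now produce the Registry, Ordering and Supplement values asserted here.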
[Fact]
public void CanParseCodespaceRange()
{


@@ -7,9 +7,45 @@
internal class DictionaryTokenizer : ITokenizer
{
private readonly IReadOnlyList<NameToken> requiredKeys;
public bool ReadsNextByte { get; } = false;
/// <summary>
/// Create a new <see cref="DictionaryTokenizer"/>.
/// </summary>
/// <param name="requiredKeys">
/// Can be provided to recover from errors with missing dictionary end symbols if the
/// set of keys expected in the dictionary are known.
/// </param>
public DictionaryTokenizer(IReadOnlyList<NameToken> requiredKeys = null)
{
this.requiredKeys = requiredKeys;
}
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
var start = inputBytes.CurrentOffset;
try
{
return TryTokenizeInternal(currentByte, inputBytes, false, out token);
}
catch (PdfDocumentFormatException)
{
// Cannot attempt inferred end.
if (requiredKeys == null)
{
throw;
}
}
inputBytes.Seek(start);
return TryTokenizeInternal(currentByte, inputBytes, true, out token);
}
private bool TryTokenizeInternal(byte currentByte, IInputBytes inputBytes, bool useRequiredKeys, out IToken token)
{
token = null;
@@ -51,6 +87,30 @@
}
tokens.Add(coreScanner.CurrentToken);
// Has enough key/values for each required key
if (useRequiredKeys && tokens.Count >= requiredKeys.Count * 2)
{
var proposedDictionary = ConvertToDictionary(tokens);
var isAcceptable = true;
foreach (var key in requiredKeys)
{
if (!proposedDictionary.TryGetValue(key, out var tok) || tok == null)
{
isAcceptable = false;
break;
}
}
// If each required key has a value and we're here because parsing broke previously then return
// this dictionary.
if (isAcceptable)
{
token = new DictionaryToken(proposedDictionary);
return true;
}
}
}
var dictionary = ConvertToDictionary(tokens);
@@ -58,6 +118,7 @@
token = new DictionaryToken(dictionary);
return true;
}
private static Dictionary<NameToken, IToken> ConvertToDictionary(List<IToken> tokens)
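
A usage sketch of the recovery path in isolation (hypothetical caller code: input stands for an IInputBytes positioned on the first '<' of the unterminated dictionary, the rest of the API is as added above):

    // Only dictionaries whose complete key set is known can recover this way.
    var tokenizer = new DictionaryTokenizer(new[]
    {
        NameToken.Registry, NameToken.Ordering, NameToken.Supplement
    });

    // The first pass parses normally, looking for the ">>" end token. If that pass
    // throws a PdfDocumentFormatException the tokenizer seeks back to the start and
    // retries, this time returning as soon as every required key has a non-null value.
    if (tokenizer.TryTokenize((byte)'<', input, out var token))
    {
        var dictionary = (DictionaryToken)token;
        // Registry, Ordering and Supplement are present despite the missing ">>".
    }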


@@ -23,6 +23,7 @@
private readonly StringTokenizer StringTokenizer = new StringTokenizer();
private readonly ScannerScope scope;
private readonly IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys;
private readonly IInputBytes inputBytes;
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
@@ -46,10 +47,14 @@
/// <summary>
/// Create a new <see cref="CoreTokenScanner"/> from the input.
/// </summary>
public CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None)
public CoreTokenScanner(
IInputBytes inputBytes,
ScannerScope scope = ScannerScope.None,
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null)
{
this.scope = scope;
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
this.scope = scope;
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
}
/// <inheritdoc />
@@ -124,6 +129,13 @@
{
isSkippingSymbol = true;
tokenizer = DictionaryTokenizer;
if (namedDictionaryRequiredKeys != null
&& CurrentToken is NameToken name
&& namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys))
{
tokenizer = new DictionaryTokenizer(requiredKeys);
}
}
else
{
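
Note the recovering tokenizer is only substituted when the token immediately before the '<<' is a name with a registered key set (here /CIDSystemInfo); every other dictionary keeps the default DictionaryTokenizer and its existing failure behaviour.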


@@ -6,6 +6,7 @@
using Cmap;
using Core;
using Parts;
using System.Collections.Generic;
using Tokenization.Scanner;
using Tokens;
@@ -20,7 +21,11 @@
public CMap Parse(IInputBytes inputBytes)
{
var scanner = new CoreTokenScanner(inputBytes);
var scanner = new CoreTokenScanner(inputBytes,
namedDictionaryRequiredKeys: new Dictionary<NameToken, IReadOnlyList<NameToken>>
{
{ NameToken.CidSystemInfo, new[] { NameToken.Registry, NameToken.Ordering, NameToken.Supplement } }
});
var builder = new CharacterMapBuilder();
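
Only /CIDSystemInfo is registered because its complete key set (Registry, Ordering and Supplement) is known and required, which is the precondition for this kind of recovery described in the commit message.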