add dictionary tokenizer with tests and unify other tokens under operator token

This commit is contained in:
Eliot Jones
2017-11-12 17:06:19 +00:00
parent c22a79f327
commit 879563dd0c
9 changed files with 362 additions and 36 deletions

View File

@@ -0,0 +1,174 @@
// ReSharper disable ParameterOnlyUsedForPreconditionCheck.Local
namespace UglyToad.Pdf.Tests.Tokenization
{
    using System;
    using System.Collections.Generic;
    using Pdf.Cos;
    using Pdf.Tokenization;
    using Pdf.Tokenization.Tokens;
    using Xunit;

    /// <summary>
    /// Tests for <see cref="DictionaryTokenizer"/>, which reads PDF dictionary
    /// objects (delimited by double angle brackets) from raw input bytes.
    /// </summary>
    public class DictionaryTokenizerTests
    {
        private readonly DictionaryTokenizer tokenizer = new DictionaryTokenizer();

        [Theory]
        [InlineData("[rjee]")]
        [InlineData("\r\n")]
        [InlineData("<AE>")]
        [InlineData("<[p]>")]
        public void IncorrectStartCharacters_ReturnsFalse(string s)
        {
            // Input that does not open with "<<" must be rejected with no token produced.
            var input = StringBytesTestConverter.Convert(s);

            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.False(result);
            Assert.Null(token);
        }

        [Fact]
        public void SkipsWhitespaceInStartSymbols()
        {
            // Whitespace between the two opening '<' characters is tolerated by the tokenizer.
            var input = StringBytesTestConverter.Convert("< < /Name (Barry Scott) >>");

            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.True(result);

            var dictionary = AssertDictionaryToken(token);

            AssertDictionaryEntry<NameToken, CosName, StringToken, string>(dictionary, 0, CosName.NAME, "Barry Scott");
        }

        [Fact]
        public void SimpleNameDictionary()
        {
            var input = StringBytesTestConverter.Convert("<< /Type /Example>>");

            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.True(result);

            var dictionary = AssertDictionaryToken(token);

            AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 0, CosName.TYPE,
                CosName.Create("Example"));
        }

        [Fact]
        public void StreamDictionary()
        {
            var input = StringBytesTestConverter.Convert("<< /Filter /FlateDecode /S 36 /Length 53 >>");

            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.True(result);

            var dictionary = AssertDictionaryToken(token);

            AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 0, CosName.FILTER, CosName.FLATE_DECODE);
            AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 1, CosName.S, 36);
            AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 2, CosName.LENGTH, 53);
        }

        [Fact]
        public void CatalogDictionary()
        {
            var input = StringBytesTestConverter.Convert("<</Pages 14 0 R /Type /Catalog >>");

            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.True(result);

            var dictionary = AssertDictionaryToken(token);

            // The three tokens "14 0 R" should collapse into a single indirect reference value.
            var reference = new IndirectReference(14, 0);

            AssertDictionaryEntry<NameToken, CosName, IndirectReferenceToken, IndirectReference>(dictionary, 0, CosName.PAGES, reference);
            AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 1, CosName.TYPE, CosName.CATALOG);
        }

        [Fact]
        public void SpecificationExampleDictionary()
        {
            // Multi-line dictionary containing a nested sub-dictionary value.
            const string s = @"<< /Type /Example
/Subtype /DictionaryExample
/Version 0.01
/IntegerItem 12
/StringItem (a string)
/Subdictionary
<< /Item1 0.4
/Item2 true
/LastItem (not!)
/VeryLastItem (OK)
>>
>>";

            var input = StringBytesTestConverter.Convert(s);

            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.True(result);

            var dictionary = AssertDictionaryToken(token);

            AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 0, CosName.TYPE, CosName.Create("Example"));
            AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 1, CosName.SUBTYPE, CosName.Create("DictionaryExample"));
            AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 2, CosName.VERSION, 0.01m);
            AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 3, CosName.Create("IntegerItem"), 12m);
            AssertDictionaryEntry<NameToken, CosName, StringToken, string>(dictionary, 4, CosName.Create("StringItem"), "a string");

            // The nested dictionary must itself be parsed into a DictionaryToken value.
            var subDictionary = GetIndex(5, dictionary);

            Assert.Equal(CosName.Create("Subdictionary"), Assert.IsType<NameToken>(subDictionary.Key).Data);

            var subDictionaryValue = Assert.IsType<DictionaryToken>(subDictionary.Value);

            AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(subDictionaryValue, 0, CosName.Create("Item1"), 0.4m);
            AssertDictionaryEntry<NameToken, CosName, BooleanToken, bool>(subDictionaryValue, 1, CosName.Create("Item2"), true);
            AssertDictionaryEntry<NameToken, CosName, StringToken, string>(subDictionaryValue, 2, CosName.Create("LastItem"), "not!");
            AssertDictionaryEntry<NameToken, CosName, StringToken, string>(subDictionaryValue, 3, CosName.Create("VeryLastItem"), "OK");
        }

        /// <summary>
        /// Asserts that the entry at <paramref name="index"/> has a key token of type
        /// <typeparamref name="TKey"/> whose data equals <paramref name="key"/>, and a value
        /// token of type <typeparamref name="TValue"/> whose data equals <paramref name="value"/>.
        /// </summary>
        private static void AssertDictionaryEntry<TKey, TKeyData, TValue, TValueData>(
            DictionaryToken dictionary, int index, TKeyData key,
            TValueData value) where TKey : IDataToken<TKeyData> where TValue : IDataToken<TValueData>
        {
            KeyValuePair<IToken, IToken> data = GetIndex(index, dictionary);

            var keyToken = Assert.IsType<TKey>(data.Key);
            Assert.Equal(key, keyToken.Data);

            var valueToken = Assert.IsType<TValue>(data.Value);
            Assert.Equal(value, valueToken.Data);
        }

        /// <summary>
        /// Gets the key-value pair at the given position in the dictionary's enumeration order.
        /// NOTE(review): this relies on Dictionary&lt;,&gt; enumerating in insertion order, which
        /// is an implementation detail of the runtime, not a documented guarantee — confirm.
        /// </summary>
        private static KeyValuePair<IToken, IToken> GetIndex(int index, DictionaryToken dictionary)
        {
            int i = 0;
            foreach (var pair in dictionary.Data)
            {
                if (i == index)
                {
                    return pair;
                }

                i++;
            }

            throw new ArgumentException("The dictionary did not contain an index: " + index);
        }

        /// <summary>Asserts the token is a non-null <see cref="DictionaryToken"/> and returns it.</summary>
        private static DictionaryToken AssertDictionaryToken(IToken token)
        {
            Assert.NotNull(token);

            var result = Assert.IsType<DictionaryToken>(token);

            return result;
        }
    }
}

View File

@@ -87,9 +87,9 @@ endobj";
AssertCorrectToken<NumericToken, decimal>(tokens[0], 12);
AssertCorrectToken<NumericToken, decimal>(tokens[1], 0);
Assert.Equal(tokens[2], ObjectDelimiterToken.StartObject);
Assert.Equal(tokens[2], OperatorToken.StartObject);
AssertCorrectToken<StringToken, string>(tokens[3], "Brillig");
Assert.Equal(tokens[4], ObjectDelimiterToken.EndObject);
Assert.Equal(tokens[4], OperatorToken.EndObject);
}
[Fact]

View File

@@ -0,0 +1,108 @@
namespace UglyToad.Pdf.Tokenization
{
    using System.Collections.Generic;
    using IO;
    using Parser.Parts;
    using Scanner;
    using Tokens;
    using Util.JetBrains.Annotations;

    /// <summary>
    /// Reads a PDF dictionary object, opened by two '&lt;' characters
    /// (optionally separated by whitespace), from the input bytes.
    /// </summary>
    public class DictionaryTokenizer : ITokenizer
    {
        public bool ReadsNextByte { get; } = false;

        /// <summary>
        /// Attempts to read a <see cref="DictionaryToken"/> starting at the current byte.
        /// Returns false if the input does not open a dictionary.
        /// </summary>
        public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
        {
            token = null;

            if (currentByte != '<')
            {
                return false;
            }

            // Find the second '<', skipping any whitespace between the opening brackets.
            bool foundNextOpenBrace = false;

            while (inputBytes.MoveNext())
            {
                if (inputBytes.CurrentByte == '<')
                {
                    foundNextOpenBrace = true;
                    break;
                }

                if (!ReadHelper.IsWhitespace(inputBytes.CurrentByte))
                {
                    break;
                }
            }

            if (!foundNextOpenBrace)
            {
                return false;
            }

            // Scan the interior tokens; the scanner runs in dictionary scope so it
            // terminates at the closing brackets rather than consuming them as tokens.
            var coreScanner = new CoreTokenScanner(inputBytes, ScannerScope.Dictionary);

            var tokens = new List<IToken>();

            while (coreScanner.MoveNext())
            {
                tokens.Add(coreScanner.CurrentToken);
            }

            var dictionary = ConvertToDictionary(tokens);

            token = new DictionaryToken(dictionary);

            return true;
        }

        /// <summary>
        /// Pairs the flat token stream into keys and values, collapsing the
        /// three-token indirect reference form (e.g. "12 0 R") into a single
        /// <see cref="IndirectReferenceToken"/> value.
        /// </summary>
        private static Dictionary<IToken, IToken> ConvertToDictionary(IReadOnlyList<IToken> tokens)
        {
            var result = new Dictionary<IToken, IToken>();

            IToken key = null;
            for (var i = 0; i < tokens.Count; i++)
            {
                var token = tokens[i];

                if (key == null)
                {
                    key = token;
                    continue;
                }

                // Combine indirect references, e.g. 12 0 R.
                // The trailing "R" check is part of the condition so that a numeric value
                // followed by another numeric token which is NOT a reference falls through
                // to the plain-value branch. Previously such an entry was silently dropped
                // because neither branch assigned result[key].
                if (token is NumericToken num && PeekNext(tokens, i) is NumericToken gen
                    && PeekNext(tokens, i + 1) == OperatorToken.R)
                {
                    result[key] = new IndirectReferenceToken(new IndirectReference(num.Long, gen.Long));

                    // Skip over the generation number and the "R" operator.
                    i = i + 2;
                }
                else
                {
                    result[key] = token;
                }

                key = null;
            }

            return result;
        }

        /// <summary>Returns the token after <paramref name="currentIndex"/>, or null at the end of the list.</summary>
        [CanBeNull]
        private static IToken PeekNext(IReadOnlyList<IToken> tokens, int currentIndex)
        {
            if (tokens.Count - 1 < currentIndex + 1)
            {
                return null;
            }

            return tokens[currentIndex + 1];
        }
    }
}

View File

@@ -28,7 +28,8 @@
}
if (inputBytes.CurrentByte == '<' || inputBytes.CurrentByte == '['
|| inputBytes.CurrentByte == '/')
|| inputBytes.CurrentByte == '/' || inputBytes.CurrentByte == ']'
|| inputBytes.CurrentByte == '>')
{
break;
}
@@ -49,20 +50,8 @@
case "null":
token = NullToken.Instance;
break;
case "endstream":
token = ObjectDelimiterToken.EndStream;
break;
case "stream":
token = ObjectDelimiterToken.StartStream;
break;
case "obj":
token = ObjectDelimiterToken.StartObject;
break;
case "endobj":
token = ObjectDelimiterToken.EndObject;
break;
default:
token = new OperatorToken(text);
token = OperatorToken.Create(text);
break;
}

View File

@@ -23,6 +23,7 @@
private static readonly NameTokenizer NameTokenizer = new NameTokenizer();
private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer();
private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer();
private static readonly DictionaryTokenizer DictionaryTokenizer = new DictionaryTokenizer();
private readonly ScannerScope scope;
private readonly IInputBytes inputBytes;
@@ -59,7 +60,7 @@
}
// If we failed to read the symbol for whatever reason we pass over it.
if (isSkippingSymbol)
if (isSkippingSymbol && c != '>')
{
continue;
}
@@ -75,7 +76,7 @@
if (following == '<')
{
isSkippingSymbol = true;
// TODO: Dictionary tokenizer
tokenizer = DictionaryTokenizer;
}
else
{

View File

@@ -0,0 +1,21 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
    using System;
    using System.Collections.Generic;
    using System.Linq;

    /// <summary>
    /// A token representing a PDF dictionary: a read-only mapping from key tokens to value tokens.
    /// </summary>
    public class DictionaryToken : IDataToken<IReadOnlyDictionary<IToken, IToken>>
    {
        /// <summary>The key-value pairs contained in this dictionary.</summary>
        public IReadOnlyDictionary<IToken, IToken> Data { get; }

        /// <summary>Create a new <see cref="DictionaryToken"/> wrapping the given pairs.</summary>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="data"/> is null.</exception>
        public DictionaryToken(IReadOnlyDictionary<IToken, IToken> data)
        {
            if (data == null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            Data = data;
        }

        public override string ToString()
        {
            var pairs = Data.Select(pair => $"<{pair.Key}, {pair.Value}>");

            return string.Join(", ", pairs);
        }
    }
}

View File

@@ -0,0 +1,25 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
/// <summary>
/// A token wrapping an <see cref="IndirectReference"/> produced by collapsing
/// the "objectNumber generation R" token triple.
/// </summary>
public class IndirectReferenceToken : IDataToken<IndirectReference>
{
    /// <summary>The object number and generation pair for the referenced object.</summary>
    public IndirectReference Data { get; }

    /// <summary>Create a new <see cref="IndirectReferenceToken"/> for the given reference.</summary>
    public IndirectReferenceToken(IndirectReference data)
    {
        Data = data;
    }
}
/// <summary>
/// Identifies an object by its object number and generation number,
/// e.g. the reference "12 0 R" has object number 12 and generation 0.
/// Implements value equality explicitly to avoid the reflection-based
/// default <see cref="System.ValueType.Equals(object)"/> path.
/// </summary>
public struct IndirectReference : System.IEquatable<IndirectReference>
{
    /// <summary>The object number of the referenced object.</summary>
    public long ObjectNumber { get; }

    /// <summary>The generation number of the referenced object.</summary>
    public long Generation { get; }

    /// <summary>Create a new <see cref="IndirectReference"/>.</summary>
    public IndirectReference(long objectNumber, long generation)
    {
        ObjectNumber = objectNumber;
        Generation = generation;
    }

    /// <summary>Two references are equal when both numbers match.</summary>
    public bool Equals(IndirectReference other)
    {
        return ObjectNumber == other.ObjectNumber && Generation == other.Generation;
    }

    public override bool Equals(object obj)
    {
        return obj is IndirectReference other && Equals(other);
    }

    public override int GetHashCode()
    {
        unchecked
        {
            // 397 is the conventional odd prime multiplier for combined hashes.
            return (ObjectNumber.GetHashCode() * 397) ^ Generation.GetHashCode();
        }
    }

    public static bool operator ==(IndirectReference left, IndirectReference right)
    {
        return left.Equals(right);
    }

    public static bool operator !=(IndirectReference left, IndirectReference right)
    {
        return !left.Equals(right);
    }

    public override string ToString()
    {
        return $"{ObjectNumber} {Generation} R";
    }
}
}

View File

@@ -1,17 +0,0 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
public class ObjectDelimiterToken : IDataToken<string>
{
public static ObjectDelimiterToken StartObject = new ObjectDelimiterToken("obj");
public static ObjectDelimiterToken EndObject = new ObjectDelimiterToken("endobj");
public static ObjectDelimiterToken StartStream = new ObjectDelimiterToken("stream");
public static ObjectDelimiterToken EndStream = new ObjectDelimiterToken("endstream");
public string Data { get; }
private ObjectDelimiterToken(string data)
{
Data = data;
}
}
}

View File

@@ -6,9 +6,15 @@
{
private static readonly Dictionary<string, string> PooledNames = new Dictionary<string, string>();
public static readonly OperatorToken R = new OperatorToken("R");
public static readonly OperatorToken StartObject = new OperatorToken("obj");
public static readonly OperatorToken EndObject = new OperatorToken("endobj");
public static readonly OperatorToken StartStream = new OperatorToken("stream");
public static readonly OperatorToken EndStream = new OperatorToken("endstream");
public string Data { get; }
public OperatorToken(string data)
private OperatorToken(string data)
{
if (!PooledNames.TryGetValue(data, out var stored))
{
@@ -18,5 +24,24 @@
Data = stored;
}
/// <summary>
/// Returns the shared singleton instance for a well-known operator keyword,
/// otherwise creates a new pooled <see cref="OperatorToken"/> for the text.
/// </summary>
public static OperatorToken Create(string data)
{
    // Guard-style lookups; C# string comparison here is ordinal,
    // matching the behavior of a string switch.
    if (data == "R")
    {
        return R;
    }

    if (data == "obj")
    {
        return StartObject;
    }

    if (data == "endobj")
    {
        return EndObject;
    }

    if (data == "stream")
    {
        return StartStream;
    }

    if (data == "endstream")
    {
        return EndStream;
    }

    return new OperatorToken(data);
}
}
}