mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-19 02:37:56 +08:00
add dictionary tokenizer with tests and unify other tokens under operator token
This commit is contained in:
174
src/UglyToad.Pdf.Tests/Tokenization/DictionaryTokenizerTests.cs
Normal file
174
src/UglyToad.Pdf.Tests/Tokenization/DictionaryTokenizerTests.cs
Normal file
@@ -0,0 +1,174 @@
|
||||
// ReSharper disable ParameterOnlyUsedForPreconditionCheck.Local
|
||||
namespace UglyToad.Pdf.Tests.Tokenization
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Pdf.Cos;
|
||||
using Pdf.Tokenization;
|
||||
using Pdf.Tokenization.Tokens;
|
||||
using Xunit;
|
||||
|
||||
/// <summary>
/// Exercises <see cref="DictionaryTokenizer"/> against rejected inputs and a range of
/// well-formed dictionaries (names, strings, numbers, indirect references, nesting).
/// </summary>
public class DictionaryTokenizerTests
{
    private readonly DictionaryTokenizer tokenizer = new DictionaryTokenizer();

    [Theory]
    [InlineData("[rjee]")]
    [InlineData("\r\n")]
    [InlineData("<AE>")]
    [InlineData("<[p]>")]
    public void IncorrectStartCharacters_ReturnsFalse(string s)
    {
        var converted = StringBytesTestConverter.Convert(s);

        var succeeded = tokenizer.TryTokenize(converted.First, converted.Bytes, out var token);

        Assert.False(succeeded);
        Assert.Null(token);
    }

    [Fact]
    public void SkipsWhitespaceInStartSymbols()
    {
        var parsed = TokenizeDictionary("< < /Name (Barry Scott) >>");

        AssertDictionaryEntry<NameToken, CosName, StringToken, string>(parsed, 0, CosName.NAME, "Barry Scott");
    }

    [Fact]
    public void SimpleNameDictionary()
    {
        var parsed = TokenizeDictionary("<< /Type /Example>>");

        AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(
            parsed,
            0,
            CosName.TYPE,
            CosName.Create("Example"));
    }

    [Fact]
    public void StreamDictionary()
    {
        var parsed = TokenizeDictionary("<< /Filter /FlateDecode /S 36 /Length 53 >>");

        AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(parsed, 0, CosName.FILTER, CosName.FLATE_DECODE);
        AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(parsed, 1, CosName.S, 36);
        AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(parsed, 2, CosName.LENGTH, 53);
    }

    [Fact]
    public void CatalogDictionary()
    {
        var parsed = TokenizeDictionary("<</Pages 14 0 R /Type /Catalog >>");

        AssertDictionaryEntry<NameToken, CosName, IndirectReferenceToken, IndirectReference>(
            parsed,
            0,
            CosName.PAGES,
            new IndirectReference(14, 0));
        AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(parsed, 1, CosName.TYPE, CosName.CATALOG);
    }

    [Fact]
    public void SpecificationExampleDictionary()
    {
        const string s = @"<< /Type /Example
/Subtype /DictionaryExample
/Version 0.01
/IntegerItem 12
/StringItem (a string)
/Subdictionary
<< /Item1 0.4
/Item2 true
/LastItem (not!)
/VeryLastItem (OK)
>>
>>";

        var outer = TokenizeDictionary(s);

        AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(outer, 0, CosName.TYPE, CosName.Create("Example"));
        AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(outer, 1, CosName.SUBTYPE, CosName.Create("DictionaryExample"));
        AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(outer, 2, CosName.VERSION, 0.01m);
        AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(outer, 3, CosName.Create("IntegerItem"), 12m);
        AssertDictionaryEntry<NameToken, CosName, StringToken, string>(outer, 4, CosName.Create("StringItem"), "a string");

        // The sixth entry holds the nested dictionary; check its key, then descend into its value.
        var nestedEntry = GetIndex(5, outer);
        var nestedKey = Assert.IsType<NameToken>(nestedEntry.Key);
        Assert.Equal(CosName.Create("Subdictionary"), nestedKey.Data);

        var inner = Assert.IsType<DictionaryToken>(nestedEntry.Value);

        AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(inner, 0, CosName.Create("Item1"), 0.4m);
        AssertDictionaryEntry<NameToken, CosName, BooleanToken, bool>(inner, 1, CosName.Create("Item2"), true);
        AssertDictionaryEntry<NameToken, CosName, StringToken, string>(inner, 2, CosName.Create("LastItem"), "not!");
        AssertDictionaryEntry<NameToken, CosName, StringToken, string>(inner, 3, CosName.Create("VeryLastItem"), "OK");
    }

    /// <summary>
    /// Runs the tokenizer over <paramref name="text"/>, asserts that it reported success and
    /// returns the produced token as a <see cref="DictionaryToken"/>.
    /// </summary>
    private DictionaryToken TokenizeDictionary(string text)
    {
        var converted = StringBytesTestConverter.Convert(text);

        var succeeded = tokenizer.TryTokenize(converted.First, converted.Bytes, out var token);

        Assert.True(succeeded);

        return AssertDictionaryToken(token);
    }

    /// <summary>
    /// Asserts that the entry at enumeration position <paramref name="index"/> has a key token of
    /// type <typeparamref name="TKey"/> and a value token of type <typeparamref name="TValue"/>
    /// carrying the expected data.
    /// </summary>
    private static void AssertDictionaryEntry<TKey, TKeyData, TValue, TValueData>(
        DictionaryToken dictionary, int index, TKeyData key,
        TValueData value) where TKey : IDataToken<TKeyData> where TValue : IDataToken<TValueData>
    {
        KeyValuePair<IToken, IToken> entry = GetIndex(index, dictionary);

        var typedKey = Assert.IsType<TKey>(entry.Key);
        Assert.Equal(key, typedKey.Data);

        var typedValue = Assert.IsType<TValue>(entry.Value);
        Assert.Equal(value, typedValue.Data);
    }

    /// <summary>
    /// Returns the entry found at position <paramref name="index"/> when enumerating the
    /// dictionary's data, or throws <see cref="ArgumentException"/> if enumeration ends first.
    /// </summary>
    private static KeyValuePair<IToken, IToken> GetIndex(int index, DictionaryToken dictionary)
    {
        // Count down to the requested position; a negative index never reaches zero.
        var remaining = index;
        foreach (var entry in dictionary.Data)
        {
            if (remaining == 0)
            {
                return entry;
            }

            remaining--;
        }

        throw new ArgumentException("The dictionary did not contain an index: " + index);
    }

    /// <summary>
    /// Asserts the token is non-null and is a <see cref="DictionaryToken"/>, returning it typed.
    /// </summary>
    private static DictionaryToken AssertDictionaryToken(IToken token)
    {
        Assert.NotNull(token);

        return Assert.IsType<DictionaryToken>(token);
    }
}
|
||||
}
|
@@ -87,9 +87,9 @@ endobj";
|
||||
|
||||
AssertCorrectToken<NumericToken, decimal>(tokens[0], 12);
|
||||
AssertCorrectToken<NumericToken, decimal>(tokens[1], 0);
|
||||
Assert.Equal(tokens[2], ObjectDelimiterToken.StartObject);
|
||||
Assert.Equal(tokens[2], OperatorToken.StartObject);
|
||||
AssertCorrectToken<StringToken, string>(tokens[3], "Brillig");
|
||||
Assert.Equal(tokens[4], ObjectDelimiterToken.EndObject);
|
||||
Assert.Equal(tokens[4], OperatorToken.EndObject);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
|
108
src/UglyToad.Pdf/Tokenization/DictionaryTokenizer.cs
Normal file
108
src/UglyToad.Pdf/Tokenization/DictionaryTokenizer.cs
Normal file
@@ -0,0 +1,108 @@
|
||||
namespace UglyToad.Pdf.Tokenization
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using IO;
|
||||
using Parser.Parts;
|
||||
using Scanner;
|
||||
using Tokens;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
/// <summary>
/// Tokenizer for dictionaries, which open with two <c>&lt;</c> characters
/// (optionally separated by whitespace).
/// </summary>
public class DictionaryTokenizer : ITokenizer
{
    public bool ReadsNextByte { get; } = false;

    /// <summary>
    /// Attempts to read a dictionary starting at <paramref name="currentByte"/>.
    /// </summary>
    /// <param name="currentByte">The byte the caller is positioned on; must be <c>&lt;</c> for a dictionary.</param>
    /// <param name="inputBytes">The input to advance while reading the dictionary contents.</param>
    /// <param name="token">A <see cref="DictionaryToken"/> on success, otherwise <see langword="null"/>.</param>
    /// <returns>
    /// <see langword="true"/> if the input started with two <c>&lt;</c> characters separated only
    /// by whitespace; otherwise <see langword="false"/>.
    /// </returns>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
    {
        token = null;

        if (currentByte != '<')
        {
            return false;
        }

        bool foundNextOpenBrace = false;

        // Look for the second '<', tolerating whitespace between the two start symbols.
        while (inputBytes.MoveNext())
        {
            if (inputBytes.CurrentByte == '<')
            {
                foundNextOpenBrace = true;
                break;
            }

            if (!ReadHelper.IsWhitespace(inputBytes.CurrentByte))
            {
                break;
            }
        }

        if (!foundNextOpenBrace)
        {
            return false;
        }

        // NOTE(review): the dictionary-scoped scanner presumably stops at the closing ">>" — confirm in CoreTokenScanner.
        var coreScanner = new CoreTokenScanner(inputBytes, ScannerScope.Dictionary);

        var tokens = new List<IToken>();

        while (coreScanner.MoveNext())
        {
            tokens.Add(coreScanner.CurrentToken);
        }

        var dictionary = ConvertToDictionary(tokens);

        token = new DictionaryToken(dictionary);

        return true;
    }

    /// <summary>
    /// Pairs the flat token list into key/value entries. A value made of two numeric tokens
    /// followed by the <c>R</c> operator (e.g. <c>12 0 R</c>) is combined into a single
    /// <see cref="IndirectReferenceToken"/>. A trailing key with no value is ignored.
    /// </summary>
    private static Dictionary<IToken, IToken> ConvertToDictionary(IReadOnlyList<IToken> tokens)
    {
        var result = new Dictionary<IToken, IToken>();

        IToken key = null;
        for (var i = 0; i < tokens.Count; i++)
        {
            var token = tokens[i];

            if (key == null)
            {
                key = token;
                continue;
            }

            // Combine indirect references, e.g. 12 0 R
            // All three parts must be present; otherwise the token is stored as an ordinary
            // value rather than being dropped together with its key.
            if (token is NumericToken num
                && PeekNext(tokens, i) is NumericToken gen
                && PeekNext(tokens, i + 1) == OperatorToken.R)
            {
                result[key] = new IndirectReferenceToken(new IndirectReference(num.Long, gen.Long));

                // Skip the generation number and the R operator which were consumed above.
                i = i + 2;
            }
            else
            {
                result[key] = token;
            }

            key = null;
        }

        return result;
    }

    /// <summary>
    /// Returns the token immediately after <paramref name="currentIndex"/>, or
    /// <see langword="null"/> when there is no such token.
    /// </summary>
    [CanBeNull]
    private static IToken PeekNext(IReadOnlyList<IToken> tokens, int currentIndex)
    {
        var nextIndex = currentIndex + 1;

        if (nextIndex > tokens.Count - 1)
        {
            return null;
        }

        return tokens[nextIndex];
    }
}
|
||||
}
|
@@ -28,7 +28,8 @@
|
||||
}
|
||||
|
||||
if (inputBytes.CurrentByte == '<' || inputBytes.CurrentByte == '['
|
||||
|| inputBytes.CurrentByte == '/')
|
||||
|| inputBytes.CurrentByte == '/' || inputBytes.CurrentByte == ']'
|
||||
|| inputBytes.CurrentByte == '>')
|
||||
{
|
||||
break;
|
||||
}
|
||||
@@ -49,20 +50,8 @@
|
||||
case "null":
|
||||
token = NullToken.Instance;
|
||||
break;
|
||||
case "endstream":
|
||||
token = ObjectDelimiterToken.EndStream;
|
||||
break;
|
||||
case "stream":
|
||||
token = ObjectDelimiterToken.StartStream;
|
||||
break;
|
||||
case "obj":
|
||||
token = ObjectDelimiterToken.StartObject;
|
||||
break;
|
||||
case "endobj":
|
||||
token = ObjectDelimiterToken.EndObject;
|
||||
break;
|
||||
default:
|
||||
token = new OperatorToken(text);
|
||||
token = OperatorToken.Create(text);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@@ -23,6 +23,7 @@
|
||||
private static readonly NameTokenizer NameTokenizer = new NameTokenizer();
|
||||
private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer();
|
||||
private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer();
|
||||
private static readonly DictionaryTokenizer DictionaryTokenizer = new DictionaryTokenizer();
|
||||
|
||||
private readonly ScannerScope scope;
|
||||
private readonly IInputBytes inputBytes;
|
||||
@@ -59,7 +60,7 @@
|
||||
}
|
||||
|
||||
// If we failed to read the symbol for whatever reason we pass over it.
|
||||
if (isSkippingSymbol)
|
||||
if (isSkippingSymbol && c != '>')
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@@ -75,7 +76,7 @@
|
||||
if (following == '<')
|
||||
{
|
||||
isSkippingSymbol = true;
|
||||
// TODO: Dictionary tokenizer
|
||||
tokenizer = DictionaryTokenizer;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
21
src/UglyToad.Pdf/Tokenization/Tokens/DictionaryToken.cs
Normal file
21
src/UglyToad.Pdf/Tokenization/Tokens/DictionaryToken.cs
Normal file
@@ -0,0 +1,21 @@
|
||||
namespace UglyToad.Pdf.Tokenization.Tokens
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
/// <summary>
/// A token whose data is a read-only mapping from key tokens to value tokens.
/// </summary>
public class DictionaryToken : IDataToken<IReadOnlyDictionary<IToken, IToken>>
{
    /// <summary>
    /// The key/value token pairs held by this dictionary.
    /// </summary>
    public IReadOnlyDictionary<IToken, IToken> Data { get; }

    /// <summary>
    /// Creates a token wrapping <paramref name="data"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is <see langword="null"/>.</exception>
    public DictionaryToken(IReadOnlyDictionary<IToken, IToken> data)
    {
        if (data == null)
        {
            throw new ArgumentNullException(nameof(data));
        }

        Data = data;
    }

    /// <summary>
    /// Renders every entry as <c>&lt;key, value&gt;</c>, joined with a comma and a space.
    /// </summary>
    public override string ToString() => string.Join(", ", Data.Select(FormatEntry));

    // Formats a single entry; kept separate so ToString stays a one-liner.
    private static string FormatEntry(KeyValuePair<IToken, IToken> entry)
        => string.Format("<{0}, {1}>", entry.Key, entry.Value);
}
|
||||
}
|
@@ -0,0 +1,25 @@
|
||||
namespace UglyToad.Pdf.Tokenization.Tokens
|
||||
{
|
||||
/// <summary>
/// A token carrying an <see cref="IndirectReference"/> as its data.
/// </summary>
public class IndirectReferenceToken : IDataToken<IndirectReference>
{
    /// <summary>
    /// The reference held by this token.
    /// </summary>
    public IndirectReference Data { get; }

    /// <summary>
    /// Creates a token for the given reference.
    /// </summary>
    public IndirectReferenceToken(IndirectReference data) => Data = data;
}
|
||||
|
||||
/// <summary>
/// An object number and generation pair, compared by value.
/// </summary>
public struct IndirectReference : System.IEquatable<IndirectReference>
{
    /// <summary>
    /// The object number supplied at construction.
    /// </summary>
    public long ObjectNumber { get; }

    /// <summary>
    /// The generation supplied at construction.
    /// </summary>
    public long Generation { get; }

    /// <summary>
    /// Creates a reference from an object number and a generation.
    /// </summary>
    public IndirectReference(long objectNumber, long generation)
    {
        ObjectNumber = objectNumber;
        Generation = generation;
    }

    /// <summary>
    /// Typed equality: avoids the boxing and reflection of the default struct comparison.
    /// </summary>
    public bool Equals(IndirectReference other)
    {
        return ObjectNumber == other.ObjectNumber && Generation == other.Generation;
    }

    /// <inheritdoc />
    public override bool Equals(object obj)
    {
        return obj is IndirectReference other && Equals(other);
    }

    /// <inheritdoc />
    public override int GetHashCode()
    {
        // Overflow is expected and harmless when mixing hash codes.
        unchecked
        {
            return (ObjectNumber.GetHashCode() * 397) ^ Generation.GetHashCode();
        }
    }
}
|
||||
}
|
@@ -1,17 +0,0 @@
|
||||
namespace UglyToad.Pdf.Tokenization.Tokens
|
||||
{
|
||||
/// <summary>
/// Token for the keywords <c>obj</c>, <c>endobj</c>, <c>stream</c> and <c>endstream</c>.
/// Only the four shared instances below exist because the constructor is private.
/// </summary>
public class ObjectDelimiterToken : IDataToken<string>
{
    // The shared instances are readonly so that no caller can swap one out and
    // break comparisons made against these well-known values.
    public static readonly ObjectDelimiterToken StartObject = new ObjectDelimiterToken("obj");
    public static readonly ObjectDelimiterToken EndObject = new ObjectDelimiterToken("endobj");
    public static readonly ObjectDelimiterToken StartStream = new ObjectDelimiterToken("stream");
    public static readonly ObjectDelimiterToken EndStream = new ObjectDelimiterToken("endstream");

    /// <summary>
    /// The keyword text this token was created with.
    /// </summary>
    public string Data { get; }

    private ObjectDelimiterToken(string data)
    {
        Data = data;
    }
}
|
||||
}
|
@@ -6,9 +6,15 @@
|
||||
{
|
||||
private static readonly Dictionary<string, string> PooledNames = new Dictionary<string, string>();
|
||||
|
||||
public static readonly OperatorToken R = new OperatorToken("R");
|
||||
public static readonly OperatorToken StartObject = new OperatorToken("obj");
|
||||
public static readonly OperatorToken EndObject = new OperatorToken("endobj");
|
||||
public static readonly OperatorToken StartStream = new OperatorToken("stream");
|
||||
public static readonly OperatorToken EndStream = new OperatorToken("endstream");
|
||||
|
||||
public string Data { get; }
|
||||
|
||||
public OperatorToken(string data)
|
||||
private OperatorToken(string data)
|
||||
{
|
||||
if (!PooledNames.TryGetValue(data, out var stored))
|
||||
{
|
||||
@@ -18,5 +24,24 @@
|
||||
|
||||
Data = stored;
|
||||
}
|
||||
|
||||
public static OperatorToken Create(string data)
|
||||
{
|
||||
switch (data)
|
||||
{
|
||||
case "R":
|
||||
return R;
|
||||
case "obj":
|
||||
return StartObject;
|
||||
case "endobj":
|
||||
return EndObject;
|
||||
case "stream":
|
||||
return StartStream;
|
||||
case "endstream":
|
||||
return EndStream;
|
||||
default:
|
||||
return new OperatorToken(data);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user