add some more token types and test the core token scanner against some initial test cases; the approach seems valid so far.

Eliot Jones
2017-11-12 01:08:09 +00:00
parent 096278de50
commit 00e3d06513
18 changed files with 252 additions and 22 deletions

View File

@@ -5,13 +5,17 @@
public static class StringBytesTestConverter
{
-public static Result Convert(string s)
+public static Result Convert(string s, bool readFirst = true)
{
var input = new ByteArrayInputBytes(Encoding.UTF8.GetBytes(s));
-input.MoveNext();
-var initialByte = input.CurrentByte;
+byte initialByte = 0;
+if (readFirst)
+{
+input.MoveNext();
+initialByte = input.CurrentByte;
+}
return new Result
{
First = initialByte,

View File

@@ -0,0 +1,108 @@
// ReSharper disable ParameterOnlyUsedForPreconditionCheck.Local
namespace UglyToad.Pdf.Tests.Tokenization.Scanner
{
using System;
using System.Collections.Generic;
using IO;
using Parser.Parts;
using Pdf.Cos;
using Pdf.Parser.Parts;
using Pdf.Tokenization.Scanner;
using Pdf.Tokenization.Tokens;
using Xunit;
public class CoreTokenScannerTests
{
private readonly CosDictionaryParser dictionaryParser = new CosDictionaryParser(new CosNameParser(), new TestingLog());
private readonly CosArrayParser arrayParser = new CosArrayParser();
private readonly Func<IInputBytes, CoreTokenScanner> scannerFactory;
public CoreTokenScannerTests()
{
scannerFactory = x => new CoreTokenScanner(x, dictionaryParser, arrayParser);
}
[Fact]
public void ScansSpecificationArrayExampleContents()
{
const string s = "549 3.14 false (Ralph) /SomeName";
var tokens = new List<IToken>();
var scanner = scannerFactory(StringBytesTestConverter.Convert(s, false).Bytes);
while (scanner.MoveNext())
{
tokens.Add(scanner.CurrentToken);
}
AssertCorrectToken<NumericToken, decimal>(tokens[0], 549);
AssertCorrectToken<NumericToken, decimal>(tokens[1], 3.14m);
AssertCorrectToken<BooleanToken, bool>(tokens[2], false);
AssertCorrectToken<StringToken, string>(tokens[3], "Ralph");
AssertCorrectToken<NameToken, CosName>(tokens[4], CosName.Create("SomeName"));
}
[Fact]
public void ScansSpecificationSimpleDictionaryExampleContents()
{
const string s = @"/Type /Example
/Subtype /DictionaryExample
/Version 0.01
/IntegerItem 12
/StringItem(a string)";
var tokens = new List<IToken>();
var scanner = scannerFactory(StringBytesTestConverter.Convert(s, false).Bytes);
while (scanner.MoveNext())
{
tokens.Add(scanner.CurrentToken);
}
AssertCorrectToken<NameToken, CosName>(tokens[0], CosName.TYPE);
AssertCorrectToken<NameToken, CosName>(tokens[1], CosName.Create("Example"));
AssertCorrectToken<NameToken, CosName>(tokens[2], CosName.SUBTYPE);
AssertCorrectToken<NameToken, CosName>(tokens[3], CosName.Create("DictionaryExample"));
AssertCorrectToken<NameToken, CosName>(tokens[4], CosName.VERSION);
AssertCorrectToken<NumericToken, decimal>(tokens[5], 0.01m);
AssertCorrectToken<NameToken, CosName>(tokens[6], CosName.Create("IntegerItem"));
AssertCorrectToken<NumericToken, decimal>(tokens[7], 12m);
AssertCorrectToken<NameToken, CosName>(tokens[8], CosName.Create("StringItem"));
AssertCorrectToken<StringToken, string>(tokens[9], "a string");
}
[Fact]
public void ScansIndirectObjectExampleContents()
{
const string s = @"12 0 obj
(Brillig)
endobj";
var tokens = new List<IToken>();
var scanner = scannerFactory(StringBytesTestConverter.Convert(s, false).Bytes);
while (scanner.MoveNext())
{
tokens.Add(scanner.CurrentToken);
}
AssertCorrectToken<NumericToken, decimal>(tokens[0], 12);
AssertCorrectToken<NumericToken, decimal>(tokens[1], 0);
Assert.Equal(tokens[2], ObjectDelimiterToken.StartObject);
AssertCorrectToken<StringToken, string>(tokens[3], "Brillig");
Assert.Equal(tokens[4], ObjectDelimiterToken.EndObject);
}
private static void AssertCorrectToken<T, TData>(IToken token, TData expected) where T : IDataToken<TData>
{
var cast = Assert.IsType<T>(token);
Assert.Equal(expected, cast.Data);
}
}
}

View File

@@ -37,5 +37,10 @@
return bytes[CurrentOffset + 1];
}
public bool IsAtEnd()
{
return CurrentOffset == bytes.Count - 1;
}
}
}
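
Worth noting the semantics added here: IsAtEnd() reports true while the reader is positioned on the final byte, not once it has moved past it. A minimal sketch, not part of the commit, assuming CurrentOffset starts before the first byte so callers MoveNext() before reading:

var input = new ByteArrayInputBytes(Encoding.UTF8.GetBytes("ab"));
input.MoveNext();             // now on 'a'
var before = input.IsAtEnd(); // false
input.MoveNext();             // now on 'b', the last byte
var after = input.IsAtEnd();  // true

This is the contract the scanner's new pre-read guard, (hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext(), relies on further down.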

View File

@@ -9,5 +9,7 @@
byte CurrentByte { get; }
byte? Peek();
bool IsAtEnd();
}
}

View File

@@ -7,6 +7,8 @@
public class HexTokenizer : ITokenizer
{
public bool ReadsNextByte { get; } = false;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
token = null;

View File

@@ -5,6 +5,8 @@
internal interface ITokenizer
{
bool ReadsNextByte { get; }
bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token);
}
}

View File

@@ -9,6 +9,8 @@
public class NameTokenizer : ITokenizer
{
public bool ReadsNextByte { get; } = true;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
token = null;

View File

@@ -8,6 +8,8 @@
public class NumericTokenizer : ITokenizer
{
public bool ReadsNextByte { get; } = true;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
token = null;

View File

@@ -1,6 +1,5 @@
namespace UglyToad.Pdf.Tokenization
{
-using System;
using System.Text;
using IO;
using Parser.Parts;
@@ -8,6 +7,8 @@
public class PlainTokenizer : ITokenizer
{
public bool ReadsNextByte { get; } = true;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
token = null;
@@ -18,7 +19,7 @@
}
var builder = new StringBuilder();
-builder.Append(currentByte);
+builder.Append((char)currentByte);
while (inputBytes.MoveNext())
{
if (ReadHelper.IsWhitespace(inputBytes.CurrentByte))
@@ -32,7 +33,7 @@
break;
}
-builder.Append((char) currentByte);
+builder.Append((char) inputBytes.CurrentByte);
}
var text = builder.ToString();
@@ -40,20 +41,28 @@
switch (text)
{
case "true":
token = BooleanToken.True;
break;
case "false":
token = BooleanToken.False;
break;
case "null":
token = NullToken.Instance;
break;
case "endstream":
token = ObjectDelimiterToken.EndStream;
break;
case "stream":
token = ObjectDelimiterToken.StartStream;
break;
case "obj":
token = ObjectDelimiterToken.StartObject;
break;
case "endobj":
token = ObjectDelimiterToken.EndObject;
break;
default:
token = new OperatorToken(text);
break;
}

View File

@@ -1,12 +1,12 @@
-namespace UglyToad.Pdf.IO
+namespace UglyToad.Pdf.Tokenization.Scanner
{
using System;
using System.Collections.Generic;
+using IO;
using Parser.Parts;
using Text.Operators;
using Tokenization;
-using Tokenization.Scanner;
-using Tokenization.Tokens;
+using Tokens;
public class CoreTokenScanner : ITokenScanner
{
@@ -17,16 +17,14 @@
private static readonly HexTokenizer HexTokenizer = new HexTokenizer();
private static readonly StringTokenizer StringTokenizer = new StringTokenizer();
-private static readonly NumericTokenizer NumericTokenizer = new NumericTokenizer();
+private static readonly Tokenization.NumericTokenizer NumericTokenizer = new Tokenization.NumericTokenizer();
private static readonly NameTokenizer NameTokenizer = new NameTokenizer();
-private static readonly IReadOnlyDictionary<byte, ITokenizer> Tokenizers = new Dictionary<byte, ITokenizer>
-{
-{(byte) '(', new StringTokenizer()}
-};
+private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer();
public IToken CurrentToken { get; private set; }
private bool hasBytePreRead;
internal CoreTokenScanner(IInputBytes inputBytes, CosDictionaryParser dictionaryParser,
CosArrayParser arrayParser)
{
@@ -40,16 +38,19 @@
currentBuffer.Clear();
bool isSkippingSymbol = false;
-while (inputBytes.MoveNext())
+while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext())
{
hasBytePreRead = false;
var currentByte = inputBytes.CurrentByte;
-if (BaseTextComponentApproach.IsEmpty(currentByte))
+if (BaseTextComponentApproach.IsEmpty(currentByte)
+|| ReadHelper.IsWhitespace(currentByte))
{
isSkippingSymbol = false;
continue;
}
// If we failed to read the symbol for whatever reason we pass over it.
if (isSkippingSymbol)
{
continue;
@@ -65,12 +66,17 @@
var following = inputBytes.Peek();
if (following == '<')
{
isSkippingSymbol = true;
// TODO: Dictionary tokenizer
}
else
{
tokenizer = HexTokenizer;
}
break;
case '[':
// TODO: Array tokenizer
break;
case '/':
tokenizer = NameTokenizer;
break;
@@ -87,18 +93,28 @@
case '-':
case '+':
case '.':
-tokenizer = null;
+tokenizer = NumericTokenizer;
break;
default:
tokenizer = PlainTokenizer;
break;
}
if (tokenizer == null || !tokenizer.TryTokenize(currentByte, inputBytes, out var token))
{
isSkippingSymbol = true;
hasBytePreRead = false;
continue;
}
CurrentToken = token;
/*
* Some tokenizers need to read the symbol of the next token to know if they have ended
* so we don't want to move on to the next byte, we would lose a byte, e.g.: /NameOne/NameTwo or /Name(string)
*/
hasBytePreRead = tokenizer.ReadsNextByte;
return true;
}
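
The pre-read comment above is the heart of this change. A hedged sketch of the failure it prevents, reusing the helpers from CoreTokenScannerTests (the input and expected values are illustrative, not from the commit):

// "/NameOne/NameTwo" has no whitespace between the names, so NameTokenizer
// only knows the first name has ended once it consumes the '/' starting the
// second. ReadsNextByte = true tells the scanner that byte is already read,
// and hasBytePreRead makes the loop reuse it instead of calling MoveNext().
var scanner = scannerFactory(StringBytesTestConverter.Convert("/NameOne/NameTwo", false).Bytes);
var tokens = new List<IToken>();
while (scanner.MoveNext())
{
    tokens.Add(scanner.CurrentToken);
}
// Without the replay the '/' would be skipped and "NameTwo" would come back
// as a plain operator token rather than a second name.
AssertCorrectToken<NameToken, CosName>(tokens[0], CosName.Create("NameOne"));
AssertCorrectToken<NameToken, CosName>(tokens[1], CosName.Create("NameTwo"));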

View File

@@ -7,6 +7,8 @@
public class StringTokenizer : ITokenizer
{
public bool ReadsNextByte { get; } = false;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
var builder = new StringBuilder();
@@ -138,6 +140,11 @@
break;
}
if (numberOfBrackets <= 0)
{
break;
}
}
token = new StringToken(builder.ToString());
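
The numberOfBrackets check added above is what allows literal strings to contain balanced parentheses, which the PDF specification permits. A hedged example, assuming (per the spec) the inner brackets are kept in the token's text:

var scanner = scannerFactory(StringBytesTestConverter.Convert("(a (nested) string)", false).Bytes);
scanner.MoveNext();
// The inner ')' only drops the bracket depth back to 1, so the tokenizer
// keeps reading until the ')' that balances the opening bracket.
AssertCorrectToken<StringToken, string>(scanner.CurrentToken, "a (nested) string");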

View File

@@ -2,11 +2,15 @@
{
public class BooleanToken : IDataToken<bool>
{
public static BooleanToken True { get; } = new BooleanToken(true);
public static BooleanToken False { get; } = new BooleanToken(false);
public bool Data { get; }
-public BooleanToken(bool data)
+private BooleanToken(bool data)
{
Data = data;
}
}
}

View File

@@ -10,5 +10,10 @@
{
Data = CosName.Create(text);
}
public override string ToString()
{
return Data.ToString();
}
}
}

View File

@@ -0,0 +1,11 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
public class NullToken : IDataToken<object>
{
public static NullToken Instance { get; } = new NullToken();
public object Data { get; } = null;
private NullToken() { }
}
}

View File

@@ -1,5 +1,7 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
using System.Globalization;
public class NumericToken : IDataToken<decimal>
{
public decimal Data { get; }
@@ -17,5 +19,10 @@
Int = (int) value;
Long = (long) value;
}
public override string ToString()
{
return Data.ToString(NumberFormatInfo.InvariantInfo);
}
}
}
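
Using NumberFormatInfo.InvariantInfo here matters because decimal formatting is culture-sensitive. A small illustration, assuming a NumericToken(decimal) constructor around the Int/Long assignments shown in the diff:

var token = new NumericToken(3.14m);
// InvariantInfo pins the decimal separator to '.', so this yields "3.14"
// even on a machine whose culture formats decimals as "3,14" (e.g. de-DE).
var text = token.ToString();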

View File

@@ -0,0 +1,17 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
public class ObjectDelimiterToken : IDataToken<string>
{
public static ObjectDelimiterToken StartObject = new ObjectDelimiterToken("obj");
public static ObjectDelimiterToken EndObject = new ObjectDelimiterToken("endobj");
public static ObjectDelimiterToken StartStream = new ObjectDelimiterToken("stream");
public static ObjectDelimiterToken EndStream = new ObjectDelimiterToken("endstream");
public string Data { get; }
private ObjectDelimiterToken(string data)
{
Data = data;
}
}
}

View File

@@ -0,0 +1,22 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
using System.Collections.Generic;
public class OperatorToken : IDataToken<string>
{
private static readonly Dictionary<string, string> PooledNames = new Dictionary<string, string>();
public string Data { get; }
public OperatorToken(string data)
{
if (!PooledNames.TryGetValue(data, out var stored))
{
stored = data;
PooledNames[data] = stored;
}
Data = stored;
}
}
}
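
PooledNames is a simple string-interning pool: operators such as Tj recur constantly in content streams, so every OperatorToken built from the same text shares a single string instance. A quick illustrative check, not from the commit:

var first = new OperatorToken("Tj");
var second = new OperatorToken("Tj");
var shared = ReferenceEquals(first.Data, second.Data); // true

Note the pool is a plain Dictionary with no locking, so as written it assumes tokenization happens on a single thread.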

View File

@@ -8,5 +8,10 @@ namespace UglyToad.Pdf.Tokenization.Tokens
{
Data = data;
}
public override string ToString()
{
return Data;
}
}
}