support for tokenizing arrays and nested arrays

This commit is contained in:
Eliot Jones
2017-11-12 14:42:01 +00:00
parent 00e3d06513
commit 158cd5f2e3
6 changed files with 245 additions and 20 deletions

View File

@@ -0,0 +1,137 @@
namespace UglyToad.Pdf.Tests.Tokenization
{
using System.Collections.Generic;
using Pdf.Cos;
using Pdf.Tokenization;
using Pdf.Tokenization.Tokens;
using Xunit;
/// <summary>
/// Tests for <see cref="ArrayTokenizer"/>: rejection of input that does not begin with '[',
/// arrays holding a single element, and arrays nested to several levels.
/// </summary>
public class ArrayTokenizerTests
{
    private readonly ArrayTokenizer tokenizer = new ArrayTokenizer();

    /// <summary>
    /// A first byte other than '[' must make the tokenizer return false and leave the token null.
    /// </summary>
    /// <param name="s">Input text whose first character is not an opening square bracket.</param>
    [Theory]
    [InlineData("]")]
    [InlineData("<")]
    [InlineData(" [")]
    [InlineData("a")]
    [InlineData("\0")]
    public void InvalidFirstCharacter_ReturnsFalse(string s)
    {
        var input = StringBytesTestConverter.Convert(s);

        var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

        Assert.False(result);
        Assert.Null(token);
    }

    /// <summary>
    /// Cases for <see cref="SingleElementArray"/>: the array text and the expected value of its only element.
    /// </summary>
    public static IEnumerable<object[]> SingleElementTestData => new[]
    {
        new object[] {"[12]", 12m},
        new object[] {"[ 12 ]", 12m},
        // The second line of this verbatim string must stay at column 0: any indentation would become part of the input.
        new object[] {@"[
2948344 ]", 2948344m},
        new object[] { "[(Bertrand) \t]", "Bertrand" },
        new object[] { "[ <AE>\r\n]", "®" },
    };

    /// <summary>
    /// An array with one element tokenizes to an <see cref="ArrayToken"/> containing exactly that element.
    /// </summary>
    /// <param name="s">The array text to tokenize.</param>
    /// <param name="dataValue">The expected Data value of the single element.</param>
    [Theory]
    [MemberData(nameof(SingleElementTestData))]
    public void SingleElementArray(string s, object dataValue)
    {
        var input = StringBytesTestConverter.Convert(s);

        var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

        Assert.True(result);
        var array = AssertArrayToken(token);

        // Assert.Single verifies the count and returns the element in one step (xUnit2013).
        var element = Assert.Single(array.Data);

        // The element's concrete token type varies per case, so its Data property is read dynamically.
        Assert.Equal(dataValue, ((dynamic)element).Data);
    }

    /// <summary>
    /// An array containing numbers, names, a string and one nested array keeps element order and types.
    /// </summary>
    [Fact]
    public void NestedArray()
    {
        const string s = "[ 12 +10.453 /Fonts [ /F1 /F3 ] (Moreover) ]";
        var input = StringBytesTestConverter.Convert(s);

        var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

        Assert.True(result);
        var array = AssertArrayToken(token);

        Assert.Equal(12m, AssertDataToken<NumericToken, decimal>(0, array).Data);
        Assert.Equal(10.453m, AssertDataToken<NumericToken, decimal>(1, array).Data);
        Assert.Equal(CosName.Create("Fonts"), AssertDataToken<NameToken, CosName>(2, array).Data);

        var inner = AssertArrayToken(array.Data[3]);
        Assert.Equal(CosName.Create("F1"), AssertDataToken<NameToken, CosName>(0, inner).Data);
        Assert.Equal(CosName.Create("F3"), AssertDataToken<NameToken, CosName>(1, inner).Data);

        Assert.Equal("Moreover", AssertDataToken<StringToken, string>(4, array).Data);
    }

    /// <summary>
    /// Arrays nested two and three levels deep, including adjacent closing brackets, are tokenized correctly.
    /// </summary>
    [Fact]
    public void ManyNestedArrays()
    {
        const string s = "[ /Bounds [ [19 -69.] [7 64.625]] (More) [[[15]]]]";
        var input = StringBytesTestConverter.Convert(s);

        var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

        Assert.True(result);
        var array = AssertArrayToken(token);

        Assert.Equal(CosName.Create("Bounds"), AssertDataToken<NameToken, CosName>(0, array).Data);

        var firstInner = AssertArrayToken(array.Data[1]);

        var firstFirstInner = AssertArrayToken(firstInner.Data[0]);
        Assert.Equal(19m, AssertDataToken<NumericToken, decimal>(0, firstFirstInner).Data);
        Assert.Equal(-69m, AssertDataToken<NumericToken, decimal>(1, firstFirstInner).Data);

        var secondFirstInner = AssertArrayToken(firstInner.Data[1]);
        Assert.Equal(7m, AssertDataToken<NumericToken, decimal>(0, secondFirstInner).Data);
        Assert.Equal(64.625m, AssertDataToken<NumericToken, decimal>(1, secondFirstInner).Data);

        Assert.Equal("More", AssertDataToken<StringToken, string>(2, array).Data);

        var secondInner = AssertArrayToken(array.Data[3]);
        var firstSecondInner = AssertArrayToken(secondInner.Data[0]);
        var firstFirstSecondInner = AssertArrayToken(firstSecondInner.Data[0]);
        Assert.Equal(15m, AssertDataToken<NumericToken, decimal>(0, firstFirstSecondInner).Data);
    }

    /// <summary>
    /// Asserts that <paramref name="token"/> is a non-null <see cref="ArrayToken"/> and returns it typed.
    /// </summary>
    private static ArrayToken AssertArrayToken(IToken token)
    {
        Assert.NotNull(token);
        return Assert.IsType<ArrayToken>(token);
    }

    /// <summary>
    /// Asserts that <paramref name="array"/> has an element at <paramref name="index"/> of exactly type
    /// <typeparamref name="T"/> and returns it typed.
    /// </summary>
    private static T AssertDataToken<T, TData>(int index, ArrayToken array) where T : IDataToken<TData>
    {
        Assert.True(array.Data.Count > index);
        return Assert.IsType<T>(array.Data[index]);
    }
}
}

View File

@@ -4,23 +4,18 @@ namespace UglyToad.Pdf.Tests.Tokenization.Scanner
using System;
using System.Collections.Generic;
using IO;
using Parser.Parts;
using Pdf.Cos;
using Pdf.Parser.Parts;
using Pdf.Tokenization.Scanner;
using Pdf.Tokenization.Tokens;
using Xunit;
public class CoreTokenScannerTests
{
private readonly CosDictionaryParser dictionaryParser = new CosDictionaryParser(new CosNameParser(), new TestingLog());
private readonly CosArrayParser arrayParser = new CosArrayParser();
private readonly Func<IInputBytes, CoreTokenScanner> scannerFactory;
// Initializes scannerFactory with a lambda that creates a CoreTokenScanner over the given input bytes.
// NOTE(review): scannerFactory is assigned twice in a row, so as written only the second lambda
// (the single-argument constructor call) takes effect. The two lines look like the removed and
// added sides of a diff hunk rendered together — confirm that only the second should remain.
public CoreTokenScannerTests()
{
scannerFactory = x => new CoreTokenScanner(x, dictionaryParser, arrayParser);
scannerFactory = x => new CoreTokenScanner(x);
}
[Fact]

View File

@@ -600,7 +600,7 @@ namespace UglyToad.Pdf.Cos
/// <summary>
/// Returns a string representation of this name built from <c>Name</c>.
/// </summary>
// NOTE(review): there are two consecutive return statements; as written the first
// ("CosName{...}") wins and the second ("/" followed by the name) is unreachable. They look like
// the removed and added sides of a diff hunk rendered together — confirm which form is intended.
public override string ToString()
{
return $"CosName{{{Name}}}";
return $"/{Name}";
}
public void WriteToPdfStream(StreamWriter output)

View File

@@ -0,0 +1,47 @@
namespace UglyToad.Pdf.Tokenization
{
using System.Collections.Generic;
using IO;
using Scanner;
using Tokens;
/// <summary>
/// Tokenizer for arrays: given an opening '[' it reads the contained tokens with a
/// <see cref="CoreTokenScanner"/> in <see cref="ScannerScope.Array"/> scope and wraps them in an
/// <see cref="ArrayToken"/>. Nested arrays are produced by the scanner as nested array tokens.
/// </summary>
public class ArrayTokenizer : ITokenizer
{
    /// <summary>
    /// Always false for this tokenizer.
    /// </summary>
    public bool ReadsNextByte => false;

    /// <summary>
    /// Attempts to read an array starting at <paramref name="currentByte"/>.
    /// </summary>
    /// <param name="currentByte">The byte at the current position; must be '[' for tokenizing to proceed.</param>
    /// <param name="inputBytes">The input the array's elements are scanned from.</param>
    /// <param name="token">
    /// An <see cref="ArrayToken"/> with the elements read when this method returns true; otherwise null.
    /// </param>
    /// <returns>False when <paramref name="currentByte"/> is not '['; true otherwise.</returns>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
    {
        token = null;

        var startsArray = currentByte == '[';
        if (!startsArray)
        {
            return false;
        }

        var elementScanner = new CoreTokenScanner(inputBytes, ScannerScope.Array);
        var elements = new List<IToken>();

        // Before each read, check whether the input is already on this array's closing bracket;
        // otherwise keep collecting tokens until the scanner has nothing more to give.
        for (IToken lastRead = null;
            !ClosesThisArray(inputBytes, lastRead) && elementScanner.MoveNext();
            lastRead = elementScanner.CurrentToken)
        {
            elements.Add(elementScanner.CurrentToken);
        }

        token = new ArrayToken(elements);
        return true;
    }

    /// <summary>
    /// True when the input's current byte is ']' and the previously read token was not itself an array.
    /// A ']' seen right after a nested array token is not taken as the end of this array
    /// (presumably it is the nested array's own closing bracket — confirm against the scanner).
    /// </summary>
    private static bool ClosesThisArray(IInputBytes inputBytes, IToken previousToken)
    {
        var onClosingBracket = inputBytes.CurrentByte == ']';
        var previousWasArray = previousToken is ArrayToken;

        return onClosingBracket && !previousWasArray;
    }
}
}

View File

@@ -8,28 +8,33 @@
using Tokenization;
using Tokens;
/// <summary>
/// The kind of enclosing structure a <see cref="CoreTokenScanner"/> is reading tokens inside.
/// The scanner uses it to decide which closing delimiter stops scanning.
/// </summary>
internal enum ScannerScope
{
/// <summary>
/// No enclosing structure; the default when a scanner is created without an explicit scope.
/// </summary>
None,
/// <summary>
/// Inside an array: the scanner stops (returns false) when it reads a ']' byte.
/// </summary>
Array,
/// <summary>
/// Inside a dictionary: the scanner counts '>' bytes and stops (returns false) at the second one.
/// </summary>
Dictionary
}
public class CoreTokenScanner : ITokenScanner
{
private readonly CosDictionaryParser dictionaryParser;
private readonly CosArrayParser arrayParser;
private readonly IInputBytes inputBytes;
private readonly List<byte> currentBuffer = new List<byte>();
private static readonly HexTokenizer HexTokenizer = new HexTokenizer();
private static readonly StringTokenizer StringTokenizer = new StringTokenizer();
private static readonly Tokenization.NumericTokenizer NumericTokenizer = new Tokenization.NumericTokenizer();
private static readonly NameTokenizer NameTokenizer = new NameTokenizer();
private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer();
private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer();
private readonly ScannerScope scope;
private readonly IInputBytes inputBytes;
private readonly List<byte> currentBuffer = new List<byte>();
public IToken CurrentToken { get; private set; }
private bool hasBytePreRead;
internal CoreTokenScanner(IInputBytes inputBytes, CosDictionaryParser dictionaryParser,
CosArrayParser arrayParser)
internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None)
{
this.dictionaryParser = dictionaryParser;
this.arrayParser = arrayParser;
this.scope = scope;
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
}
@@ -37,12 +42,15 @@
{
currentBuffer.Clear();
var endAngleBracesRead = 0;
bool isSkippingSymbol = false;
while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext())
{
hasBytePreRead = false;
var currentByte = inputBytes.CurrentByte;
var c = (char) currentByte;
if (BaseTextComponentApproach.IsEmpty(currentByte)
|| ReadHelper.IsWhitespace(currentByte))
{
@@ -57,7 +65,7 @@
}
ITokenizer tokenizer = null;
switch ((char) currentByte)
switch (c)
{
case '(':
tokenizer = StringTokenizer;
@@ -74,9 +82,18 @@
tokenizer = HexTokenizer;
}
break;
case '[':
// TODO: Array tokenizer
case '>' when scope == ScannerScope.Dictionary:
endAngleBracesRead++;
if (endAngleBracesRead == 2)
{
return false;
}
break;
case '[':
tokenizer = ArrayTokenizer;
break;
case ']' when scope == ScannerScope.Array:
return false;
case '/':
tokenizer = NameTokenizer;
break;

View File

@@ -0,0 +1,29 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
using System.Collections.Generic;
using System.Text;
/// <summary>
/// A token representing an array; its <see cref="Data"/> is the ordered list of tokens the array contains.
/// </summary>
public class ArrayToken : IDataToken<IReadOnlyList<IToken>>
{
    /// <summary>
    /// The tokens contained in the array, in the order they were supplied.
    /// </summary>
    public IReadOnlyList<IToken> Data { get; }

    /// <summary>
    /// Creates an array token wrapping the given element tokens.
    /// </summary>
    /// <param name="data">The tokens contained in the array.</param>
    public ArrayToken(IReadOnlyList<IToken> data) => Data = data;

    /// <summary>
    /// Formats the array as "[ " followed by each element's string form and a single space, then "]".
    /// An array with no elements is rendered as "[ ]".
    /// </summary>
    public override string ToString()
    {
        if (Data.Count == 0)
        {
            return "[ ]";
        }

        // Joining with single spaces and padding both ends matches "element + space" repeated per element.
        return "[ " + string.Join(" ", Data) + " ]";
    }
}
}