From 158cd5f2e359a5ff775d53988147e5aa4dd17468 Mon Sep 17 00:00:00 2001
From: Eliot Jones <redacted@example.invalid>
Date: Sun, 12 Nov 2017 14:42:01 +0000
Subject: [PATCH] support for tokenizing arrays and nested arrays

---
 .../Tokenization/ArrayTokenizerTests.cs       | 137 ++++++++++++++++++
 .../Scanner/CoreTokenScannerTests.cs          |   7 +-
 src/UglyToad.Pdf/Cos/CosName.cs               |   2 +-
 .../Tokenization/ArrayTokenizer.cs            |  47 ++++++
 .../Tokenization/Scanner/CoreTokenScanner.cs  |  43 ++++--
 .../Tokenization/Tokens/ArrayToken.cs         |  29 ++++
 6 files changed, 245 insertions(+), 20 deletions(-)
 create mode 100644 src/UglyToad.Pdf.Tests/Tokenization/ArrayTokenizerTests.cs
 create mode 100644 src/UglyToad.Pdf/Tokenization/ArrayTokenizer.cs
 create mode 100644 src/UglyToad.Pdf/Tokenization/Tokens/ArrayToken.cs

diff --git a/src/UglyToad.Pdf.Tests/Tokenization/ArrayTokenizerTests.cs b/src/UglyToad.Pdf.Tests/Tokenization/ArrayTokenizerTests.cs
new file mode 100644
index 00000000..dc695450
--- /dev/null
+++ b/src/UglyToad.Pdf.Tests/Tokenization/ArrayTokenizerTests.cs
@@ -0,0 +1,137 @@
+namespace UglyToad.Pdf.Tests.Tokenization
+{
+    using System.Collections.Generic;
+    using Pdf.Cos;
+    using Pdf.Tokenization;
+    using Pdf.Tokenization.Tokens;
+    using Xunit;
+
+    public class ArrayTokenizerTests
+    {
+        private readonly ArrayTokenizer tokenizer = new ArrayTokenizer();
+
+        [Theory]
+        [InlineData("]")]
+        [InlineData("<")]
+        [InlineData(" [")]
+        [InlineData("a")]
+        [InlineData("\0")]
+        public void InvalidFirstCharacter_ReturnsFalse(string s)
+        {
+            var input = StringBytesTestConverter.Convert(s);
+
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
+
+            Assert.False(result);
+            Assert.Null(token);
+        }
+
+        public static IEnumerable<object[]> SingleElementTestData => new[]
+        {
+            new object[] {"[12]", 12m},
+            new object[] {"[ 12 ]", 12m},
+            new object[] {@"[
+2948344 ]", 2948344m},
+            new object[] { "[(Bertrand) \t]", "Bertrand" },
+            new object[] { "[ <AE>\r\n]", "®" },
+        };
+
+        [Theory]
+        [MemberData(nameof(SingleElementTestData))]
+        public void SingleElementArray(string s, object dataValue)
+        {
+            var input = StringBytesTestConverter.Convert(s);
+
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
+
+            Assert.True(result);
+
+            var array = AssertArrayToken(token);
+
+            Assert.Equal(1, array.Data.Count);
+
+            Assert.Equal(dataValue, ((dynamic)token).Data[0].Data);
+        }
+
+        [Fact]
+        public void NestedArray()
+        {
+            const string s = "[ 12 +10.453 /Fonts [ /F1 /F3 ] (Moreover) ]";
+
+            var input = StringBytesTestConverter.Convert(s);
+
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
+
+            Assert.True(result);
+
+            var array = AssertArrayToken(token);
+
+            Assert.Equal(12m, AssertDataToken<NumericToken>(0, array).Data);
+            Assert.Equal(10.453m, AssertDataToken<NumericToken>(1, array).Data);
+            Assert.Equal(CosName.Create("Fonts"), AssertDataToken<NameToken>(2, array).Data);
+
+            var inner = AssertArrayToken(array.Data[3]);
+
+            Assert.Equal(CosName.Create("F1"), AssertDataToken<NameToken>(0, inner).Data);
+            Assert.Equal(CosName.Create("F3"), AssertDataToken<NameToken>(1, inner).Data);
+
+            Assert.Equal("Moreover", AssertDataToken<StringToken>(4, array).Data);
+        }
+
+        [Fact]
+        public void ManyNestedArrays()
+        {
+            const string s = "[ /Bounds [ [19 -69.] [7 64.625]] (More) [[[15]]]]";
+
+            var input = StringBytesTestConverter.Convert(s);
+
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
+
+            Assert.True(result);
+
+            var array = AssertArrayToken(token);
+
+            Assert.Equal(CosName.Create("Bounds"), AssertDataToken<NameToken>(0, array).Data);
+
+            var firstInner = AssertArrayToken(array.Data[1]);
+
+            var firstFirstInner = AssertArrayToken(firstInner.Data[0]);
+
+            Assert.Equal(19m, AssertDataToken<NumericToken>(0, firstFirstInner).Data);
+            Assert.Equal(-69m, AssertDataToken<NumericToken>(1, firstFirstInner).Data);
+
+            var secondFirstInner = AssertArrayToken(firstInner.Data[1]);
+
+            Assert.Equal(7m, AssertDataToken<NumericToken>(0, secondFirstInner).Data);
+            Assert.Equal(64.625m, AssertDataToken<NumericToken>(1, secondFirstInner).Data);
+
+            Assert.Equal("More", AssertDataToken<StringToken>(2, array).Data);
+
+            var secondInner = AssertArrayToken(array.Data[3]);
+
+            var firstSecondInner = AssertArrayToken(secondInner.Data[0]);
+
+            var firstFirstSecondInner = AssertArrayToken(firstSecondInner.Data[0]);
+
+            Assert.Equal(15m, AssertDataToken<NumericToken>(0, firstFirstSecondInner).Data);
+        }
+
+        private static ArrayToken AssertArrayToken(IToken token)
+        {
+            Assert.NotNull(token);
+
+            var result = Assert.IsType<ArrayToken>(token);
+
+            return result;
+        }
+
+        private static T AssertDataToken<T>(int index, ArrayToken array) where T : IDataToken
+        {
+            Assert.True(array.Data.Count > index);
+
+            var result = Assert.IsType<T>(array.Data[index]);
+
+            return result;
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs b/src/UglyToad.Pdf.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs
index 5376fc63..04f09d7a 100644
--- a/src/UglyToad.Pdf.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs
+++ b/src/UglyToad.Pdf.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs
@@ -4,23 +4,18 @@ namespace UglyToad.Pdf.Tests.Tokenization.Scanner
     using System;
     using System.Collections.Generic;
     using IO;
-    using Parser.Parts;
     using Pdf.Cos;
-    using Pdf.Parser.Parts;
     using Pdf.Tokenization.Scanner;
     using Pdf.Tokenization.Tokens;
     using Xunit;
 
     public class CoreTokenScannerTests
     {
-        private readonly CosDictionaryParser dictionaryParser = new CosDictionaryParser(new CosNameParser(), new TestingLog());
-        private readonly CosArrayParser arrayParser = new CosArrayParser();
-
         private readonly Func<IInputBytes, CoreTokenScanner> scannerFactory;
 
         public CoreTokenScannerTests()
         {
-            scannerFactory = x => new CoreTokenScanner(x, dictionaryParser, arrayParser);
+            scannerFactory = x => new CoreTokenScanner(x);
         }
 
         [Fact]
diff --git a/src/UglyToad.Pdf/Cos/CosName.cs b/src/UglyToad.Pdf/Cos/CosName.cs
index 97f5c161..7eb00c72 100644
--- a/src/UglyToad.Pdf/Cos/CosName.cs
+++ b/src/UglyToad.Pdf/Cos/CosName.cs
@@ -600,7 +600,7 @@ namespace UglyToad.Pdf.Cos
 
         public override string ToString()
         {
-            return $"CosName{{{Name}}}";
+            return $"/{Name}";
         }
 
         public void WriteToPdfStream(StreamWriter output)
diff --git a/src/UglyToad.Pdf/Tokenization/ArrayTokenizer.cs b/src/UglyToad.Pdf/Tokenization/ArrayTokenizer.cs
new file mode 100644
index 00000000..9afd06a4
--- /dev/null
+++ b/src/UglyToad.Pdf/Tokenization/ArrayTokenizer.cs
@@ -0,0 +1,47 @@
+namespace UglyToad.Pdf.Tokenization
+{
+    using System.Collections.Generic;
+    using IO;
+    using Scanner;
+    using Tokens;
+
+    public class ArrayTokenizer : ITokenizer
+    {
+        public bool ReadsNextByte { get; } = false;
+
+        public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
+        {
+            token = null;
+
+            if (currentByte != '[')
+            {
+                return false;
+            }
+
+            var scanner = new CoreTokenScanner(inputBytes, ScannerScope.Array);
+
+            var contents = new List<IToken>();
+
+            IToken previousToken = null;
+            while (!CurrentByteEndsCurrentArray(inputBytes, previousToken) && scanner.MoveNext())
+            {
+                previousToken = scanner.CurrentToken;
+                contents.Add(scanner.CurrentToken);
+            }
+
+            token = new ArrayToken(contents);
+
+            return true;
+        }
+
+        private static bool CurrentByteEndsCurrentArray(IInputBytes inputBytes, IToken previousToken)
+        {
+            if (inputBytes.CurrentByte == ']' && !(previousToken is ArrayToken))
+            {
+                return true;
+            }
+
+            return false;
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf/Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.Pdf/Tokenization/Scanner/CoreTokenScanner.cs
index d6018681..ce774e00 100644
--- a/src/UglyToad.Pdf/Tokenization/Scanner/CoreTokenScanner.cs
+++ b/src/UglyToad.Pdf/Tokenization/Scanner/CoreTokenScanner.cs
@@ -8,28 +8,33 @@
     using Tokenization;
     using Tokens;
 
+    internal enum ScannerScope
+    {
+        None,
+        Array,
+        Dictionary
+    }
+
     public class CoreTokenScanner : ITokenScanner
     {
-        private readonly CosDictionaryParser dictionaryParser;
-        private readonly CosArrayParser arrayParser;
-        private readonly IInputBytes inputBytes;
-        private readonly List<byte> currentBuffer = new List<byte>();
-
         private static readonly HexTokenizer HexTokenizer = new HexTokenizer();
         private static readonly StringTokenizer StringTokenizer = new StringTokenizer();
         private static readonly Tokenization.NumericTokenizer NumericTokenizer = new Tokenization.NumericTokenizer();
         private static readonly NameTokenizer NameTokenizer = new NameTokenizer();
         private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer();
+        private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer();
+
+        private readonly ScannerScope scope;
+        private readonly IInputBytes inputBytes;
+        private readonly List<byte> currentBuffer = new List<byte>();
 
         public IToken CurrentToken { get; private set; }
 
         private bool hasBytePreRead;
 
-        internal CoreTokenScanner(IInputBytes inputBytes, CosDictionaryParser dictionaryParser,
-            CosArrayParser arrayParser)
+        internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None)
        {
-            this.dictionaryParser = dictionaryParser;
-            this.arrayParser = arrayParser;
+            this.scope = scope;
             this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
         }
 
@@ -37,12 +42,15 @@
         {
             currentBuffer.Clear();
 
+            var endAngleBracesRead = 0;
+
             bool isSkippingSymbol = false;
             while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext())
             {
                 hasBytePreRead = false;
                 var currentByte = inputBytes.CurrentByte;
-
+                var c = (char) currentByte;
+
                 if (BaseTextComponentApproach.IsEmpty(currentByte)
                     || ReadHelper.IsWhitespace(currentByte))
                 {
@@ -57,7 +65,7 @@
                 }
 
                 ITokenizer tokenizer = null;
-                switch ((char) currentByte)
+                switch (c)
                 {
                     case '(':
                         tokenizer = StringTokenizer;
@@ -74,9 +82,18 @@
                             tokenizer = HexTokenizer;
                         }
                         break;
-                    case '[':
-                        // TODO: Array tokenizer
+                    case '>' when scope == ScannerScope.Dictionary:
+                        endAngleBracesRead++;
+                        if (endAngleBracesRead == 2)
+                        {
+                            return false;
+                        }
                         break;
+                    case '[':
+                        tokenizer = ArrayTokenizer;
+                        break;
+                    case ']' when scope == ScannerScope.Array:
+                        return false;
                     case '/':
                         tokenizer = NameTokenizer;
                         break;
diff --git a/src/UglyToad.Pdf/Tokenization/Tokens/ArrayToken.cs b/src/UglyToad.Pdf/Tokenization/Tokens/ArrayToken.cs
new file mode 100644
index 00000000..2cc31297
--- /dev/null
+++ b/src/UglyToad.Pdf/Tokenization/Tokens/ArrayToken.cs
@@ -0,0 +1,29 @@
+namespace UglyToad.Pdf.Tokenization.Tokens
+{
+    using System.Collections.Generic;
+    using System.Text;
+
+    public class ArrayToken : IDataToken<IReadOnlyList<IToken>>
+    {
+        public IReadOnlyList<IToken> Data { get; }
+
+        public ArrayToken(IReadOnlyList<IToken> data)
+        {
+            Data = data;
+        }
+
+        public override string ToString()
+        {
+            var builder = new StringBuilder("[ ");
+
+            foreach (var token in Data)
+            {
+                builder.Append(token).Append(' ');
+            }
+
+            builder.Append(']');
+
+            return builder.ToString();
+        }
+    }
+}