Create name tokenizer and an approach for reading non-variable data tokens

This commit is contained in:
Eliot Jones
2017-11-10 23:58:28 +00:00
parent f986e16c97
commit 096278de50
11 changed files with 332 additions and 149 deletions

View File

@@ -1,128 +0,0 @@
using System;
using System.Text;
namespace UglyToad.Pdf.Tests.Parser.Parts
{
using IO;
using Pdf.Parser.Parts;
using Xunit;
/// <summary>
/// Exercises <see cref="CosNameParser"/> against plain names, names containing
/// '#'-prefixed hex escapes, and invalid starting positions.
/// </summary>
public class CosNameParserTests
{
    private readonly CosNameParser parser = new CosNameParser();

    /// <summary>
    /// Wraps the UTF-8 bytes of <paramref name="text"/> in a buffer the parser can read.
    /// </summary>
    private static RandomAccessBuffer BufferFor(string text)
    {
        return new RandomAccessBuffer(Encoding.UTF8.GetBytes(text));
    }

    [Fact]
    public void ReaderNull_Throws()
    {
        Action parseNull = () => parser.Parse(null);

        Assert.Throws<ArgumentNullException>(parseNull);
    }

    [Fact]
    public void ReadsName()
    {
        var parsed = parser.Parse(BufferFor("/Type /XRef"));

        Assert.Equal("Type", parsed.Name);
    }

    [Fact]
    public void ReadsNameNoEndSpace()
    {
        // The second '/' ends the first name even without whitespace between them.
        var parsed = parser.Parse(BufferFor("/Type/XRef"));

        Assert.Equal("Type", parsed.Name);
    }

    [Fact]
    public void ReadsName_NotAtForwardSlash_Throws()
    {
        // The buffer is built outside the action so only Parse itself is under test.
        var buffer = BufferFor(" /Type");

        Action parseFromSpace = () => parser.Parse(buffer);

        Assert.Throws<InvalidOperationException>(parseFromSpace);
    }

    [Fact]
    public void ReadsNameAtEndOfStream()
    {
        var parsed = parser.Parse(BufferFor("/Xref"));

        Assert.Equal("Xref", parsed.Name);
    }

    [Theory]
    [InlineData("/Name1", "Name1")]
    [InlineData("/ASomewhatLongerName", "ASomewhatLongerName")]
    [InlineData("/A;Name_WithVarious***Characters?", "A;Name_WithVarious***Characters?")]
    [InlineData("/1.2", "1.2")]
    [InlineData("/$$", "$$")]
    [InlineData("/@pattern", "@pattern")]
    [InlineData("/.notdef", ".notdef")]
    public void ReadsValidPdfNames(string inputString, string expected)
    {
        var parsed = parser.Parse(BufferFor(inputString));

        Assert.Equal(expected, parsed.Name);
    }

    [Theory]
    [InlineData("/Adobe#20Green", "Adobe Green")]
    [InlineData("/PANTONE#205757#20CV", "PANTONE 5757 CV")]
    [InlineData("/paired#28#29parentheses", "paired()parentheses")]
    [InlineData("/The_Key_of_F#23_Minor", "The_Key_of_F#_Minor")]
    [InlineData("/A#42", "AB")]
    public void ReadsHexNames(string inputString, string expected)
    {
        // Each "#xx" pair in the input is expected to decode to the byte with that hex value.
        var parsed = parser.Parse(BufferFor(inputString));

        Assert.Equal(expected, parsed.Name);
    }
}
}

View File

@@ -1,13 +1,13 @@
namespace UglyToad.Pdf.Tests
{
using System.Linq;
using System.Text;
using IO;
public static class StringBytesTestConverter
{
public static Result Convert(string s)
{
var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
var input = new ByteArrayInputBytes(Encoding.UTF8.GetBytes(s));
input.MoveNext();
var initialByte = input.CurrentByte;

View File

@@ -0,0 +1,124 @@
namespace UglyToad.Pdf.Tests.Tokenization
{
using Pdf.Tokenization;
using Pdf.Tokenization.Tokens;
using Xunit;
/// <summary>
/// Exercises <see cref="NameTokenizer"/> against plain names, names containing
/// '#'-prefixed hex escapes, and input that does not start on a '/'.
/// </summary>
public class NameTokenizerTests
{
    private readonly NameTokenizer tokenizer = new NameTokenizer();

    /// <summary>
    /// Tokenizes <paramref name="s"/>, asserting that tokenization succeeds and that the
    /// produced token is a non-null <see cref="NameToken"/>.
    /// </summary>
    private NameToken TokenizeExpectingName(string s)
    {
        var input = StringBytesTestConverter.Convert(s);

        var succeeded = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

        Assert.True(succeeded);
        Assert.NotNull(token);
        return Assert.IsType<NameToken>(token);
    }

    [Fact]
    public void ReadsName()
    {
        var name = TokenizeExpectingName("/Type /XRef");

        Assert.Equal("Type", name.Data.Name);
    }

    [Fact]
    public void ReadsNameNoEndSpace()
    {
        // The second '/' ends the first name even without whitespace between them.
        var name = TokenizeExpectingName("/Type/XRef");

        Assert.Equal("Type", name.Data.Name);
    }

    [Fact]
    public void ReadsName_NotAtForwardSlash_Throws()
    {
        // Despite the method name, the tokenizer signals this case by returning false
        // rather than by throwing.
        var input = StringBytesTestConverter.Convert(" /Type");

        var succeeded = tokenizer.TryTokenize(input.First, input.Bytes, out _);

        Assert.False(succeeded);
    }

    [Fact]
    public void ReadsNameAtEndOfStream()
    {
        var name = TokenizeExpectingName("/XRef");

        Assert.Equal("XRef", name.Data.Name);
    }

    [Fact]
    public void FallsBackToUnescapedForEarlyPdfTypes()
    {
        // '#' followed by something that is not two hex digits must be kept verbatim.
        var name = TokenizeExpectingName("/Priorto1.2#INvalidHexHash");

        Assert.Equal("Priorto1.2#INvalidHexHash", name.Data.Name);
    }

    [Theory]
    [InlineData("/Name1", "Name1")]
    [InlineData("/ASomewhatLongerName", "ASomewhatLongerName")]
    [InlineData("/AName_With;Various***Characters?", "AName_With;Various***Characters?")]
    [InlineData("/1.2", "1.2")]
    [InlineData("/$$", "$$")]
    [InlineData("/@pattern", "@pattern")]
    [InlineData("/.notdef", ".notdef")]
    public void ReadsValidPdfNames(string s, string expected)
    {
        var name = TokenizeExpectingName(s);

        Assert.Equal(expected, name.Data.Name);
    }

    [Theory]
    [InlineData("/Adobe#20Green", "Adobe Green")]
    [InlineData("/PANTONE#205757#20CV", "PANTONE 5757 CV")]
    [InlineData("/paired#28#29parentheses", "paired()parentheses")]
    [InlineData("/The_Key_of_F#23_Minor", "The_Key_of_F#_Minor")]
    [InlineData("/A#42", "AB")]
    public void ReadsHexNames(string s, string expected)
    {
        // Each "#xx" pair in the input is expected to decode to the byte with that hex value.
        var name = TokenizeExpectingName(s);

        Assert.Equal(expected, name.Data.Name);
    }
}
}

View File

@@ -1,13 +0,0 @@
namespace UglyToad.Pdf.IO
{
using System;
using Tokenization.Tokens;
/// <summary>
/// Placeholder <see cref="ITokenizer"/> for names; it contains no tokenization logic.
/// </summary>
public class NameTokenizer : ITokenizer
{
    /// <summary>
    /// Not implemented: every call throws.
    /// </summary>
    /// <param name="currentByte">Unused.</param>
    /// <param name="inputBytes">Unused.</param>
    /// <param name="token">Never assigned because the method always throws.</param>
    /// <returns>Never returns.</returns>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
        => throw new NotImplementedException();
}
}

View File

@@ -81,7 +81,7 @@ namespace UglyToad.Pdf.Parser.Parts
}
byte[] bytes = memoryStream.ToArray();
var str = ReadHelper.IsValidUTF8(bytes) ? Encoding.UTF8.GetString(memoryStream.ToArray()) : Encoding.GetEncoding("windows-1252").GetString(memoryStream.ToArray());
var str = ReadHelper.IsValidUtf8(bytes) ? Encoding.UTF8.GetString(memoryStream.ToArray()) : Encoding.GetEncoding("windows-1252").GetString(memoryStream.ToArray());
return CosName.Create(str);
}
}

View File

@@ -314,17 +314,18 @@ namespace UglyToad.Pdf.Parser.Parts
return char.IsDigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
}
public static bool IsValidUTF8(byte[] input)
public static bool IsValidUtf8(byte[] input)
{
try
{
Decoder d = Encoding.UTF8.GetDecoder();
var d = Encoding.UTF8.GetDecoder();
var charLength = d.GetCharCount(input, 0, input.Length);
var chars = new char[charLength];
d.Convert(input, 0, input.Length, chars, 0, charLength, true, out _, out _, out _);
return true;
}
catch (Exception e)
catch (Exception)
{
return false;
}

View File

@@ -1,6 +1,7 @@
namespace UglyToad.Pdf.IO
namespace UglyToad.Pdf.Tokenization
{
using Tokenization.Tokens;
using IO;
using Tokens;
internal interface ITokenizer
{

View File

@@ -0,0 +1,109 @@
namespace UglyToad.Pdf.Tokenization
{
using System;
using System.Collections.Generic;
using System.Text;
using IO;
using Parser.Parts;
using Tokens;
/// <summary>
/// Reads a name that starts with '/' and produces a <see cref="NameToken"/>.
/// A '#' followed by two hex digits is decoded to the byte with that value; a '#'
/// that is not followed by two hex digits is kept in the name verbatim.
/// </summary>
public class NameTokenizer : ITokenizer
{
    /// <summary>
    /// Attempts to read a name whose leading '/' is <paramref name="currentByte"/>.
    /// </summary>
    /// <param name="currentByte">The byte the input is currently positioned on.</param>
    /// <param name="inputBytes">
    /// The input to read the rest of the name from. Reading stops on the first byte for
    /// which <c>ReadHelper.IsEndOfName</c> is true (that byte has been moved onto but is
    /// not part of the name) or when the input is exhausted.
    /// </param>
    /// <param name="token">
    /// The resulting <see cref="NameToken"/> on success, otherwise <c>null</c>.
    /// </param>
    /// <returns>
    /// <c>false</c> if <paramref name="currentByte"/> is not '/', otherwise <c>true</c>.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// If two characters accepted as hex digits cannot be converted to a number.
    /// </exception>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
    {
        token = null;

        if (currentByte != '/')
        {
            return false;
        }

        var bytes = new List<byte>();

        // escapeActive: a '#' has been seen and its two hex digits are not complete yet.
        // postEscapeRead: how many hex digits of the current escape are in escapedChars.
        var escapeActive = false;
        var postEscapeRead = 0;
        var escapedChars = new char[2];

        while (inputBytes.MoveNext())
        {
            var b = inputBytes.CurrentByte;

            if (b == '#')
            {
                // A new '#' before the previous escape completed (e.g. "##" or "#A#42"):
                // the previous one was not an escape, so keep it as literal text instead
                // of dropping it or mixing its digit into the new escape.
                if (escapeActive)
                {
                    WriteIncompleteEscape(bytes, escapedChars, postEscapeRead);
                }

                escapeActive = true;
                postEscapeRead = 0;
                continue;
            }

            if (!escapeActive)
            {
                if (ReadHelper.IsEndOfName(b))
                {
                    break;
                }

                bytes.Add(b);
                continue;
            }

            if (ReadHelper.IsHexDigit((char)b))
            {
                escapedChars[postEscapeRead] = (char)b;
                postEscapeRead++;

                if (postEscapeRead == 2)
                {
                    string hex = new string(escapedChars);

                    try
                    {
                        var characterToWrite = (byte)Convert.ToInt32(hex, 16);
                        bytes.Add(characterToWrite);
                    }
                    catch (FormatException e)
                    {
                        throw new InvalidOperationException("Error: expected hex digit, actual='" + hex + "'", e);
                    }

                    escapeActive = false;
                    postEscapeRead = 0;
                }

                continue;
            }

            // '#' was followed by something other than two hex digits: fall back to
            // treating the '#' (and any single hex digit read so far) as literal text.
            WriteIncompleteEscape(bytes, escapedChars, postEscapeRead);
            escapeActive = false;
            postEscapeRead = 0;

            if (ReadHelper.IsEndOfName(b))
            {
                break;
            }

            bytes.Add(b);
        }

        // The input ended in the middle of an escape (e.g. "/Name#" or "/Name#A"):
        // keep what was read rather than silently truncating the name.
        if (escapeActive)
        {
            WriteIncompleteEscape(bytes, escapedChars, postEscapeRead);
        }

        byte[] byteArray = bytes.ToArray();

        // Prefer UTF-8; bytes which are not valid UTF-8 are decoded as windows-1252.
        var str = ReadHelper.IsValidUtf8(byteArray)
            ? Encoding.UTF8.GetString(byteArray)
            : Encoding.GetEncoding("windows-1252").GetString(byteArray);

        token = new NameToken(str);

        return true;
    }

    /// <summary>
    /// Writes an abandoned escape back as literal text: the '#' and, if exactly one hex
    /// digit had been read, that digit.
    /// </summary>
    private static void WriteIncompleteEscape(List<byte> bytes, char[] escapedChars, int digitsRead)
    {
        bytes.Add((byte)'#');

        if (digitsRead == 1)
        {
            bytes.Add((byte)escapedChars[0]);
        }
    }
}
}

View File

@@ -0,0 +1,63 @@
namespace UglyToad.Pdf.Tokenization
{
using System;
using System.Text;
using IO;
using Parser.Parts;
using Tokens;
/// <summary>
/// Reads a run of bytes up to the next whitespace or one of '&lt;', '[' or '/' as text,
/// intended for recognising fixed keywords such as "true", "null" or "endobj".
/// </summary>
public class PlainTokenizer : ITokenizer
{
    /// <summary>
    /// Attempts to read a plain keyword starting at <paramref name="currentByte"/>.
    /// </summary>
    /// <param name="currentByte">The first byte of the candidate keyword.</param>
    /// <param name="inputBytes">
    /// The input to read the remaining bytes from. Reading stops on the first whitespace
    /// byte, on '&lt;', '[' or '/', or when the input is exhausted.
    /// </param>
    /// <param name="token">
    /// Currently always <c>null</c>: no branch of the keyword switch creates a token yet.
    /// </param>
    /// <returns>
    /// <c>false</c> if <paramref name="currentByte"/> is whitespace, otherwise <c>true</c>.
    /// </returns>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
    {
        token = null;

        if (ReadHelper.IsWhitespace(currentByte))
        {
            return false;
        }

        var builder = new StringBuilder();

        // Cast to char: StringBuilder.Append(byte) would append the decimal number
        // (e.g. "116" for 't') instead of the character.
        builder.Append((char)currentByte);

        while (inputBytes.MoveNext())
        {
            // Use the byte just moved onto, not the byte the method was called with.
            var b = inputBytes.CurrentByte;

            if (ReadHelper.IsWhitespace(b))
            {
                break;
            }

            if (b == '<' || b == '[' || b == '/')
            {
                break;
            }

            builder.Append((char)b);
        }

        var text = builder.ToString();

        // NOTE(review): none of these branches assigns a token yet, so callers receive
        // true together with a null token. The cases are kept as the list of keywords
        // this tokenizer is expected to recognise.
        switch (text)
        {
            case "true":
                break;
            case "false":
                break;
            case "null":
                break;
            case "endstream":
                break;
            case "stream":
                break;
            case "obj":
                break;
            case "endobj":
                break;
            default:
                break;
        }

        return true;
    }
}
}

View File

@@ -0,0 +1,12 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
/// <summary>
/// A token carrying a single boolean value.
/// </summary>
public class BooleanToken : IDataToken<bool>
{
    /// <summary>
    /// Creates a token holding <paramref name="data"/>.
    /// </summary>
    /// <param name="data">The boolean value this token represents.</param>
    public BooleanToken(bool data) => Data = data;

    /// <summary>
    /// The value supplied at construction; read-only afterwards.
    /// </summary>
    public bool Data { get; }
}
}

View File

@@ -0,0 +1,14 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
using Cos;
/// <summary>
/// A token carrying a <see cref="CosName"/> built from the text of a name.
/// </summary>
public class NameToken : IDataToken<CosName>
{
    /// <summary>
    /// Creates a token whose <see cref="Data"/> is <c>CosName.Create(text)</c>.
    /// </summary>
    /// <param name="text">The name text, passed to <c>CosName.Create</c> unchanged.</param>
    public NameToken(string text) => Data = CosName.Create(text);

    /// <summary>
    /// The name created from the constructor argument; read-only afterwards.
    /// </summary>
    public CosName Data { get; }
}
}