fixes for octal in strings and tests for string and hex tokenizing

This commit is contained in:
Eliot Jones
2017-11-09 22:52:48 +00:00
parent afe07849d4
commit 83cc1a6bf1
5 changed files with 196 additions and 75 deletions

View File

@@ -0,0 +1,29 @@
namespace UglyToad.Pdf.Tests
{
    using System.Linq;
    using IO;

    /// <summary>
    /// Test helper which wraps a string in a <see cref="ByteArrayInputBytes"/> and captures
    /// the byte the input reports after a single advance.
    /// </summary>
    public static class StringBytesTestConverter
    {
        /// <summary>
        /// Casts every character of <paramref name="s"/> to a byte, wraps the bytes in a
        /// <see cref="ByteArrayInputBytes"/> and calls <c>MoveNext</c> on it once.
        /// </summary>
        /// <param name="s">The text to convert; each character is narrowed with a plain <c>(byte)</c> cast.</param>
        /// <returns>
        /// A <see cref="Result"/> whose <c>First</c> is the input's current byte after that single
        /// <c>MoveNext</c> call and whose <c>Bytes</c> is the same input instance.
        /// </returns>
        public static Result Convert(string s)
        {
            var raw = s.Select(character => (byte)character).ToArray();
            var bytes = new ByteArrayInputBytes(raw);

            bytes.MoveNext();

            var converted = new Result();
            converted.First = bytes.CurrentByte;
            converted.Bytes = bytes;

            return converted;
        }

        /// <summary>
        /// The pair of values produced by <see cref="StringBytesTestConverter.Convert"/>.
        /// </summary>
        public class Result
        {
            /// <summary>The current byte of the input, read right after the first advance.</summary>
            public byte First { get; set; }

            /// <summary>The input the first byte was read from.</summary>
            public IInputBytes Bytes { get; set; }
        }
    }
}

View File

@@ -0,0 +1,25 @@
namespace UglyToad.Pdf.Tests.Tokenization
{
    using Pdf.Tokenization;
    using Xunit;

    /// <summary>
    /// Tests covering inputs which <see cref="HexStringTokenizer"/> is expected to reject.
    /// </summary>
    public class HexStringTokenizerTests
    {
        private readonly HexStringTokenizer tokenizer = new HexStringTokenizer();

        /// <summary>
        /// Each input should make <c>TryTokenize</c> return <c>false</c> and leave the out token null.
        /// </summary>
        [Theory]
        [InlineData(">not hex")]
        [InlineData("\\<not hex")]
        [InlineData("not hex")]
        [InlineData("AE1094 still not hex")]
        public void CannotTokenizeInvalidBytes(string s)
        {
            var converted = StringBytesTestConverter.Convert(s);
            var firstByte = converted.First;
            var remainingBytes = converted.Bytes;

            var wasTokenized = tokenizer.TryTokenize(firstByte, remainingBytes, out var token);

            Assert.False(wasTokenized);
            Assert.Null(token);
        }
    }
}

View File

@@ -1,6 +1,5 @@
namespace UglyToad.Pdf.Tests.Tokenization
{
using System.Linq;
using IO;
using Pdf.Tokenization;
using Pdf.Tokenization.Tokens;
@@ -45,19 +44,13 @@
{
const string s = "(this string \\)contains escaped \\( parentheses)";
var input = new ByteArrayInputBytes(s.Select(x => (byte) x).ToArray());
input.MoveNext();
var initialByte = input.CurrentByte;
var input = StringBytesTestConverter.Convert(s);
var result = tokenizer.TryTokenize(initialByte, input, out var token);
var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
Assert.True(result);
Assert.NotNull(token);
var stringToken = Assert.IsType<StringToken>(token);
Assert.Equal(@"this string )contains escaped ( parentheses", stringToken.Data);
Assert.Equal(@"this string )contains escaped ( parentheses", AssertStringToken(token).Data);
}
[Theory]
@@ -68,18 +61,13 @@
[InlineData("()", "")]
public void CanReadValidStrings(string s, string expected)
{
    // The converter returns both the first byte and the input bytes, so the test no longer
    // builds the input or calls MoveNext/CurrentByte itself.
    var input = StringBytesTestConverter.Convert(s);

    var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

    Assert.True(result);

    // AssertStringToken performs the null and type checks before Data is compared.
    Assert.Equal(expected, AssertStringToken(token).Data);
}
[Fact]
@@ -87,19 +75,13 @@
{
const string s = "(this string (contains nested (two levels)) parentheses)";
var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
var input = StringBytesTestConverter.Convert(s);
input.MoveNext();
var initialByte = input.CurrentByte;
var result = tokenizer.TryTokenize(initialByte, input, out var token);
var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
Assert.True(result);
Assert.NotNull(token);
var stringToken = Assert.IsType<StringToken>(token);
Assert.Equal("this string (contains nested (two levels)) parentheses", stringToken.Data);
Assert.Equal("this string (contains nested (two levels)) parentheses", AssertStringToken(token).Data);
}
[Fact]
@@ -107,19 +89,13 @@
{
const string s = "(this string <contains>)";
var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
var input = StringBytesTestConverter.Convert(s);
input.MoveNext();
var initialByte = input.CurrentByte;
var result = tokenizer.TryTokenize(initialByte, input, out var token);
var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
Assert.True(result);
Assert.NotNull(token);
var stringToken = Assert.IsType<StringToken>(token);
Assert.Equal("this string <contains>", stringToken.Data);
Assert.Equal("this string <contains>", AssertStringToken(token).Data);
}
[Fact]
@@ -130,20 +106,14 @@ two strings \
are the same.)";
const string expected = "These two strings are the same.";
var input = StringBytesTestConverter.Convert(s);
var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
input.MoveNext();
var initialByte = input.CurrentByte;
var result = tokenizer.TryTokenize(initialByte, input, out var token);
var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
Assert.True(result);
Assert.NotNull(token);
var stringToken = Assert.IsType<StringToken>(token);
Assert.Equal(expected, stringToken.Data);
Assert.Equal(expected, AssertStringToken(token).Data);
}
[Fact]
@@ -153,19 +123,76 @@ are the same.)";
const string expected = "So does this one.\n";
var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
var input = StringBytesTestConverter.Convert(s);
input.MoveNext();
var initialByte = input.CurrentByte;
var result = tokenizer.TryTokenize(initialByte, input, out var token);
var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
Assert.True(result);
Assert.Equal(expected, AssertStringToken(token).Data);
}
/// <summary>
/// A string containing the three-digit octal escapes \245 and \307 should be read with each
/// escape replaced by a single character.
/// </summary>
[Fact]
public void ConvertsFullOctal()
{
    const string text = @"(This string contains \245two octal characters\307.)";
    var converted = StringBytesTestConverter.Convert(text);

    var succeeded = tokenizer.TryTokenize(converted.First, converted.Bytes, out var token);

    Assert.True(succeeded);
    var stringToken = AssertStringToken(token);
    Assert.Equal("This string contains ¥two octal charactersÇ.", stringToken.Data);
}
/// <summary>
/// A three-digit octal escape (\245) directly followed by the digit 1 should produce the
/// escaped character and then a literal '1'.
/// </summary>
[Fact]
public void ConvertsFullOctalFollowedByNormalNumber()
{
    const string text = @"(This string contains \2451 octal character.)";
    var converted = StringBytesTestConverter.Convert(text);

    var succeeded = tokenizer.TryTokenize(converted.First, converted.Bytes, out var token);

    Assert.True(succeeded);
    var stringToken = AssertStringToken(token);
    Assert.Equal("This string contains ¥1 octal character.", stringToken.Data);
}
/// <summary>
/// A two-digit octal escape (\53) terminated by a non-octal character should be read as '+'.
/// </summary>
[Fact]
public void ConvertsPartialOctal()
{
    const string text = @"(This string has a plus: \53 as octal)";
    var converted = StringBytesTestConverter.Convert(text);

    var succeeded = tokenizer.TryTokenize(converted.First, converted.Bytes, out var token);

    Assert.True(succeeded);
    var stringToken = AssertStringToken(token);
    Assert.Equal("This string has a plus: + as octal", stringToken.Data);
}
/// <summary>
/// A two-digit escape (\53) immediately followed by another escape (\326) should yield the
/// two characters '+' and 'Ö' with nothing lost between them.
/// </summary>
[Fact]
public void ConvertsTwoPartialOctalsInARow()
{
    const string text = @"(This string has two \53\326ctals)";
    var converted = StringBytesTestConverter.Convert(text);

    var succeeded = tokenizer.TryTokenize(converted.First, converted.Bytes, out var token);

    Assert.True(succeeded);
    var stringToken = AssertStringToken(token);
    Assert.Equal("This string has two +Öctals", stringToken.Data);
}
/// <summary>
/// Asserts that <paramref name="token"/> is a non-null <see cref="StringToken"/> and returns it
/// with that type so the caller can inspect its data.
/// </summary>
/// <param name="token">The token produced by the tokenizer under test.</param>
/// <returns>The same token, typed as <see cref="StringToken"/>.</returns>
private static StringToken AssertStringToken(IToken token)
{
    Assert.NotNull(token);

    // This helper has no 'expected' value in scope; comparing Data is left to the calling test,
    // so only the null and type checks happen here.
    return Assert.IsType<StringToken>(token);
}
}
}

View File

@@ -1,6 +1,7 @@
namespace UglyToad.Pdf.Tokenization
{
using IO;
using Parser.Parts;
using Tokens;
public class HexStringTokenizer : ITokenizer
@@ -18,6 +19,11 @@
{
var current = inputBytes.CurrentByte;
if (ReadHelper.IsWhitespace(current))
{
continue;
}
if (!IsValidHexCharacter(current))
{
return false;

View File

@@ -28,7 +28,7 @@
bool octalModeActive = false;
byte[] octal = { 0, 0, 0 };
short[] octal = { 0, 0, 0 };
int octalsRead = 0;
while (inputBytes.MoveNext())
@@ -36,35 +36,41 @@
var b = inputBytes.CurrentByte;
var c = (char)b;
if (octalModeActive && c >= '0' && c <= '7')
if (octalModeActive)
{
if (octalsRead == 3)
var nextCharacterOctal = c >= '0' && c <= '7';
if (nextCharacterOctal)
{
// left shift the octals.
LeftShiftOctal(c, octalsRead, octal);
octalsRead++;
}
if (octalsRead == 3 || !nextCharacterOctal)
{
var characterCode = FromOctal(octal);
// For now :(
// TODO: I have a sneaking suspicion this is wrong...
// TODO: I have a sneaking suspicion this is wrong, not sure what behaviour is for large octal numbers
builder.Append((char)characterCode);
octal[0] = 0;
octal[1] = 0;
octal[2] = 0;
octalsRead = 0;
octalModeActive = false;
}
else
{
// left shift the octals.
LeftShiftOctal(b, octalsRead, octal);
octal[octalsRead] = b;
octalsRead++;
if (nextCharacterOctal)
{
continue;
}
}
switch (c)
{
case ')':
octalModeActive = false;
isLineBreaking = false;
if (!isEscapeActive)
{
@@ -85,7 +91,6 @@
break;
case '(':
octalModeActive = false;
isLineBreaking = false;
@@ -99,7 +104,6 @@
break;
// Escape
case '\\':
octalModeActive = false;
isLineBreaking = false;
// Escaped backslash
if (isEscapeActive)
@@ -112,7 +116,6 @@
}
break;
default:
octalModeActive = false;
if (isLineBreaking)
{
if (ReadHelper.IsEndOfLine(c))
@@ -142,14 +145,16 @@
return true;
}
/// <summary>
/// Moves the digits already stored one slot towards the end of <paramref name="octals"/> and
/// stores the numeric value of <paramref name="nextOctalChar"/> at index 0, so index 0 always
/// holds the most recently read digit.
/// </summary>
/// <param name="nextOctalChar">The digit character which was just read.</param>
/// <param name="octalsRead">
/// The number of digits already stored. It is used as the highest index written to, so it must be
/// less than the length of <paramref name="octals"/>.
/// </param>
/// <param name="octals">The digit buffer, modified in place.</param>
private static void LeftShiftOctal(char nextOctalChar, int octalsRead, short[] octals)
{
    for (int i = octalsRead; i > 0; i--)
    {
        octals[i] = octals[i - 1];
    }

    // Store the digit's numeric value rather than its character code.
    octals[0] = OctalCharacterToShort(nextOctalChar);
}
//private static int CheckForEndOfString(IRandomAccessRead reader, int bracesParameter)
@@ -180,7 +185,7 @@
//}
//}
private static void ProcessEscapedCharacter(char c, StringBuilder builder, byte[] octal, ref bool isOctalActive,
private static void ProcessEscapedCharacter(char c, StringBuilder builder, short[] octal, ref bool isOctalActive,
ref int octalsRead, ref bool isLineBreaking)
{
switch (c)
@@ -208,7 +213,7 @@
case '5':
case '6':
case '7':
octal[0] = (byte)c;
octal[0] = OctalCharacterToShort(c);
isOctalActive = true;
octalsRead = 1;
break;
@@ -231,7 +236,36 @@
}
}
private static int FromOctal(byte[] octal)
/// <summary>
/// Converts a digit character to its numeric value.
/// </summary>
/// <remarks>
/// Despite the name, '8' and '9' are mapped as well (to 8 and 9). Any character outside
/// '0'..'9' yields 0.
/// </remarks>
/// <param name="c">The character to convert.</param>
/// <returns>The value 0-9 for a digit character, otherwise 0.</returns>
private static short OctalCharacterToShort(char c)
{
    var isDigit = c >= '0' && c <= '9';

    if (!isDigit)
    {
        return 0;
    }

    return (short)(c - '0');
}
private static int FromOctal(short[] octal)
{
int Power(int x, int pow)
{
@@ -248,9 +282,9 @@
}
int sum = 0;
for (int i = 0; i < octal.Length; i++)
for (int i = octal.Length - 1; i >= 0; i--)
{
var power = 2 - i;
var power = i;
sum += octal[i] * Power(8, power);
}