From 83cc1a6bf128307c877171290deee69d6a6b600b Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Thu, 9 Nov 2017 22:52:48 +0000 Subject: [PATCH] fixes for octal in strings and tests for string and hex tokenizing --- .../StringBytesTestConverter.cs | 29 ++++ .../Tokenization/HexStringTokenizerTests.cs | 25 ++++ .../Tokenization/StringTokenizerTests.cs | 135 +++++++++++------- .../Tokenization/HexStringTokenizer.cs | 6 + .../Tokenization/StringTokenizer.cs | 76 +++++++--- 5 files changed, 196 insertions(+), 75 deletions(-) create mode 100644 src/UglyToad.Pdf.Tests/StringBytesTestConverter.cs create mode 100644 src/UglyToad.Pdf.Tests/Tokenization/HexStringTokenizerTests.cs diff --git a/src/UglyToad.Pdf.Tests/StringBytesTestConverter.cs b/src/UglyToad.Pdf.Tests/StringBytesTestConverter.cs new file mode 100644 index 00000000..333d90b5 --- /dev/null +++ b/src/UglyToad.Pdf.Tests/StringBytesTestConverter.cs @@ -0,0 +1,29 @@ +namespace UglyToad.Pdf.Tests +{ + using System.Linq; + using IO; + + public static class StringBytesTestConverter + { + public static Result Convert(string s) + { + var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray()); + + input.MoveNext(); + var initialByte = input.CurrentByte; + + return new Result + { + First = initialByte, + Bytes = input + }; + } + + public class Result + { + public byte First { get; set; } + + public IInputBytes Bytes { get; set; } + } + } +} diff --git a/src/UglyToad.Pdf.Tests/Tokenization/HexStringTokenizerTests.cs b/src/UglyToad.Pdf.Tests/Tokenization/HexStringTokenizerTests.cs new file mode 100644 index 00000000..f980ef22 --- /dev/null +++ b/src/UglyToad.Pdf.Tests/Tokenization/HexStringTokenizerTests.cs @@ -0,0 +1,25 @@ +namespace UglyToad.Pdf.Tests.Tokenization +{ + using Pdf.Tokenization; + using Xunit; + + public class HexStringTokenizerTests + { + private readonly HexStringTokenizer tokenizer = new HexStringTokenizer(); + + [Theory] + [InlineData(">not hex")] + [InlineData("\\ (byte) x).ToArray()); - - input.MoveNext(); - var initialByte = input.CurrentByte; + var input = StringBytesTestConverter.Convert(s); - var result = tokenizer.TryTokenize(initialByte, input, out var token); + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); Assert.True(result); - Assert.NotNull(token); - var stringToken = Assert.IsType(token); - - Assert.Equal(@"this string )contains escaped ( parentheses", stringToken.Data); + Assert.Equal(@"this string )contains escaped ( parentheses", AssertStringToken(token).Data); } [Theory] @@ -68,18 +61,13 @@ [InlineData("()", "")] public void CanReadValidStrings(string s, string expected) { - var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray()); + var input = StringBytesTestConverter.Convert(s); - input.MoveNext(); - var initialByte = input.CurrentByte; - - var result = tokenizer.TryTokenize(initialByte, input, out var token); + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); Assert.True(result); - Assert.NotNull(token); - var stringToken = Assert.IsType(token); - Assert.Equal(expected, stringToken.Data); + Assert.Equal(expected, AssertStringToken(token).Data); } [Fact] @@ -87,19 +75,13 @@ { const string s = "(this string (contains nested (two levels)) parentheses)"; - var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray()); + var input = StringBytesTestConverter.Convert(s); - input.MoveNext(); - var initialByte = input.CurrentByte; - - var result = tokenizer.TryTokenize(initialByte, input, out var token); + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); Assert.True(result); - Assert.NotNull(token); - var stringToken = Assert.IsType(token); - - Assert.Equal("this string (contains nested (two levels)) parentheses", stringToken.Data); + Assert.Equal("this string (contains nested (two levels)) parentheses", AssertStringToken(token).Data); } [Fact] @@ -107,19 +89,13 @@ { const string s = "(this string )"; - var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray()); + var input = StringBytesTestConverter.Convert(s); - input.MoveNext(); - var initialByte = input.CurrentByte; - - var result = tokenizer.TryTokenize(initialByte, input, out var token); + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); Assert.True(result); - Assert.NotNull(token); - var stringToken = Assert.IsType(token); - - Assert.Equal("this string ", stringToken.Data); + Assert.Equal("this string ", AssertStringToken(token).Data); } [Fact] @@ -130,20 +106,14 @@ two strings \ are the same.)"; const string expected = "These two strings are the same."; + + var input = StringBytesTestConverter.Convert(s); - var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray()); - - input.MoveNext(); - var initialByte = input.CurrentByte; - - var result = tokenizer.TryTokenize(initialByte, input, out var token); + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); Assert.True(result); - Assert.NotNull(token); - var stringToken = Assert.IsType(token); - - Assert.Equal(expected, stringToken.Data); + Assert.Equal(expected, AssertStringToken(token).Data); } [Fact] @@ -153,19 +123,76 @@ are the same.)"; const string expected = "So does this one.\n"; - var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray()); + var input = StringBytesTestConverter.Convert(s); - input.MoveNext(); - var initialByte = input.CurrentByte; - - var result = tokenizer.TryTokenize(initialByte, input, out var token); + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); Assert.True(result); + + Assert.Equal(expected, AssertStringToken(token).Data); + } + + [Fact] + public void ConvertsFullOctal() + { + const string s = @"(This string contains \245two octal characters\307.)"; + + var input = StringBytesTestConverter.Convert(s); + + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); + + Assert.True(result); + + Assert.Equal("This string contains ¥two octal charactersÇ.", AssertStringToken(token).Data); + } + + [Fact] + public void ConvertsFullOctalFollowedByNormalNumber() + { + const string s = @"(This string contains \2451 octal character.)"; + + var input = StringBytesTestConverter.Convert(s); + + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); + + Assert.True(result); + + Assert.Equal("This string contains ¥1 octal character.", AssertStringToken(token).Data); + } + + [Fact] + public void ConvertsPartialOctal() + { + const string s = @"(This string has a plus: \53 as octal)"; + + var input = StringBytesTestConverter.Convert(s); + + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); + + Assert.True(result); + + Assert.Equal("This string has a plus: + as octal", AssertStringToken(token).Data); + } + + [Fact] + public void ConvertsTwoPartialOctalsInARow() + { + const string s = @"(This string has two \53\326ctals)"; + + var input = StringBytesTestConverter.Convert(s); + + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); + + Assert.True(result); + + Assert.Equal("This string has two +Öctals", AssertStringToken(token).Data); + } + + private static StringToken AssertStringToken(IToken token) + { Assert.NotNull(token); - var stringToken = Assert.IsType(token); - - Assert.Equal(expected, stringToken.Data); + return stringToken; } } } diff --git a/src/UglyToad.Pdf/Tokenization/HexStringTokenizer.cs b/src/UglyToad.Pdf/Tokenization/HexStringTokenizer.cs index 7e7fe645..1aedbf8e 100644 --- a/src/UglyToad.Pdf/Tokenization/HexStringTokenizer.cs +++ b/src/UglyToad.Pdf/Tokenization/HexStringTokenizer.cs @@ -1,6 +1,7 @@ namespace UglyToad.Pdf.Tokenization { using IO; + using Parser.Parts; using Tokens; public class HexStringTokenizer : ITokenizer @@ -18,6 +19,11 @@ { var current = inputBytes.CurrentByte; + if (ReadHelper.IsWhitespace(current)) + { + continue; + } + if (!IsValidHexCharacter(current)) { return false; diff --git a/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs b/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs index 386c867c..7496fa52 100644 --- a/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs +++ b/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs @@ -28,7 +28,7 @@ bool octalModeActive = false; - byte[] octal = { 0, 0, 0 }; + short[] octal = { 0, 0, 0 }; int octalsRead = 0; while (inputBytes.MoveNext()) @@ -36,35 +36,41 @@ var b = inputBytes.CurrentByte; var c = (char)b; - if (octalModeActive && c >= '0' && c <= '7') + if (octalModeActive) { - if (octalsRead == 3) + var nextCharacterOctal = c >= '0' && c <= '7'; + + if (nextCharacterOctal) + { + // left shift the octals. + LeftShiftOctal(c, octalsRead, octal); + octalsRead++; + } + + if (octalsRead == 3 || !nextCharacterOctal) { var characterCode = FromOctal(octal); // For now :( - // TODO: I have a sneaking suspicion this is wrong... + // TODO: I have a sneaking suspicion this is wrong, not sure what behaviour is for large octal numbers builder.Append((char)characterCode); octal[0] = 0; octal[1] = 0; octal[2] = 0; octalsRead = 0; + octalModeActive = false; } - else - { - // left shift the octals. - LeftShiftOctal(b, octalsRead, octal); - octal[octalsRead] = b; - octalsRead++; + if (nextCharacterOctal) + { + continue; } } switch (c) { case ')': - octalModeActive = false; isLineBreaking = false; if (!isEscapeActive) { @@ -85,7 +91,6 @@ break; case '(': - octalModeActive = false; isLineBreaking = false; @@ -99,7 +104,6 @@ break; // Escape case '\\': - octalModeActive = false; isLineBreaking = false; // Escaped backslash if (isEscapeActive) @@ -112,7 +116,6 @@ } break; default: - octalModeActive = false; if (isLineBreaking) { if (ReadHelper.IsEndOfLine(c)) @@ -142,14 +145,16 @@ return true; } - private static void LeftShiftOctal(byte nextOctalByte, int octalsRead, byte[] octals) + private static void LeftShiftOctal(char nextOctalChar, int octalsRead, short[] octals) { for (int i = octalsRead; i > 0; i--) { octals[i] = octals[i - 1]; } - octals[0] = nextOctalByte; + var value = OctalCharacterToShort(nextOctalChar); + + octals[0] = value; } //private static int CheckForEndOfString(IRandomAccessRead reader, int bracesParameter) @@ -180,7 +185,7 @@ //} //} - private static void ProcessEscapedCharacter(char c, StringBuilder builder, byte[] octal, ref bool isOctalActive, + private static void ProcessEscapedCharacter(char c, StringBuilder builder, short[] octal, ref bool isOctalActive, ref int octalsRead, ref bool isLineBreaking) { switch (c) @@ -208,7 +213,7 @@ case '5': case '6': case '7': - octal[0] = (byte)c; + octal[0] = OctalCharacterToShort(c); isOctalActive = true; octalsRead = 1; break; @@ -231,7 +236,36 @@ } } - private static int FromOctal(byte[] octal) + private static short OctalCharacterToShort(char c) + { + switch (c) + { + case '0': + return 0; + case '1': + return 1; + case '2': + return 2; + case '3': + return 3; + case '4': + return 4; + case '5': + return 5; + case '6': + return 6; + case '7': + return 7; + case '8': + return 8; + case '9': + return 9; + default: + return 0; + } + } + + private static int FromOctal(short[] octal) { int Power(int x, int pow) { @@ -248,9 +282,9 @@ } int sum = 0; - for (int i = 0; i < octal.Length; i++) + for (int i = octal.Length - 1; i >= 0; i--) { - var power = 2 - i; + var power = i; sum += octal[i] * Power(8, power); }