From 83cc1a6bf128307c877171290deee69d6a6b600b Mon Sep 17 00:00:00 2001
From: Eliot Jones <elioty@hotmail.co.uk>
Date: Thu, 9 Nov 2017 22:52:48 +0000
Subject: [PATCH] fixes for octal in strings and tests for string and hex
 tokenizing

---
 .../StringBytesTestConverter.cs               |  29 ++++
 .../Tokenization/HexStringTokenizerTests.cs   |  25 ++++
 .../Tokenization/StringTokenizerTests.cs      | 135 +++++++++++-------
 .../Tokenization/HexStringTokenizer.cs        |   6 +
 .../Tokenization/StringTokenizer.cs           |  76 +++++++---
 5 files changed, 196 insertions(+), 75 deletions(-)
 create mode 100644 src/UglyToad.Pdf.Tests/StringBytesTestConverter.cs
 create mode 100644 src/UglyToad.Pdf.Tests/Tokenization/HexStringTokenizerTests.cs
diff --git a/src/UglyToad.Pdf.Tests/StringBytesTestConverter.cs b/src/UglyToad.Pdf.Tests/StringBytesTestConverter.cs
new file mode 100644
index 00000000..333d90b5
--- /dev/null
+++ b/src/UglyToad.Pdf.Tests/StringBytesTestConverter.cs
@@ -0,0 +1,29 @@
+﻿namespace UglyToad.Pdf.Tests
+{
+    using System.Linq;
+    using IO;
+
+    /// <summary>Test helper which converts a string into an input for the tokenizers.</summary>
+    public static class StringBytesTestConverter
+    {
+        /// <summary>
+        /// Casts each character of <paramref name="s"/> to a byte, wraps the bytes and calls MoveNext once.
+        /// </summary>
+        public static Result Convert(string s)
+        {
+            var bytes = s.Select(character => (byte)character).ToArray();
+            var inputBytes = new ByteArrayInputBytes(bytes);
+            inputBytes.MoveNext();
+
+            return new Result { First = inputBytes.CurrentByte, Bytes = inputBytes };
+        }
+
+        /// <summary>The current byte after the first MoveNext call, plus the input it was read from.</summary>
+        public class Result
+        {
+            public byte First { get; set; }
+
+            public IInputBytes Bytes { get; set; }
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf.Tests/Tokenization/HexStringTokenizerTests.cs b/src/UglyToad.Pdf.Tests/Tokenization/HexStringTokenizerTests.cs
new file mode 100644
index 00000000..f980ef22
--- /dev/null
+++ b/src/UglyToad.Pdf.Tests/Tokenization/HexStringTokenizerTests.cs
@@ -0,0 +1,25 @@
+﻿namespace UglyToad.Pdf.Tests.Tokenization
+{
+    using Pdf.Tokenization;
+    using Xunit;
+
+    /// <summary>Checks that <see cref="HexStringTokenizer"/> rejects input which is not a hex string.</summary>
+    public class HexStringTokenizerTests
+    {
+        private readonly HexStringTokenizer tokenizer = new HexStringTokenizer();
+
+        [Theory]
+        [InlineData(">not hex")]
+        [InlineData("\\<not hex")]
+        [InlineData("not hex")]
+        [InlineData("AE1094 still not hex")]
+        public void CannotTokenizeInvalidBytes(string s)
+        {
+            var converted = StringBytesTestConverter.Convert(s);
+            var tokenized = tokenizer.TryTokenize(converted.First, converted.Bytes, out var token);
+
+            Assert.False(tokenized);
+            Assert.Null(token);
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf.Tests/Tokenization/StringTokenizerTests.cs b/src/UglyToad.Pdf.Tests/Tokenization/StringTokenizerTests.cs
index c8b9c00b..2c532fad 100644
--- a/src/UglyToad.Pdf.Tests/Tokenization/StringTokenizerTests.cs
+++ b/src/UglyToad.Pdf.Tests/Tokenization/StringTokenizerTests.cs
@@ -1,6 +1,5 @@
 ﻿namespace UglyToad.Pdf.Tests.Tokenization
 {
-    using System.Linq;
     using IO;
     using Pdf.Tokenization;
     using Pdf.Tokenization.Tokens;
@@ -45,19 +44,13 @@
         {
             const string s = "(this string \\)contains escaped \\( parentheses)";
 
-            var input = new ByteArrayInputBytes(s.Select(x => (byte) x).ToArray());
-
-            input.MoveNext();
-            var initialByte = input.CurrentByte;
+            var input = StringBytesTestConverter.Convert(s);
             
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
 
             Assert.True(result);
-            Assert.NotNull(token);
 
-            var stringToken = Assert.IsType<StringToken>(token);
-
-            Assert.Equal(@"this string )contains escaped ( parentheses", stringToken.Data);
+            Assert.Equal(@"this string )contains escaped ( parentheses", AssertStringToken(token).Data);
         }
 
         [Theory]
@@ -68,18 +61,13 @@
         [InlineData("()", "")]
         public void CanReadValidStrings(string s, string expected)
         {
-            var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
+            var input = StringBytesTestConverter.Convert(s);
 
-            input.MoveNext();
-            var initialByte = input.CurrentByte;
-
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
 
             Assert.True(result);
-            Assert.NotNull(token);
-            var stringToken = Assert.IsType<StringToken>(token);
 
-            Assert.Equal(expected, stringToken.Data);
+            Assert.Equal(expected, AssertStringToken(token).Data);
         }
         
         [Fact]
@@ -87,19 +75,13 @@
         {
             const string s = "(this string (contains nested (two levels)) parentheses)";
 
-            var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
+            var input = StringBytesTestConverter.Convert(s);
 
-            input.MoveNext();
-            var initialByte = input.CurrentByte;
-
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
 
             Assert.True(result);
-            Assert.NotNull(token);
 
-            var stringToken = Assert.IsType<StringToken>(token);
-
-            Assert.Equal("this string (contains nested (two levels)) parentheses", stringToken.Data);
+            Assert.Equal("this string (contains nested (two levels)) parentheses", AssertStringToken(token).Data);
         }
         
         [Fact]
@@ -107,19 +89,13 @@
         {
             const string s = "(this string <contains>)";
 
-            var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
+            var input = StringBytesTestConverter.Convert(s);
 
-            input.MoveNext();
-            var initialByte = input.CurrentByte;
-
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
 
             Assert.True(result);
-            Assert.NotNull(token);
 
-            var stringToken = Assert.IsType<StringToken>(token);
-
-            Assert.Equal("this string <contains>", stringToken.Data);
+            Assert.Equal("this string <contains>", AssertStringToken(token).Data);
         }
 
         [Fact]
@@ -130,20 +106,14 @@ two strings \
 are the same.)";
 
             const string expected = "These two strings are the same.";
+            
+            var input = StringBytesTestConverter.Convert(s);
 
-            var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
-
-            input.MoveNext();
-            var initialByte = input.CurrentByte;
-
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
 
             Assert.True(result);
-            Assert.NotNull(token);
 
-            var stringToken = Assert.IsType<StringToken>(token);
-
-            Assert.Equal(expected, stringToken.Data);
+            Assert.Equal(expected, AssertStringToken(token).Data);
         }
 
         [Fact]
@@ -153,19 +123,76 @@ are the same.)";
 
             const string expected = "So does this one.\n";
 
-            var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
+            var input = StringBytesTestConverter.Convert(s);
 
-            input.MoveNext();
-            var initialByte = input.CurrentByte;
-
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
 
             Assert.True(result);
+
+            Assert.Equal(expected, AssertStringToken(token).Data);
+        }
+
+        [Fact]
+        public void ConvertsFullOctal()
+        {
+            // \245 and \307 are three digit octal escapes for the character codes 165 and 199.
+            const string s = @"(This string contains \245two octal characters\307.)";
+            const string expected = "This string contains ¥two octal charactersÇ.";
+
+            var converted = StringBytesTestConverter.Convert(s);
+            var tokenized = tokenizer.TryTokenize(converted.First, converted.Bytes, out var token);
+
+            Assert.True(tokenized);
+            Assert.Equal(expected, AssertStringToken(token).Data);
+        }
+
+        [Fact]
+        public void ConvertsFullOctalFollowedByNormalNumber()
+        {
+            // An escape is flushed after its third digit, so the '1' following \245 is an ordinary character.
+            const string s = @"(This string contains \2451 octal character.)";
+            const string expected = "This string contains ¥1 octal character.";
+
+            var converted = StringBytesTestConverter.Convert(s);
+            var tokenized = tokenizer.TryTokenize(converted.First, converted.Bytes, out var token);
+
+            Assert.True(tokenized);
+            Assert.Equal(expected, AssertStringToken(token).Data);
+        }
+
+        [Fact]
+        public void ConvertsPartialOctal()
+        {
+            // \53 is a two digit octal escape (character code 43) which is ended by the following space.
+            const string s = @"(This string has a plus: \53 as octal)";
+            const string expected = "This string has a plus: + as octal";
+
+            var converted = StringBytesTestConverter.Convert(s);
+            var tokenized = tokenizer.TryTokenize(converted.First, converted.Bytes, out var token);
+
+            Assert.True(tokenized);
+            Assert.Equal(expected, AssertStringToken(token).Data);
+        }
+
+        [Fact]
+        public void ConvertsTwoPartialOctalsInARow()
+        {
+            // \53 is ended early by the next backslash, which then starts the three digit escape \326 (214).
+            const string s = @"(This string has two \53\326ctals)";
+            const string expected = "This string has two +Öctals";
+
+            var converted = StringBytesTestConverter.Convert(s);
+            var tokenized = tokenizer.TryTokenize(converted.First, converted.Bytes, out var token);
+
+            Assert.True(tokenized);
+            Assert.Equal(expected, AssertStringToken(token).Data);
+        }
+
+        private static StringToken AssertStringToken(IToken token)
+        {
             Assert.NotNull(token);
-
             var stringToken = Assert.IsType<StringToken>(token);
-
-            Assert.Equal(expected, stringToken.Data);
+            return stringToken;
         }
     }
 }
diff --git a/src/UglyToad.Pdf/Tokenization/HexStringTokenizer.cs b/src/UglyToad.Pdf/Tokenization/HexStringTokenizer.cs
index 7e7fe645..1aedbf8e 100644
--- a/src/UglyToad.Pdf/Tokenization/HexStringTokenizer.cs
+++ b/src/UglyToad.Pdf/Tokenization/HexStringTokenizer.cs
@@ -1,6 +1,7 @@
 ﻿namespace UglyToad.Pdf.Tokenization
 {
     using IO;
+    using Parser.Parts;
     using Tokens;
 
     public class HexStringTokenizer : ITokenizer
@@ -18,6 +19,11 @@
             {
                 var current = inputBytes.CurrentByte;
 
+                if (ReadHelper.IsWhitespace(current))
+                {
+                    continue;
+                }
+
                 if (!IsValidHexCharacter(current))
                 {
                     return false;
diff --git a/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs b/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs
index 386c867c..7496fa52 100644
--- a/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs
+++ b/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs
@@ -28,7 +28,7 @@
 
             bool octalModeActive = false;
 
-            byte[] octal = { 0, 0, 0 };
+            short[] octal = { 0, 0, 0 };
             int octalsRead = 0;
 
             while (inputBytes.MoveNext())
@@ -36,35 +36,41 @@
                 var b = inputBytes.CurrentByte;
                 var c = (char)b;
 
-                if (octalModeActive && c >= '0' && c <= '7')
+                if (octalModeActive)
                 {
-                    if (octalsRead == 3)
+                    var nextCharacterOctal = c >= '0' && c <= '7';
+
+                    if (nextCharacterOctal)
+                    {
+                        // left shift the octals.
+                        LeftShiftOctal(c, octalsRead, octal);
+                        octalsRead++;
+                    }
+
+                    if (octalsRead == 3 || !nextCharacterOctal)
                     {
                         var characterCode = FromOctal(octal);
 
                         // For now :(
-                        // TODO: I have a sneaking suspicion this is wrong...
+                        // TODO: I have a sneaking suspicion this is wrong, not sure what behaviour is for large octal numbers
                         builder.Append((char)characterCode);
 
                         octal[0] = 0;
                         octal[1] = 0;
                         octal[2] = 0;
                         octalsRead = 0;
+                        octalModeActive = false;
                     }
-                    else
-                    {
-                        // left shift the octals.
-                        LeftShiftOctal(b, octalsRead, octal);
 
-                        octal[octalsRead] = b;
-                        octalsRead++;
+                    if (nextCharacterOctal)
+                    {
+                        continue;
                     }
                 }
 
                 switch (c)
                 {
                     case ')':
-                        octalModeActive = false;
                         isLineBreaking = false;
                         if (!isEscapeActive)
                         {
@@ -85,7 +91,6 @@
 
                         break;
                     case '(':
-                        octalModeActive = false;
                         isLineBreaking = false;
 
 
@@ -99,7 +104,6 @@
                         break;
                     // Escape
                     case '\\':
-                        octalModeActive = false;
                         isLineBreaking = false;
                         // Escaped backslash
                         if (isEscapeActive)
@@ -112,7 +116,6 @@
                         }
                         break;
                     default:
-                        octalModeActive = false;
                         if (isLineBreaking)
                         {
                             if (ReadHelper.IsEndOfLine(c))
@@ -142,14 +145,16 @@
             return true;
         }
 
-        private static void LeftShiftOctal(byte nextOctalByte, int octalsRead, byte[] octals)
+        private static void LeftShiftOctal(char nextOctalChar, int octalsRead, short[] octals)
         {
             for (int i = octalsRead; i > 0; i--)
             {
                 octals[i] = octals[i - 1];
             }
 
-            octals[0] = nextOctalByte;
+            var value = OctalCharacterToShort(nextOctalChar);
+
+            octals[0] = value;
         }
 
         //private static int CheckForEndOfString(IRandomAccessRead reader, int bracesParameter)
@@ -180,7 +185,7 @@
         //}
         //}
 
-        private static void ProcessEscapedCharacter(char c, StringBuilder builder, byte[] octal, ref bool isOctalActive,
+        private static void ProcessEscapedCharacter(char c, StringBuilder builder, short[] octal, ref bool isOctalActive,
             ref int octalsRead, ref bool isLineBreaking)
         {
             switch (c)
@@ -208,7 +213,7 @@
                 case '5':
                 case '6':
                 case '7':
-                    octal[0] = (byte)c;
+                    octal[0] = OctalCharacterToShort(c);
                     isOctalActive = true;
                     octalsRead = 1;
                     break;
@@ -231,7 +236,36 @@
             }
         }
 
-        private static int FromOctal(byte[] octal)
+        /// <summary>
+        /// Converts a single octal digit character to its numeric value.
+        /// </summary>
+        /// <param name="c">The character to convert, expected to be between '0' and '7'.</param>
+        /// <returns>The value of the digit (0 to 7), or 0 if <paramref name="c"/> is not an octal digit.</returns>
+        /// <remarks>
+        /// '8' and '9' are not octal digits so they are handled like every other
+        /// non-octal character instead of yielding the out of range values 8 and 9.
+        /// </remarks>
+        private static short OctalCharacterToShort(char c)
+        {
+            switch (c)
+            {
+                case '0':
+                case '1':
+                case '2':
+                case '3':
+                case '4':
+                case '5':
+                case '6':
+                case '7':
+                    // The digit characters are consecutive code points, so the
+                    // offset from '0' is the numeric value of the digit.
+                    return (short)(c - '0');
+                default:
+                    return 0;
+            }
+        }
+
+        private static int FromOctal(short[] octal)
         {
             int Power(int x, int pow)
             {
@@ -248,9 +282,9 @@
             }
 
             int sum = 0;
-            for (int i = 0; i < octal.Length; i++)
+            for (int i = octal.Length - 1; i >= 0; i--)
             {
-                var power = 2 - i;
+                var power = i;
                 sum += octal[i] * Power(8, power);
             }