Fix octal escapes in string tokenizing; add tests for string and hex string tokenizing

2025-09-18 18:27:55 +08:00 · 2017-11-09 22:52:48 +00:00
parent afe07849d4
commit 83cc1a6bf1
5 changed files with 196 additions and 75 deletions
--- a/src/UglyToad.Pdf.Tests/StringBytesTestConverter.cs
+++ b/src/UglyToad.Pdf.Tests/StringBytesTestConverter.cs
@@ -0,0 +1,29 @@
+namespace UglyToad.Pdf.Tests
+{
+    using System.Linq;
+    using IO;
+
+    public static class StringBytesTestConverter
+    {
+        public static Result Convert(string s)
+        {
+            var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
+
+            input.MoveNext();
+            var initialByte = input.CurrentByte;
+
+            return new Result
+            {
+                First = initialByte,
+                Bytes = input
+            };
+        }
+
+        public class Result
+        {
+            public byte First { get; set; }
+
+            public IInputBytes Bytes { get; set; }
+        }
+    }
+}
--- a/src/UglyToad.Pdf.Tests/Tokenization/HexStringTokenizerTests.cs
+++ b/src/UglyToad.Pdf.Tests/Tokenization/HexStringTokenizerTests.cs
@@ -0,0 +1,25 @@
+namespace UglyToad.Pdf.Tests.Tokenization
+{
+    using Pdf.Tokenization;
+    using Xunit;
+
+    public class HexStringTokenizerTests
+    {
+        private readonly HexStringTokenizer tokenizer = new HexStringTokenizer();
+
+        [Theory]
+        [InlineData(">not hex")]
+        [InlineData("\\<not hex")]
+        [InlineData("not hex")]
+        [InlineData("AE1094 still not hex")]
+        public void CannotTokenizeInvalidBytes(string s)
+        {
+            var input = StringBytesTestConverter.Convert(s);
+
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
+
+            Assert.False(result);
+            Assert.Null(token);
+        }
+    }
+}
--- a/src/UglyToad.Pdf.Tests/Tokenization/StringTokenizerTests.cs
+++ b/src/UglyToad.Pdf.Tests/Tokenization/StringTokenizerTests.cs
@@ -1,6 +1,5 @@
 namespace UglyToad.Pdf.Tests.Tokenization
 {
-    using System.Linq;
    using IO;
    using Pdf.Tokenization;
    using Pdf.Tokenization.Tokens;
@@ -45,19 +44,13 @@
        {
            const string s = "(this string \\)contains escaped \\( parentheses)";

-            var input = new ByteArrayInputBytes(s.Select(x => (byte) x).ToArray());
-
-            input.MoveNext();
-            var initialByte = input.CurrentByte;
+            var input = StringBytesTestConverter.Convert(s);
            
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.True(result);
-            Assert.NotNull(token);

-            var stringToken = Assert.IsType<StringToken>(token);
-
-            Assert.Equal(@"this string )contains escaped ( parentheses", stringToken.Data);
+            Assert.Equal(@"this string )contains escaped ( parentheses", AssertStringToken(token).Data);
        }

        [Theory]
@@ -68,18 +61,13 @@
        [InlineData("()", "")]
        public void CanReadValidStrings(string s, string expected)
        {
-            var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
+            var input = StringBytesTestConverter.Convert(s);

-            input.MoveNext();
-            var initialByte = input.CurrentByte;
-
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.True(result);
-            Assert.NotNull(token);
-            var stringToken = Assert.IsType<StringToken>(token);

-            Assert.Equal(expected, stringToken.Data);
+            Assert.Equal(expected, AssertStringToken(token).Data);
        }
        
        [Fact]
@@ -87,19 +75,13 @@
        {
            const string s = "(this string (contains nested (two levels)) parentheses)";

-            var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
+            var input = StringBytesTestConverter.Convert(s);

-            input.MoveNext();
-            var initialByte = input.CurrentByte;
-
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.True(result);
-            Assert.NotNull(token);

-            var stringToken = Assert.IsType<StringToken>(token);
-
-            Assert.Equal("this string (contains nested (two levels)) parentheses", stringToken.Data);
+            Assert.Equal("this string (contains nested (two levels)) parentheses", AssertStringToken(token).Data);
        }
        
        [Fact]
@@ -107,19 +89,13 @@
        {
            const string s = "(this string <contains>)";

-            var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
+            var input = StringBytesTestConverter.Convert(s);

-            input.MoveNext();
-            var initialByte = input.CurrentByte;
-
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.True(result);
-            Assert.NotNull(token);

-            var stringToken = Assert.IsType<StringToken>(token);
-
-            Assert.Equal("this string <contains>", stringToken.Data);
+            Assert.Equal("this string <contains>", AssertStringToken(token).Data);
        }

        [Fact]
@@ -130,20 +106,14 @@ two strings \
 are the same.)";

            const string expected = "These two strings are the same.";
+            
+            var input = StringBytesTestConverter.Convert(s);

-            var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
-
-            input.MoveNext();
-            var initialByte = input.CurrentByte;
-
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.True(result);
-            Assert.NotNull(token);

-            var stringToken = Assert.IsType<StringToken>(token);
-
-            Assert.Equal(expected, stringToken.Data);
+            Assert.Equal(expected, AssertStringToken(token).Data);
        }

        [Fact]
@@ -153,19 +123,76 @@ are the same.)";

            const string expected = "So does this one.\n";

-            var input = new ByteArrayInputBytes(s.Select(x => (byte)x).ToArray());
+            var input = StringBytesTestConverter.Convert(s);

-            input.MoveNext();
-            var initialByte = input.CurrentByte;
-
-            var result = tokenizer.TryTokenize(initialByte, input, out var token);
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);

            Assert.True(result);
+
+            Assert.Equal(expected, AssertStringToken(token).Data);
+        }
+
+        [Fact]
+        public void ConvertsFullOctal()
+        {
+            const string s = @"(This string contains \245two octal characters\307.)";
+
+            var input = StringBytesTestConverter.Convert(s);
+
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
+
+            Assert.True(result);
+
+            Assert.Equal("This string contains ¥two octal charactersÇ.", AssertStringToken(token).Data);
+        }
+
+        [Fact]
+        public void ConvertsFullOctalFollowedByNormalNumber()
+        {
+            const string s = @"(This string contains \2451 octal character.)";
+
+            var input = StringBytesTestConverter.Convert(s);
+
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
+
+            Assert.True(result);
+
+            Assert.Equal("This string contains ¥1 octal character.", AssertStringToken(token).Data);
+        }
+
+        [Fact]
+        public void ConvertsPartialOctal()
+        {
+            const string s = @"(This string has a plus: \53 as octal)";
+
+            var input = StringBytesTestConverter.Convert(s);
+
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
+
+            Assert.True(result);
+
+            Assert.Equal("This string has a plus: + as octal", AssertStringToken(token).Data);
+        }
+
+        [Fact]
+        public void ConvertsTwoPartialOctalsInARow()
+        {
+            const string s = @"(This string has two \53\326ctals)";
+
+            var input = StringBytesTestConverter.Convert(s);
+
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
+
+            Assert.True(result);
+            
+            Assert.Equal("This string has two +Öctals", AssertStringToken(token).Data);
+        }
+
+        private static StringToken AssertStringToken(IToken token)
+        {
            Assert.NotNull(token);
-
            var stringToken = Assert.IsType<StringToken>(token);
-
-            Assert.Equal(expected, stringToken.Data);
+            return stringToken;
        }
    }
 }
--- a/src/UglyToad.Pdf/Tokenization/HexStringTokenizer.cs
+++ b/src/UglyToad.Pdf/Tokenization/HexStringTokenizer.cs
@@ -1,6 +1,7 @@
 namespace UglyToad.Pdf.Tokenization
 {
    using IO;
+    using Parser.Parts;
    using Tokens;

    public class HexStringTokenizer : ITokenizer
@@ -18,6 +19,11 @@
            {
                var current = inputBytes.CurrentByte;

+                if (ReadHelper.IsWhitespace(current))
+                {
+                    continue;
+                }
+
                if (!IsValidHexCharacter(current))
                {
                    return false;
--- a/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs
+++ b/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs
@@ -28,7 +28,7 @@

            bool octalModeActive = false;

-            byte[] octal = { 0, 0, 0 };
+            short[] octal = { 0, 0, 0 };
            int octalsRead = 0;

            while (inputBytes.MoveNext())
@@ -36,35 +36,41 @@
                var b = inputBytes.CurrentByte;
                var c = (char)b;

-                if (octalModeActive && c >= '0' && c <= '7')
+                if (octalModeActive)
                {
-                    if (octalsRead == 3)
+                    var nextCharacterOctal = c >= '0' && c <= '7';
+
+                    if (nextCharacterOctal)
+                    {
+                        // Move the digits read so far up one place and store the new digit in the lowest position.
+                        LeftShiftOctal(c, octalsRead, octal);
+                        octalsRead++;
+                    }
+
+                    if (octalsRead == 3 || !nextCharacterOctal)
                    {
                        var characterCode = FromOctal(octal);

                        // For now :(
-                        // TODO: I have a sneaking suspicion this is wrong...
+                        // TODO: probably wrong for octal values above \377 (255): the PDF specification says high-order overflow is ignored, but the full value is appended as a char here.
                        builder.Append((char)characterCode);

                        octal[0] = 0;
                        octal[1] = 0;
                        octal[2] = 0;
                        octalsRead = 0;
+                        octalModeActive = false;
                    }
-                    else
-                    {
-                        // left shift the octals.
-                        LeftShiftOctal(b, octalsRead, octal);

-                        octal[octalsRead] = b;
-                        octalsRead++;
+                    if (nextCharacterOctal)
+                    {
+                        continue;
                    }
                }

                switch (c)
                {
                    case ')':
-                        octalModeActive = false;
                        isLineBreaking = false;
                        if (!isEscapeActive)
                        {
@@ -85,7 +91,6 @@

                        break;
                    case '(':
-                        octalModeActive = false;
                        isLineBreaking = false;


@@ -99,7 +104,6 @@
                        break;
                    // Escape
                    case '\\':
-                        octalModeActive = false;
                        isLineBreaking = false;
                        // Escaped backslash
                        if (isEscapeActive)
@@ -112,7 +116,6 @@
                        }
                        break;
                    default:
-                        octalModeActive = false;
                        if (isLineBreaking)
                        {
                            if (ReadHelper.IsEndOfLine(c))
@@ -142,14 +145,16 @@
            return true;
        }

-        private static void LeftShiftOctal(byte nextOctalByte, int octalsRead, byte[] octals)
+        private static void LeftShiftOctal(char nextOctalChar, int octalsRead, short[] octals)
        {
            for (int i = octalsRead; i > 0; i--)
            {
                octals[i] = octals[i - 1];
            }

-            octals[0] = nextOctalByte;
+            var value = OctalCharacterToShort(nextOctalChar);
+
+            octals[0] = value;
        }

        //private static int CheckForEndOfString(IRandomAccessRead reader, int bracesParameter)
@@ -180,7 +185,7 @@
        //}
        //}

-        private static void ProcessEscapedCharacter(char c, StringBuilder builder, byte[] octal, ref bool isOctalActive,
+        private static void ProcessEscapedCharacter(char c, StringBuilder builder, short[] octal, ref bool isOctalActive,
            ref int octalsRead, ref bool isLineBreaking)
        {
            switch (c)
@@ -208,7 +213,7 @@
                case '5':
                case '6':
                case '7':
-                    octal[0] = (byte)c;
+                    octal[0] = OctalCharacterToShort(c);
                    isOctalActive = true;
                    octalsRead = 1;
                    break;
@@ -231,7 +236,36 @@
            }
        }

-        private static int FromOctal(byte[] octal)
+        private static short OctalCharacterToShort(char c)
+        {
+            switch (c)
+            {
+                case '0':
+                    return 0;
+                case '1':
+                    return 1;
+                case '2':
+                    return 2;
+                case '3':
+                    return 3;
+                case '4':
+                    return 4;
+                case '5':
+                    return 5;
+                case '6':
+                    return 6;
+                case '7':
+                    return 7;
+                case '8':
+                    return 8;
+                case '9':
+                    return 9;
+                default:
+                    return 0;
+            }
+        }
+
+        private static int FromOctal(short[] octal)
        {
            int Power(int x, int pow)
            {
@@ -248,9 +282,9 @@
            }

            int sum = 0;
-            for (int i = 0; i < octal.Length; i++)
+            for (int i = octal.Length - 1; i >= 0; i--)
            {
-                var power = 2 - i;
+                var power = i;
                sum += octal[i] * Power(8, power);
            }