change hex token construction to use a lookup for mapping to bytes. add parsing for codespace range and base font characters in the cmap

2025-07-17 05:11:51 +08:00 · 2017-11-14 22:58:06 +00:00 · 2017-11-14 22:58:06 +00:00 · 2e5aa37c85
commit 2e5aa37c85
parent 511385a253
6 changed files with 213 additions and 73 deletions
--- a/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs
+++ b/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs
@ -1,5 +1,10 @@
 namespace UglyToad.Pdf.Fonts.Cmap
 {
+    using System.Collections.Generic;
+    using System.Linq;
+    using System.Text;
+    using Util;
+
    /// <summary>
    /// A mutable class used when parsing and generating a <see cref="CMap"/>.
    /// </summary>
@ -44,5 +49,39 @@
        /// Defined as required.
        /// </remarks>
        public int Type { get; set; } = -1;
+
+        /// <summary>
+        /// The codespace ranges for the character map being built.
+        /// No initial value is assigned here, so this is null until a caller sets it.
+        /// </summary>
+        public IReadOnlyList<CodespaceRange> CodespaceRanges { get; set; }
+
+        /// <summary>
+        /// Maps an integer character code (built from the code's bytes, most significant byte first)
+        /// to the string value registered for it through the AddBaseFontCharacter overloads.
+        /// </summary>
+        public Dictionary<int, string> BaseFontCharacterMap { get; } = new Dictionary<int, string>();
+
+        /// <summary>
+        /// Registers a base font character whose value is supplied as raw bytes. The bytes are
+        /// converted to a string first and then stored against the code given by <paramref name="bytes"/>.
+        /// </summary>
+        /// <param name="bytes">The bytes of the input character code.</param>
+        /// <param name="value">The bytes of the value to convert to a string.</param>
+        public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, IReadOnlyList<byte> value)
+        {
+            var rawValue = value.ToArray();
+            var decoded = CreateStringFromBytes(rawValue);
+
+            AddBaseFontCharacter(bytes, decoded);
+        }
+
+        /// <summary>
+        /// Stores <paramref name="value"/> against the integer code built from every byte in
+        /// <paramref name="bytes"/>, replacing any value already stored for that code.
+        /// </summary>
+        /// <param name="bytes">The bytes of the input character code.</param>
+        /// <param name="value">The string to store for the code.</param>
+        public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, string value)
+        {
+            BaseFontCharacterMap[GetCodeFromArray(bytes, bytes.Count)] = value;
+        }
+
+        /// <summary>
+        /// Packs the first <paramref name="length"/> bytes of <paramref name="data"/> into an integer,
+        /// with the first byte ending up in the most significant position.
+        /// </summary>
+        private int GetCodeFromArray(IReadOnlyList<byte> data, int length)
+        {
+            var result = 0;
+            var index = 0;
+
+            while (index < length)
+            {
+                // A byte is always in the range 0-255, so it can be merged in directly as the low 8 bits.
+                result = (result << 8) | data[index];
+                index++;
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Converts value bytes to a string: a single byte goes through
+        /// <c>OtherEncodings.BytesAsLatin1String</c>, any other length is decoded as big-endian UTF-16.
+        /// </summary>
+        private string CreateStringFromBytes(byte[] bytes)
+        {
+            if (bytes.Length != 1)
+            {
+                return Encoding.BigEndianUnicode.GetString(bytes);
+            }
+
+            return OtherEncodings.BytesAsLatin1String(bytes);
+        }
    }
 }
--- a/src/UglyToad.Pdf/Fonts/Cmap/CmapUtils.cs
+++ b/src/UglyToad.Pdf/Fonts/Cmap/CmapUtils.cs
@ -4,7 +4,7 @@

    internal static class CmapUtils
    {
-        public static int ToInt(this byte[] data, int length)
+        public static int ToInt(this IReadOnlyList<byte> data, int length)
        {
            int code = 0;
            for (int i = 0; i < length; ++i)
--- a/src/UglyToad.Pdf/Fonts/Cmap/CodespaceRange.cs
+++ b/src/UglyToad.Pdf/Fonts/Cmap/CodespaceRange.cs
@ -1,66 +1,32 @@
-using System;
-using System.Collections.Generic;
-using System.Text;
-
-namespace UglyToad.Pdf.Fonts.Cmap
+namespace UglyToad.Pdf.Fonts.Cmap
 {
+    using System.Collections.Generic;
+
    /// <summary>
    ///  A codespace range is specified by a pair of codes of some particular length giving the lower and upper bounds of that range.
    /// </summary>
    public class CodespaceRange
    {
-        private byte[] start;
-        private byte[] end;
-        private int startInt;
-        private int endInt;
+        /// <summary>
+        /// The bytes of the lower bound of the range, exactly as passed to the constructor.
+        /// </summary>
+        public IReadOnlyList<byte> Start { get; }

-        public int CodeLength { get; private set; }
+        /// <summary>
+        /// The bytes of the upper bound of the range, exactly as passed to the constructor.
+        /// </summary>
+        public IReadOnlyList<byte> End { get; }

-        /**
-         * Creates a new instance of CodespaceRange.
-         */
-        public CodespaceRange()
+        /// <summary>
+        /// <see cref="Start"/> converted to an integer using all of its bytes.
+        /// </summary>
+        public int StartInt { get; }
+
+        /// <summary>
+        /// <see cref="End"/> converted to an integer using all of its bytes.
+        /// </summary>
+        public int EndInt { get; }
+        
+        /// <summary>
+        /// The number of bytes in <see cref="Start"/>; the length of <see cref="End"/> is not consulted.
+        /// </summary>
+        public int CodeLength { get; }
+        
+        /// <summary>
+        /// Creates a new instance of <see cref="CodespaceRange"/>.
+        /// </summary>
+        public CodespaceRange(IReadOnlyList<byte> start, IReadOnlyList<byte> end)
        {
-        }
-        
-
-        /** Getter for property end.
-         * @return Value of property end.
-         *
-         */
-        public byte[] getEnd()
-        {
-            return end;
-        }
-
-        /** Setter for property end.
-         * @param endBytes New value of property end.
-         *
-         */
-        void setEnd(byte[] endBytes)
-        {
-            end = endBytes;
-            endInt = endBytes.ToInt(endBytes.Length);
-        }
-
-        /** Getter for property start.
-         * @return Value of property start.
-         *
-         */
-        public byte[] getStart()
-        {
-            return start;
-        }
-
-        /** Setter for property start.
-         * @param startBytes New value of property start.
-         *
-         */
-        void setStart(byte[] startBytes)
-        {
-            start = startBytes;
-            CodeLength = start.Length;
-            startInt = startBytes.ToInt(startBytes.Length);
+            Start = start;
+            End = end;
+            StartInt = start.ToInt(start.Count);
+            EndInt = end.ToInt(end.Count);
+            CodeLength = start.Count;
        }
        
        /**
@ -80,7 +46,7 @@ namespace UglyToad.Pdf.Fonts.Cmap
            if (codeLen == CodeLength)
            {
                int value = code.ToInt(codeLen);
-                if (value >= startInt && value <= endInt)
+                if (value >= StartInt && value <= EndInt)
                {
                    return true;
                }
--- a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs
@ -1,6 +1,7 @@
 namespace UglyToad.Pdf.Fonts.Parser
 {
    using System;
+    using System.Collections.Generic;
    using System.Globalization;
    using Cmap;
    using Cos;
@ -27,7 +28,38 @@
                {
                    switch (operatorToken.Data)
                    {
-                        default:
+                        case "usecmap":
+                            throw new NotImplementedException("External CMap files not yet supported, please submit a pull request!");
+                        case "begincodespacerange":
+                            {
+                                if (previousToken is NumericToken numeric)
+                                {
+                                    ParseCodespaceRange(numeric, scanner, builder);
+                                }
+                                else
+                                {
+                                    throw new InvalidOperationException("Unexpected token preceding start of codespace range: " + previousToken);
+                                }
+
+                            }
+                            break;
+                        case "beginbfchar":
+                            {
+                                if (previousToken is NumericToken numeric)
+                                {
+                                    ParseBaseFontCharacters(numeric, scanner, builder);
+                                }
+                                else
+                                {
+                                    throw new InvalidOperationException("Unexpected token preceding start of base font characters: " + previousToken);
+                                }
+                            }
+                            break;
+                        // These sections are recognised here but their contents are currently ignored.
+                        case "beginbfrange":
+                            break;
+                        case "begincidchar":
+                            break;
+                        // The CMap operator is spelled "begincidrange"; the previous label had a stray 'g'.
+                        case "begincidrange":
                            break;
                    }
                }
@ -42,6 +74,66 @@
            return null;
        }

+        /// <summary>
+        /// Reads <paramref name="count"/> pairs of hex tokens from the scanner and stores them on the
+        /// builder as codespace ranges.
+        /// </summary>
+        /// <remarks>
+        /// The input has the shape:
+        /// <code>
+        /// 3 begincodespacerange
+        ///    &lt;00&gt;    &lt;80&gt;
+        ///    &lt;8140&gt;  &lt;9ffc&gt;
+        ///    &lt;a0&gt;    &lt;de&gt;
+        /// endcodespacerange
+        /// </code>
+        /// </remarks>
+        /// <exception cref="InvalidOperationException">The scanner ran out of tokens or produced a non-hex token.</exception>
+        private static void ParseCodespaceRange(NumericToken count, ITokenScanner tokenScanner, CharacterMapBuilder builder)
+        {
+            // Advances the scanner and returns the next token, which must be a hex token.
+            HexToken ReadBound()
+            {
+                if (tokenScanner.MoveNext() && tokenScanner.CurrentToken is HexToken bound)
+                {
+                    return bound;
+                }
+
+                throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
+            }
+
+            var parsed = new List<CodespaceRange>(count.Int);
+
+            for (var remaining = count.Int; remaining > 0; remaining--)
+            {
+                var lower = ReadBound();
+                var upper = ReadBound();
+
+                parsed.Add(new CodespaceRange(lower.Bytes, upper.Bytes));
+            }
+
+            builder.CodespaceRanges = parsed;
+        }
+
+        /// <summary>
+        /// Reads <paramref name="numeric"/> entries from the scanner, each a hex input code followed by
+        /// either a name token or a hex token, and registers every entry on the builder.
+        /// </summary>
+        /// <exception cref="InvalidOperationException">An entry is missing or is not of the expected token type.</exception>
+        private static void ParseBaseFontCharacters(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder)
+        {
+            // Every failure reports the entry index together with whatever token the scanner is positioned on.
+            InvalidOperationException InvalidItem(int position)
+            {
+                return new InvalidOperationException($"Base font characters definition contains invalid item at index {position}: {tokenScanner.CurrentToken}");
+            }
+
+            var index = 0;
+            while (index < numeric.Int)
+            {
+                if (!(tokenScanner.MoveNext() && tokenScanner.CurrentToken is HexToken inputCode))
+                {
+                    throw InvalidItem(index);
+                }
+
+                if (!tokenScanner.MoveNext())
+                {
+                    throw InvalidItem(index);
+                }
+
+                switch (tokenScanner.CurrentToken)
+                {
+                    case NameToken characterName:
+                        builder.AddBaseFontCharacter(inputCode.Bytes, characterName.Data.Name);
+                        break;
+                    case HexToken characterCode:
+                        builder.AddBaseFontCharacter(inputCode.Bytes, characterCode.Bytes);
+                        break;
+                    default:
+                        throw InvalidItem(index);
+                }
+
+                index++;
+            }
+        }
+
        private static void ParseName(NameToken nameToken, CoreTokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
        {
            switch (nameToken.Data.Name)
--- a/src/UglyToad.Pdf/Tokenization/HexTokenizer.cs
+++ b/src/UglyToad.Pdf/Tokenization/HexTokenizer.cs
@ -1,6 +1,6 @@
 namespace UglyToad.Pdf.Tokenization
 {
-    using System.Text;
+    using System.Collections.Generic;
    using IO;
    using Parser.Parts;
    using Tokens;
@ -18,7 +18,7 @@
                return false;
            }
            
-            var characters = new StringBuilder();
+            var characters = new List<char>();

            while (inputBytes.MoveNext())
            {
@ -39,10 +39,10 @@
                    return false;
                }

-                characters.Append((char)current);
+                characters.Add((char)current);
            }

-            token = new HexToken(characters.ToString());
+            token = new HexToken(characters);

            return true;
        }
--- a/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs
+++ b/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs
@ -1,30 +1,73 @@
 namespace UglyToad.Pdf.Tokenization.Tokens
 {
-    using System;
    using System.Collections.Generic;
    using System.Text;

    public class HexToken : IDataToken<string>
    {
+        // Lookup from a single hex digit character to its 4-bit value.
+        // Both upper and lower case letters are present; any other character is not a key.
+        private static readonly Dictionary<char, byte> HexMap = new Dictionary<char, byte>
+        {
+            {'0', 0x00 },
+            {'1', 0x01 },
+            {'2', 0x02 },
+            {'3', 0x03 },
+            {'4', 0x04 },
+            {'5', 0x05 },
+            {'6', 0x06 },
+            {'7', 0x07 },
+            {'8', 0x08 },
+            {'9', 0x09 },
+
+            {'A', 0x0A },
+            {'a', 0x0A },
+            {'B', 0x0B },
+            {'b', 0x0B },
+            {'C', 0x0C },
+            {'c', 0x0C },
+            {'D', 0x0D },
+            {'d', 0x0D },
+            {'E', 0x0E },
+            {'e', 0x0E },
+            {'F', 0x0F },
+            {'f', 0x0F }
+        };
+
+        /// <summary>
+        /// Combines two hex digit characters into one byte, with <paramref name="high"/> supplying the
+        /// upper four bits and <paramref name="low"/> the lower four.
+        /// </summary>
+        private static byte Convert(char high, char low)
+        {
+            var upperNibble = HexMap[high] << 4;
+            var lowerNibble = HexMap[low];
+
+            return (byte)(upperNibble | lowerNibble);
+        }
+
        public string Data { get; }

        public IReadOnlyList<byte> Bytes { get; }

-        public HexToken(string characters)
+        /// <summary>
+        /// Builds the token from hex digit characters, pairing them into bytes.
+        /// When the number of characters is odd, the final byte uses '0' as its low digit.
+        /// </summary>
+        /// <param name="characters">The hex digit characters, most significant digit of each byte first.</param>
+        public HexToken(IReadOnlyList<char> characters)
        {
-            if (characters.Length % 2 != 0)
-            {
-                characters += "0";
-            }
-
+            var bytes = new List<byte>();
            var builder = new StringBuilder();
-            byte[] raw = new byte[characters.Length / 2];
-            for (int i = 0; i < raw.Length; i++)
+
+            for (int i = 0; i < characters.Count; i += 2)
            {
-               builder.Append((char)Convert.ToByte(characters.Substring(i * 2, 2), 16));
+                char high = characters[i];
+                char low;
+                if (i == characters.Count - 1)
+                {
+                    // No partner digit is left for the last character, so pad with a zero digit.
+                    low = '0';
+                }
+                else
+                {
+                    low = characters[i + 1];
+                }
+
+                var b = Convert(high, low);
+                bytes.Add(b);
+                // Data holds each byte reinterpreted as a single char.
+                builder.Append((char)b);
            }

-            Bytes = raw;
+            Bytes = bytes;
            Data = builder.ToString();
        }
    }