Change hex token construction to use a lookup table for mapping characters to bytes. Add parsing for codespace ranges and base font characters in the CMap.

2025-07-17 04:08:29 +08:00 · 2017-11-14 22:58:06 +00:00 · 2017-11-14 22:58:06 +00:00 · 2e5aa37c85
commit 2e5aa37c85
parent 511385a253
6 changed files with 213 additions and 73 deletions
--- a/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs
+++ b/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs
@ -1,5 +1,10 @@
 namespace UglyToad.Pdf.Fonts.Cmap
 {
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using Util;
    /// <summary>
    /// A mutable class used when parsing and generating a <see cref="CMap"/>.
    /// </summary>
@ -44,5 +49,39 @@
        /// Defined as required.
        /// </remarks>
        public int Type { get; set; } = -1;
        /// <summary>
        /// The codespace ranges collected for the CMap being built. Not initialised here, so it is <c>null</c> until assigned.
        /// </summary>
        public IReadOnlyList<CodespaceRange> CodespaceRanges { get; set; }
        /// <summary>
        /// Maps an integer character code (built big-endian from the input code bytes) to the string it represents.
        /// </summary>
        public Dictionary<int, string> BaseFontCharacterMap { get; } = new Dictionary<int, string>();
        /// <summary>
        /// Registers a mapping from the input code <paramref name="bytes"/> to the text decoded from <paramref name="value"/>.
        /// </summary>
        /// <param name="bytes">The bytes of the input character code.</param>
        /// <param name="value">The bytes of the mapped value; decoded to a string before being stored.</param>
        public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, IReadOnlyList<byte> value)
        {
            var rawValue = value.ToArray();
            var text = CreateStringFromBytes(rawValue);

            AddBaseFontCharacter(bytes, text);
        }
        /// <summary>
        /// Registers a mapping from the input code <paramref name="bytes"/> to <paramref name="value"/>,
        /// replacing any value already stored for the same code.
        /// </summary>
        /// <param name="bytes">The bytes of the input character code; all of them are folded into the integer key.</param>
        /// <param name="value">The string stored for the code.</param>
        public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, string value)
        {
            var byteCount = bytes.Count;
            BaseFontCharacterMap[GetCodeFromArray(bytes, byteCount)] = value;
        }
        /// <summary>
        /// Folds the first <paramref name="length"/> bytes of <paramref name="data"/> into a single integer,
        /// treating the first byte as the most significant.
        /// </summary>
        /// <param name="data">The bytes to combine.</param>
        /// <param name="length">How many bytes from the start of <paramref name="data"/> to use.</param>
        /// <returns>The combined value; zero when <paramref name="length"/> is not positive.</returns>
        private int GetCodeFromArray(IReadOnlyList<byte> data, int length)
        {
            var result = 0;
            var position = 0;

            while (position < length)
            {
                // Make room for the next byte, then place it in the low 8 bits.
                // A byte is already in the range 0..255, so no sign correction is required.
                result = (result << 8) | data[position];
                position++;
            }

            return result;
        }
        /// <summary>
        /// Decodes mapped value bytes to a string: exactly one byte goes through
        /// <c>OtherEncodings.BytesAsLatin1String</c>, any other length is decoded as big-endian UTF-16.
        /// </summary>
        /// <param name="bytes">The raw value bytes.</param>
        /// <returns>The decoded string.</returns>
        private string CreateStringFromBytes(byte[] bytes)
        {
            if (bytes.Length != 1)
            {
                return Encoding.BigEndianUnicode.GetString(bytes);
            }

            return OtherEncodings.BytesAsLatin1String(bytes);
        }
    }
 }
--- a/src/UglyToad.Pdf/Fonts/Cmap/CmapUtils.cs
+++ b/src/UglyToad.Pdf/Fonts/Cmap/CmapUtils.cs
@ -4,7 +4,7 @@
    internal static class CmapUtils
    {
-        public static int ToInt(this byte[] data, int length)
+        public static int ToInt(this IReadOnlyList<byte> data, int length)
        {
            int code = 0;
            for (int i = 0; i < length; ++i)
--- a/src/UglyToad.Pdf/Fonts/Cmap/CodespaceRange.cs
+++ b/src/UglyToad.Pdf/Fonts/Cmap/CodespaceRange.cs
@ -1,68 +1,34 @@
-using System;
+namespace UglyToad.Pdf.Fonts.Cmap
 using System.Collections.Generic;
 using System.Text;
 namespace UglyToad.Pdf.Fonts.Cmap
 {
    using System.Collections.Generic;
    /// <summary>
    ///  A codespace range is specified by a pair of codes of some particular length giving the lower and upper bounds of that range.
    /// </summary>
    public class CodespaceRange
    {
-        private byte[] start;
+        public IReadOnlyList<byte> Start { get; }
        private byte[] end;
        private int startInt;
        private int endInt;
-        public int CodeLength { get; private set; }
+        public IReadOnlyList<byte> End { get; }
-        /**
+        public int StartInt { get; }
-         * Creates a new instance of CodespaceRange.
+
-         */
+        public int EndInt { get; }
-        public CodespaceRange()
+        
        public int CodeLength { get; }
        /// <summary>
        /// Creates a new instance of <see cref="CodespaceRange"/>.
        /// </summary>
        /// <param name="start">The bytes of the lower bound; their count becomes <see cref="CodeLength"/>.</param>
        /// <param name="end">The bytes of the upper bound.</param>
        public CodespaceRange(IReadOnlyList<byte> start, IReadOnlyList<byte> end)
        {
            // Lower bound: keep the raw bytes, their count, and the integer form used for range comparisons.
            Start = start;
            CodeLength = start.Count;
            StartInt = start.ToInt(CodeLength);

            // Upper bound: raw bytes plus integer form computed over its own length.
            End = end;
            EndInt = end.ToInt(end.Count);
        }
        /// <summary>
        /// Gets the bytes of the upper bound of the range.
        /// </summary>
        /// <returns>The current value of the <c>end</c> field.</returns>
        public byte[] getEnd()
        {
            return end;
        }
        /// <summary>
        /// Sets the bytes of the upper bound of the range and caches their integer form in <c>endInt</c>.
        /// </summary>
        /// <param name="endBytes">The new upper bound bytes; all of them are used for the integer form.</param>
        void setEnd(byte[] endBytes)
        {
            end = endBytes;
            endInt = endBytes.ToInt(endBytes.Length);
        }
        /// <summary>
        /// Gets the bytes of the lower bound of the range.
        /// </summary>
        /// <returns>The current value of the <c>start</c> field.</returns>
        public byte[] getStart()
        {
            return start;
        }
        /// <summary>
        /// Sets the bytes of the lower bound of the range, records their count as the code length
        /// and caches their integer form in <c>startInt</c>.
        /// </summary>
        /// <param name="startBytes">The new lower bound bytes; all of them are used for the integer form.</param>
        void setStart(byte[] startBytes)
        {
            start = startBytes;
            CodeLength = start.Length;
            startInt = startBytes.ToInt(startBytes.Length);
        }
        /**
         * Returns true if the given code bytes match this codespace range.
         */
@ -80,7 +46,7 @@ namespace UglyToad.Pdf.Fonts.Cmap
            if (codeLen == CodeLength)
            {
                int value = code.ToInt(codeLen);
-                if (value >= startInt && value <= endInt)
+                if (value >= StartInt && value <= EndInt)
                {
                    return true;
                }
--- a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs
@ -1,6 +1,7 @@
 namespace UglyToad.Pdf.Fonts.Parser
 {
    using System;
    using System.Collections.Generic;
    using System.Globalization;
    using Cmap;
    using Cos;
@ -27,7 +28,38 @@
                {
                    switch (operatorToken.Data)
                    {
-                        default:
+                        case "usecmap":
                            throw new NotImplementedException("External CMap files not yet supported, please submit a pull request!");
                        case "begincodespacerange":
                            {
                                if (previousToken is NumericToken numeric)
                                {
                                    ParseCodespaceRange(numeric, scanner, builder);
                                }
                                else
                                {
                                    throw new InvalidOperationException("Unexpected token preceding start of codespace range: " + previousToken);
                                }
                            }
                            break;
                        case "beginbfchar":
                            {
                                if (previousToken is NumericToken numeric)
                                {
                                    ParseBaseFontCharacters(numeric, scanner, builder);
                                }
                                else
                                {
                                    throw new InvalidOperationException("Unexpected token preceding start of base font characters: " + previousToken);
                                }
                            }
                            break;
                        case "beginbfrange":
                            break;
                        case "begincidchar":
                            break;
                        // "begincidrange" opens a CID range section; the previous label was misspelled
                        // ("begingcidrange") and so could never match. Handling is not implemented yet.
                        case "begincidrange":
                            break;
                    }
                }
@ -42,6 +74,66 @@
            return null;
        }
        /// <summary>
        /// Reads the start/end pairs that follow a <c>begincodespacerange</c> operator and records them on the builder.
        /// </summary>
        /// <param name="count">The numeric token preceding the operator; its integer value is the number of pairs to read.</param>
        /// <param name="tokenScanner">The scanner the pairs are read from; it is advanced twice per pair.</param>
        /// <param name="builder">The builder whose <see cref="CharacterMapBuilder.CodespaceRanges"/> receives the parsed ranges.</param>
        /// <exception cref="InvalidOperationException">A start or end entry is missing or is not a hex token.</exception>
        private static void ParseCodespaceRange(NumericToken count, ITokenScanner tokenScanner, CharacterMapBuilder builder)
        {
            /*
             * For example:
             3 begincodespacerange
                <00>    <80>
                <8140>  <9ffc>
                <a0>    <de>
             endcodespacerange
             */
            var ranges = new List<CodespaceRange>(count.Int);
            for (var i = 0; i < count.Int; i++)
            {
                if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken start))
                {
                    throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
                }
                if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken end))
                {
                    throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
                }
                ranges.Add(new CodespaceRange(start.Bytes, end.Bytes));
            }

            // A CMap may contain more than one codespace range section (each section is limited in size),
            // so append to any ranges parsed earlier instead of discarding them.
            var existing = builder.CodespaceRanges;
            if (existing != null && existing.Count > 0)
            {
                var combined = new List<CodespaceRange>(existing.Count + ranges.Count);
                combined.AddRange(existing);
                combined.AddRange(ranges);
                ranges = combined;
            }

            builder.CodespaceRanges = ranges;
        }
        /// <summary>
        /// Reads the code/value pairs that follow a <c>beginbfchar</c> operator and adds each one to the builder.
        /// </summary>
        /// <param name="numeric">The token preceding the operator; its integer value is the number of pairs to read.</param>
        /// <param name="tokenScanner">The scanner the pairs are read from; it is advanced twice per pair.</param>
        /// <param name="builder">The builder each mapping is added to.</param>
        /// <exception cref="InvalidOperationException">
        /// The input code is missing or not a hex token, or the mapped value is missing or neither a name nor a hex token.
        /// </exception>
        private static void ParseBaseFontCharacters(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder)
        {
            // Every failure reports the same message, built from whatever token the scanner currently exposes.
            InvalidOperationException InvalidItem(int position)
            {
                return new InvalidOperationException($"Base font characters definition contains invalid item at index {position}: {tokenScanner.CurrentToken}");
            }

            for (var index = 0; index < numeric.Int; index++)
            {
                // First entry of the pair: the input code, which must be a hex token.
                HexToken source = null;
                if (tokenScanner.MoveNext())
                {
                    source = tokenScanner.CurrentToken as HexToken;
                }

                if (source is null)
                {
                    throw InvalidItem(index);
                }

                // Second entry of the pair: the mapped value, either a name or a hex token.
                if (!tokenScanner.MoveNext())
                {
                    throw InvalidItem(index);
                }

                switch (tokenScanner.CurrentToken)
                {
                    case NameToken named:
                        builder.AddBaseFontCharacter(source.Bytes, named.Data.Name);
                        break;
                    case HexToken hex:
                        builder.AddBaseFontCharacter(source.Bytes, hex.Bytes);
                        break;
                    default:
                        throw InvalidItem(index);
                }
            }
        }
        private static void ParseName(NameToken nameToken, CoreTokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
        {
            switch (nameToken.Data.Name)
--- a/src/UglyToad.Pdf/Tokenization/HexTokenizer.cs
+++ b/src/UglyToad.Pdf/Tokenization/HexTokenizer.cs
@ -1,6 +1,6 @@
 namespace UglyToad.Pdf.Tokenization
 {
-    using System.Text;
+    using System.Collections.Generic;
    using IO;
    using Parser.Parts;
    using Tokens;
@ -17,8 +17,8 @@
            {
                return false;
            }
-
+            
-            var characters = new StringBuilder();
+            var characters = new List<char>();
            while (inputBytes.MoveNext())
            {
@ -39,10 +39,10 @@
                    return false;
                }
-                characters.Append((char)current);
+                characters.Add((char)current);
            }
-            token = new HexToken(characters.ToString());
+            token = new HexToken(characters);
            return true;
        }
--- a/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs
+++ b/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs
@ -1,30 +1,73 @@
 namespace UglyToad.Pdf.Tokenization.Tokens
 {
    using System;
    using System.Collections.Generic;
    using System.Text;
    public class HexToken : IDataToken<string>
    {
        // Lookup from a single hexadecimal digit character (upper or lower case) to its 4-bit value.
        private static readonly Dictionary<char, byte> HexMap = new Dictionary<char, byte>
        {
            ['0'] = 0x00,
            ['1'] = 0x01,
            ['2'] = 0x02,
            ['3'] = 0x03,
            ['4'] = 0x04,
            ['5'] = 0x05,
            ['6'] = 0x06,
            ['7'] = 0x07,
            ['8'] = 0x08,
            ['9'] = 0x09,
            ['A'] = 0x0A,
            ['a'] = 0x0A,
            ['B'] = 0x0B,
            ['b'] = 0x0B,
            ['C'] = 0x0C,
            ['c'] = 0x0C,
            ['D'] = 0x0D,
            ['d'] = 0x0D,
            ['E'] = 0x0E,
            ['e'] = 0x0E,
            ['F'] = 0x0F,
            ['f'] = 0x0F
        };
        /// <summary>
        /// Combines two hexadecimal digit characters into one byte.
        /// </summary>
        /// <param name="high">The digit supplying the upper four bits.</param>
        /// <param name="low">The digit supplying the lower four bits.</param>
        /// <returns>The byte formed from the two digits.</returns>
        /// <exception cref="KeyNotFoundException">Either character is not a key in the hex lookup.</exception>
        private static byte Convert(char high, char low)
        {
            // Look up the high digit first, then the low digit; shift the high nibble into place and merge.
            int combined = (HexMap[high] << 4) | HexMap[low];

            return (byte)combined;
        }
        /// <summary>
        /// The decoded bytes as a string, one character per byte (each byte value cast directly to a <see cref="char"/>).
        /// </summary>
        public string Data { get; }
        /// <summary>
        /// The bytes decoded from the hexadecimal characters, two characters per byte;
        /// a trailing unpaired character is treated as if followed by '0'.
        /// </summary>
        public IReadOnlyList<byte> Bytes { get; }
-        public HexToken(string characters)
+        public HexToken(IReadOnlyList<char> characters)
        {
-            if (characters.Length % 2 != 0)
+            var bytes = new List<byte>();
            {
                characters += "0";
            }
            var builder = new StringBuilder();
-            byte[] raw = new byte[characters.Length / 2];
+
-            for (int i = 0; i < raw.Length; i++)
+            for (int i = 0; i < characters.Count; i += 2)
            {
-               builder.Append((char)Convert.ToByte(characters.Substring(i * 2, 2), 16));
+                char high = characters[i];
                char low;
                if (i == characters.Count - 1)
                {
                    low = '0';
                }
                else
                {
                    low = characters[i + 1];
                }
                var b = Convert(high, low);
                bytes.Add(b);
                builder.Append((char)b);
            }
-            Bytes = raw;
+            Bytes = bytes;
            Data = builder.ToString();
        }
    }