Split out classes for parsing the CMap format and add assertions to the tests. Add a bytes-to-int conversion method for the hex token, with tests.

2026-03-10 00:23:29 +08:00 · 2017-11-20 16:42:18 +00:00
parent 0fd433240b
commit 4b91300466
14 changed files with 500 additions and 537 deletions
--- a/src/UglyToad.Pdf.Tests/Fonts/Parser/CMapParserTests.cs
+++ b/src/UglyToad.Pdf.Tests/Fonts/Parser/CMapParserTests.cs
@@ -40,11 +40,49 @@ end";
        private readonly CMapParser cMapParser = new CMapParser(); 

        [Fact]
-        public void CanParseCMap()
+        public void CanParseCidSystemInfoAndOtherInformation()
        {
            var input = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false);

            var cmap = cMapParser.Parse(input.Bytes, false);
+
+            Assert.Equal("Adobe", cmap.Info.Registry);
+            Assert.Equal("UCS", cmap.Info.Ordering);
+            Assert.Equal(0, cmap.Info.Supplement);
+
+            Assert.Equal("Adobe-Identity-UCS", cmap.Name);
+            Assert.Equal(2, cmap.Type);
+        }
+
+        [Fact]
+        public void CanParseCodespaceRange()
+        {
+            var input = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false);
+
+            var cmap = cMapParser.Parse(input.Bytes, false);
+
+            Assert.Equal(1, cmap.CodespaceRanges.Count);
+
+            Assert.Equal(0, cmap.CodespaceRanges[0].StartInt);
+            Assert.Equal(65535, cmap.CodespaceRanges[0].EndInt);
+            Assert.Equal(2, cmap.CodespaceRanges[0].CodeLength);
+        }
+
+        [Fact]
+        public void CanParseBaseFontCharacters()
+        {
+            var input = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false);
+
+            var cmap = cMapParser.Parse(input.Bytes, false);
+
+            Assert.True(cmap.BaseFontCharacterMap.Count >= 6);
+
+            Assert.Equal(" ", cmap.BaseFontCharacterMap[3]);
+            Assert.Equal(".", cmap.BaseFontCharacterMap[17]);
+            Assert.Equal("A", cmap.BaseFontCharacterMap[36]);
+            Assert.Equal("T", cmap.BaseFontCharacterMap[55]);
+            Assert.Equal("a", cmap.BaseFontCharacterMap[68]);
+            Assert.Equal("x", cmap.BaseFontCharacterMap[91]);
        }
    }
 }
--- a/src/UglyToad.Pdf.Tests/Tokenization/Tokens/HexTokenTests.cs
+++ b/src/UglyToad.Pdf.Tests/Tokenization/Tokens/HexTokenTests.cs
@@ -0,0 +1,36 @@
+namespace UglyToad.Pdf.Tests.Tokenization.Tokens
+{
+    using Pdf.Tokenization.Tokens;
+    using Xunit;
+
+    public class HexTokenTests
+    {
+        [Theory]
+        [InlineData("AE", "®")]
+        [InlineData("61", "a")]
+        [InlineData("0061", "\0a")]
+        [InlineData("7465787420736f", "text so")]
+        public void MapsCorrectlyToString(string input, string expected)
+        {
+            var token = new HexToken(input.ToCharArray());
+
+            Assert.Equal(expected, token.Data);
+        }
+
+        [Theory]
+        [InlineData("0003", 3)]
+        [InlineData("0011", 17)]
+        [InlineData("0024", 36)]
+        [InlineData("0037", 55)]
+        [InlineData("0044", 68)]
+        [InlineData("005B", 91)]
+        public void MapsCorrectlyToInt(string input, int expected)
+        {
+            var token = new HexToken(input.ToCharArray());
+
+            var value = HexToken.ConvertHexBytesToInt(token);
+
+            Assert.Equal(expected, value);
+        }
+    }
+}
--- a/src/UglyToad.Pdf/Fonts/Cmap/CharacterIdentifierSystemInfo.cs
+++ b/src/UglyToad.Pdf/Fonts/Cmap/CharacterIdentifierSystemInfo.cs
@@ -26,5 +26,10 @@
            Ordering = ordering;
            Supplement = supplement;
        }
+
+        public override string ToString()
+        {
+            return $"{Registry} | {Ordering} | {Supplement}";
+        }
    }
 }
--- a/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs
+++ b/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs
@@ -54,6 +54,8 @@

        public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; set; }

+        public IReadOnlyList<CidRange> CidRanges { get; set; }
+
        public Dictionary<int, string> BaseFontCharacterMap { get; } = new Dictionary<int, string>();

        public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, IReadOnlyList<byte> value)
@@ -68,6 +70,15 @@
            BaseFontCharacterMap[code] = value;
        }

+        public CMap Build()
+        {
+            return new CMap(CharacterIdentifierSystemInfo, Type, WMode, Name, Version,
+                BaseFontCharacterMap ?? new Dictionary<int, string>(),
+                CodespaceRanges ?? new CodespaceRange[0],
+                CidRanges ?? new CidRange[0],
+                CidCharacterMappings ?? new CidCharacterMapping[0]);
+        }
+
        private int GetCodeFromArray(IReadOnlyList<byte> data, int length)
        {
            int code = 0;
@@ -79,7 +90,7 @@
            return code;
        }

-        private string CreateStringFromBytes(byte[] bytes)
+        private static string CreateStringFromBytes(byte[] bytes)
        {
            return bytes.Length == 1
                ? OtherEncodings.BytesAsLatin1String(bytes)
--- a/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs
+++ b/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs
@@ -1,11 +1,50 @@
-using System;
-using System.Collections.Generic;
-using System.Text;
-
-namespace UglyToad.Pdf.Fonts.Cmap
+namespace UglyToad.Pdf.Fonts.Cmap
 {
+    using System;
+    using System.Collections.Generic;
+    using Util.JetBrains.Annotations;
+
    public class CMap
    {
+        public CharacterIdentifierSystemInfo Info { get; }
+
+        public int Type { get; }
+
+        public int WMode { get; }
+
+        public string Name { get; }
+
+        public string Version { get; }
+
+        [NotNull]
+        public IReadOnlyDictionary<int, string> BaseFontCharacterMap { get; }
+
+        [NotNull]
+        public IReadOnlyList<CodespaceRange> CodespaceRanges { get; }
+
+        [NotNull]
+        public IReadOnlyList<CidRange> CidRanges { get; }
+
+        [NotNull]
+        public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; }
+
+        public bool HasCidMappings => CidCharacterMappings.Count > 0 || CidRanges.Count > 0;
+
+        public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0;
+
+        public CMap(CharacterIdentifierSystemInfo info, int type, int wMode, string name, string version, IReadOnlyDictionary<int, string> baseFontCharacterMap, IReadOnlyList<CodespaceRange> codespaceRanges, IReadOnlyList<CidRange> cidRanges, IReadOnlyList<CidCharacterMapping> cidCharacterMappings)
+        {
+            Info = info;
+            Type = type;
+            WMode = wMode;
+            Name = name;
+            Version = version;
+            BaseFontCharacterMap = baseFontCharacterMap ?? throw new ArgumentNullException(nameof(baseFontCharacterMap));
+            CodespaceRanges = codespaceRanges ?? throw new ArgumentNullException(nameof(codespaceRanges));
+            CidRanges = cidRanges ?? throw new ArgumentNullException(nameof(cidRanges));
+            CidCharacterMappings = cidCharacterMappings ?? throw new ArgumentNullException(nameof(cidCharacterMappings));
+        }
+
        private int wmode = 0;
        private string cmapName = null;
        private string cmapVersion = null;
@@ -17,13 +56,7 @@ namespace UglyToad.Pdf.Fonts.Cmap

        private int minCodeLength = 4;
        private int maxCodeLength;
-
-        // code lengths
-        private readonly List<CodespaceRange> codespaceRanges = new List<CodespaceRange>();
-
-        // Unicode mappings
-        private readonly Dictionary<int, string> charToUnicode = new Dictionary<int, string>();
-
+        
        // CID mappings
        private readonly Dictionary<int, int> codeToCid = new Dictionary<int, int>();
        private readonly List<CidRange> codeToCidRanges = new List<CidRange>();
@@ -31,44 +64,17 @@ namespace UglyToad.Pdf.Fonts.Cmap
        private static readonly string SPACE = " ";
        private int spaceMapping = -1;

-        /**
-         * Creates a new instance of CMap.
-         */
-        public CMap()
+        /// <summary>
+        /// Returns the sequence of Unicode characters for the given character code.
+        /// </summary>
+        /// <param name="code">Character code</param>
+        /// <param name="result">Unicode characters (may be more than one, e.g. the "fi" ligature).</param>
+        /// <returns><see langword="true"/> if this character map contains an entry for this code, <see langword="false"/> otherwise.</returns>
+        public bool TryConvertToUnicode(int code, out string result)
        {
-        }
+            var found = BaseFontCharacterMap.TryGetValue(code, out result);

-        /**
-         * This will tell if this cmap has any CID mappings.
-         * 
-         * @return true If there are any CID mappings, false otherwise.
-         */
-        public bool hasCIDMappings()
-        {
-            return codeToCid.Count > 0 || codeToCidRanges.Count > 0;
-        }
-
-        /**
-         * This will tell if this cmap has any Unicode mappings.
-         *
-         * @return true If there are any Unicode mappings, false otherwise.
-         */
-        public bool hasUnicodeMappings()
-        {
-            return charToUnicode.Count > 0;
-        }
-
-        /**
-         * Returns the sequence of Unicode characters for the given character code.
-         *
-         * @param code character code
-         * @return Unicode characters (may be more than one, e.g "fi" ligature)
-         */
-        public string toUnicode(int code)
-        {
-            charToUnicode.TryGetValue(code, out var result);
-
-            return result;
+            return found;
        }

        /**
@@ -102,27 +108,14 @@ namespace UglyToad.Pdf.Fonts.Cmap
        //    throw new InvalidOperationException("CMap is invalid");
        //}

-        /**
-         * Returns an int for the given byte array
-         */
-        static int toInt(byte[] data, int dataLen)
-        {
-            int code = 0;
-            for (int i = 0; i < dataLen; ++i)
-            {
-                code <<= 8;
-                code |= (data[i] & 0xFF);
-            }
-            return code;
-        }
-
+       
        /**
         * Returns the CID for the given character code.
         *
         * @param code character code
         * @return CID
         */
-        public int toCID(int code)
+        public int ConvertToCid(int code)
        {
            if (codeToCid.TryGetValue(code, out var cid))
            {
@@ -137,251 +130,11 @@ namespace UglyToad.Pdf.Fonts.Cmap
                    return ch;
                }
            }
+
            return 0;
        }
-
-        /**
-         * Convert the given part of a byte array to an int.
-         * @param data the byte array
-         * @param offset The offset into the byte array.
-         * @param length The length of the data we are getting.
-         * @return the resulting int
-         */
-        private int getCodeFromArray(byte[] data, int offset, int length)
-        {
-            int code = 0;
-            for (int i = 0; i < length; i++)
-            {
-                code <<= 8;
-                code |= (data[offset + i] + 256) % 256;
-            }
-            return code;
-        }
-
-        /**
-         * This will add a character code to Unicode character sequence mapping.
-         *
-         * @param codes The character codes to map from.
-         * @param unicode The Unicode characters to map to.
-         */
-        void addCharMapping(byte[] codes, string unicode)
-        {
-            int code = getCodeFromArray(codes, 0, codes.Length);
-            charToUnicode[code] = unicode;
-
-            // fixme: ugly little hack
-            if (SPACE.Equals(unicode))
-            {
-                spaceMapping = code;
-            }
-        }
-
-        /**
-         * This will add a CID mapping.
-         *
-         * @param code character code
-         * @param cid CID
-         */
-        void addCIDMapping(int code, int cid)
-        {
-            codeToCid[cid] = code;
-        }
-
-        /**
-         * This will add a CID Range.
-         *
-         * @param from starting charactor of the CID range.
-         * @param to ending character of the CID range.
-         * @param cid the cid to be started with.
-         *
-         */
-        void addCIDRange(char from, char to, int cid)
-        {
-            codeToCidRanges.Add(new CidRange(from, to, cid));
-        }
-
-        /**
-         * This will add a codespace range.
-         *
-         * @param range A single codespace range.
-         */
-        void addCodespaceRange(CodespaceRange range)
-        {
-            codespaceRanges.Add(range);
-            maxCodeLength = Math.Max(maxCodeLength, range.CodeLength);
-            minCodeLength = Math.Min(minCodeLength, range.CodeLength);
-        }
-
-        /**
-         * Implementation of the usecmap operator.  This will
-         * copy all of the mappings from one cmap to another.
-         * 
-         * @param cmap The cmap to load mappings from.
-         */
-        private void useCmap(CMap cmap)
-        {
-            foreach (CodespaceRange codespaceRange in cmap.codespaceRanges)
-            {
-                addCodespaceRange(codespaceRange);
-            }
-            charToUnicode.PutAll(cmap.charToUnicode);
-            codeToCid.PutAll(cmap.codeToCid);
-            codeToCidRanges.AddRange(cmap.codeToCidRanges);
-        }
-
-        /**
-         * Returns the WMode of a CMap.
-         *
-         * 0 represents a horizontal and 1 represents a vertical orientation.
-         * 
-         * @return the wmode
-         */
-        public int getWMode()
-        {
-            return wmode;
-        }
-
-        /**
-         * Sets the WMode of a CMap.
-         * 
-         * @param newWMode the new WMode.
-         */
-        public void setWMode(int newWMode)
-        {
-            wmode = newWMode;
-        }
-
-        /**
-         * Returns the name of the CMap.
-         * 
-         * @return the CMap name.
-         */
-        public string getName()
-        {
-            return cmapName;
-        }
-
-        /**
-         * Sets the name of the CMap.
-         * 
-         * @param name the CMap name.
-         */
-        public void setName(string name)
-        {
-            cmapName = name;
-        }
-
-        /**
-         * Returns the version of the CMap.
-         * 
-         * @return the CMap version.
-         */
-        public string getVersion()
-        {
-            return cmapVersion;
-        }
-
-        /**
-         * Sets the version of the CMap.
-         * 
-         * @param version the CMap version.
-         */
-        public void setVersion(string version)
-        {
-            cmapVersion = version;
-        }
-
-        /**
-         * Returns the type of the CMap.
-         * 
-         * @return the CMap type.
-         */
-        public int getType()
-        {
-            return cmapType;
-        }
-
-        /**
-         * Sets the type of the CMap.
-         * 
-         * @param type the CMap type.
-         */
-        public void setType(int type)
-        {
-            cmapType = type;
-        }
-
-        /**
-         * Returns the registry of the CIDSystemInfo.
-         * 
-         * @return the registry.
-         */
-        public string getRegistry()
-        {
-            return registry;
-        }
-
-        /**
-         * Sets the registry of the CIDSystemInfo.
-         * 
-         * @param newRegistry the registry.
-         */
-        public void setRegistry(string newRegistry)
-        {
-            registry = newRegistry;
-        }
-
-        /**
-         * Returns the ordering of the CIDSystemInfo.
-         * 
-         * @return the ordering.
-         */
-        public string getOrdering()
-        {
-            return ordering;
-        }
-
-        /**
-         * Sets the ordering of the CIDSystemInfo.
-         * 
-         * @param newOrdering the ordering.
-         */
-        public void setOrdering(string newOrdering)
-        {
-            ordering = newOrdering;
-        }
-
-        /**
-         * Returns the supplement of the CIDSystemInfo.
-         * 
-         * @return the supplement.
-         */
-        public int getSupplement()
-        {
-            return supplement;
-        }
-
-        /**
-         * Sets the supplement of the CIDSystemInfo.
-         * 
-         * @param newSupplement the supplement.
-         */
-        public void setSupplement(int newSupplement)
-        {
-            supplement = newSupplement;
-        }
-
-        /** 
-         * Returns the mapping for the space character.
-         * 
-         * @return the mapped code for the space character
-         */
-        public int getSpaceMapping()
-        {
-            return spaceMapping;
-        }
-
-
+        
+        
        public override string ToString()
        {
            return cmapName;
--- a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs
@@ -1,23 +1,26 @@
 namespace UglyToad.Pdf.Fonts.Parser
 {
    using System;
-    using System.Collections.Generic;
-    using System.Globalization;
    using Cmap;
-    using Cos;
    using IO;
+    using Parts;
    using Tokenization.Scanner;
    using Tokenization.Tokens;
-    using Util.JetBrains.Annotations;

    public class CMapParser
    {
+        private static readonly BaseFontRangeParser BaseFontRangeParser = new BaseFontRangeParser();
+        private static readonly BaseFontCharacterParser BaseFontCharacterParser = new BaseFontCharacterParser();
+        private static readonly CidRangeParser CidRangeParser = new CidRangeParser();
+        private static readonly CidFontNameParser CidFontNameParser = new CidFontNameParser();
+        private static readonly CodespaceRangeParser CodespaceRangeParser = new CodespaceRangeParser();
+        private static readonly CidCharacterParser CidCharacterParser = new CidCharacterParser();
+
        public CMap Parse(IInputBytes inputBytes, bool isLenientParsing)
        {
            var scanner = new CoreTokenScanner(inputBytes);

            var builder = new CharacterMapBuilder();
-            var result = new CMap();

            IToken previousToken = null;
            while (scanner.MoveNext())
@@ -34,20 +37,19 @@
                            {
                                if (previousToken is NumericToken numeric)
                                {
-                                    ParseCodespaceRange(numeric, scanner, builder);
+                                    CodespaceRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
                                }
                                else
                                {
                                    throw new InvalidOperationException("Unexpected token preceding start of codespace range: " + previousToken);
                                }
-
                            }
                            break;
                        case "beginbfchar":
                            {
                                if (previousToken is NumericToken numeric)
                                {
-                                    ParseBaseFontCharacters(numeric, scanner, builder);
+                                    BaseFontCharacterParser.Parse(numeric, scanner, builder, isLenientParsing);
                                }
                                else
                                {
@@ -59,8 +61,7 @@
                            {
                                if (previousToken is NumericToken numeric)
                                {
-                                    var parser = new BaseFontRangeParser();
-                                    parser.Parse(numeric, scanner, builder);
+                                    BaseFontRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
                                }
                                else
                                {
@@ -72,9 +73,7 @@
                            {
                                if (previousToken is NumericToken numeric)
                                {
-                                    var characters = ParseCidCharacters(numeric, scanner);
-
-                                    builder.CidCharacterMappings = characters;
+                                    CidCharacterParser.Parse(numeric, scanner, builder, isLenientParsing);
                                }
                                else
                                {
@@ -83,229 +82,28 @@
                                break;
                            }
                        case "begincidrange":
+                            {
+                                if (previousToken is NumericToken numeric)
+                                {
+                                    CidRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
+                                }
+                                else
+                                {
+                                    throw new InvalidOperationException("Unexpected token preceding start of Cid ranges: " + previousToken);
+                                }
+                            }
                            break;
                    }
                }
                else if (token is NameToken name)
                {
-                    ParseName(name, scanner, builder, isLenientParsing);
+                    CidFontNameParser.Parse(name, scanner, builder, isLenientParsing);
                }

                previousToken = token;
            }

-            return null;
-        }
-
-        private static void ParseCodespaceRange(NumericToken count, ITokenScanner tokenScanner, CharacterMapBuilder builder)
-        {
-            /*
-             * For example:
-             3 begincodespacerange
-                <00>    <80>
-                <8140>  <9ffc>
-                <a0>    <de>
-             endcodespacerange
-             */
-
-            var ranges = new List<CodespaceRange>(count.Int);
-
-            for (var i = 0; i < count.Int; i++)
-            {
-                if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken start))
-                {
-                    throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
-                }
-
-                if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken end))
-                {
-                    throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
-                }
-
-                ranges.Add(new CodespaceRange(start.Bytes, end.Bytes));
-            }
-
-            builder.CodespaceRanges = ranges;
-        }
-
-        private static void ParseBaseFontCharacters(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder)
-        {
-            for (var i = 0; i < numeric.Int; i++)
-            {
-                if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken inputCode))
-                {
-                    throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
-                }
-
-                if (!tokenScanner.MoveNext())
-                {
-                    throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
-                }
-
-                if (tokenScanner.CurrentToken is NameToken characterName)
-                {
-                    builder.AddBaseFontCharacter(inputCode.Bytes, characterName.Data.Name);
-                }
-                else if (tokenScanner.CurrentToken is HexToken characterCode)
-                {
-                    builder.AddBaseFontCharacter(inputCode.Bytes, characterCode.Bytes);
-                }
-                else
-                {
-                    throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
-                }
-            }
-        }
-
-        private static IReadOnlyList<CidCharacterMapping> ParseCidCharacters(NumericToken numeric, ITokenScanner scanner)
-        {
-            var results = new List<CidCharacterMapping>();
-
-            for (var i = 0; i < numeric.Int; i++)
-            {
-                if (!scanner.TryReadToken(out HexToken sourceCode))
-                {
-                    throw new InvalidOperationException("The first token in a line for Cid Characters should be a hex, instead it was: " + scanner.CurrentToken);
-                }
-
-                if (!scanner.TryReadToken(out NumericToken destinationCode))
-                {
-                    throw new InvalidOperationException("The destination token in a line for Cid Character should be an integer, instead it was: " + scanner.CurrentToken);
-                }
-
-                var sourceInteger = sourceCode.Bytes.ToInt(sourceCode.Bytes.Count);
-                var mapping = new CidCharacterMapping(sourceInteger, destinationCode.Int);
-
-                results.Add(mapping);
-            }
-
-            return results;
-        }
-
-        private static void ParseName(NameToken nameToken, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
-        {
-            switch (nameToken.Data.Name)
-            {
-                case "WMode":
-                    {
-                        var next = TryMoveNext(scanner);
-                        if (next is NumericToken numeric)
-                        {
-                            builder.WMode = numeric.Int;
-                        }
-                        break;
-                    }
-                case "CMapName":
-                    {
-                        var next = TryMoveNext(scanner);
-                        if (next is NameToken name)
-                        {
-                            builder.Name = name.Data.Name;
-                        }
-                        break;
-                    }
-                case "CMapVersion":
-                    {
-                        var next = TryMoveNext(scanner);
-                        if (next is NumericToken number)
-                        {
-                            builder.Version = number.Data.ToString(NumberFormatInfo.InvariantInfo);
-                        }
-                        else if (next is StringToken stringToken)
-                        {
-                            builder.Version = stringToken.Data;
-                        }
-                        break;
-                    }
-                case "CMapType":
-                    {
-                        var next = TryMoveNext(scanner);
-                        if (next is NumericToken numeric)
-                        {
-                            builder.Type = numeric.Int;
-                        }
-                        break;
-                    }
-                case "Registry":
-                    {
-                        throw new NotImplementedException("Registry should be in a dictionary");
-                    }
-                case "Ordering":
-                    {
-                        throw new NotImplementedException("Ordering should be in a dictionary");
-                    }
-                case "Supplement":
-                    {
-                        throw new NotImplementedException("Supplement should be in a dictionary");
-                    }
-                case "CIDSystemInfo":
-                    {
-                        var next = TryMoveNext(scanner);
-
-                        if (next is DictionaryToken dictionary)
-                        {
-                            builder.CharacterIdentifierSystemInfo = GetCharacterIdentifier(dictionary, isLenientParsing);
-                        }
-                        break;
-                    }
-            }
-        }
-
-        private static CharacterIdentifierSystemInfo GetCharacterIdentifier(DictionaryToken dictionary, bool isLenientParsing)
-        {
-            string GetErrorMessage(string missingKey)
-            {
-                return $"No {missingKey} found in the CIDSystemInfo dictionary: " + dictionary;
-            }
-
-            if (!dictionary.TryGetByName(CosName.REGISTRY, out var registry) || !(registry is StringToken registryString))
-            {
-                if (isLenientParsing)
-                {
-                    registryString = new StringToken("Adobe");
-                }
-                else
-                {
-                    throw new InvalidOperationException(GetErrorMessage("registry"));
-                }
-            }
-
-            if (!dictionary.TryGetByName(CosName.ORDERING, out var ordering) || !(ordering is StringToken orderingString))
-            {
-                if (isLenientParsing)
-                {
-                    orderingString = new StringToken("");
-                }
-                else
-                {
-                    throw new InvalidOperationException(GetErrorMessage("ordering"));
-                }
-            }
-
-            if (!dictionary.TryGetByName(CosName.SUPPLEMENT, out var supplement) || !(supplement is NumericToken supplementNumeric))
-            {
-                if (isLenientParsing)
-                {
-                    supplementNumeric = new NumericToken(0);
-                }
-                else
-                {
-                    throw new InvalidOperationException(GetErrorMessage("supplement"));
-                }
-            }
-
-            return new CharacterIdentifierSystemInfo(registryString.Data, orderingString.Data, supplementNumeric.Int);
-        }
-
-        [CanBeNull]
-        private static IToken TryMoveNext(ITokenScanner scanner)
-        {
-            if (!scanner.MoveNext())
-            {
-                return null;
-            }
-
-            return scanner.CurrentToken;
+            return builder.Build();
        }
    }
 }
--- a/src/UglyToad.Pdf/Fonts/Parser/Parts/BaseFontCharacterParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/BaseFontCharacterParser.cs
@@ -0,0 +1,39 @@
+namespace UglyToad.Pdf.Fonts.Parser.Parts
+{
+    using System;
+    using Cmap;
+    using Tokenization.Scanner;
+    using Tokenization.Tokens;
+
+    internal class BaseFontCharacterParser : ICidFontPartParser<NumericToken>
+    {
+        public void Parse(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing)
+        {
+            for (var i = 0; i < numeric.Int; i++)
+            {
+                if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken inputCode))
+                {
+                    throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
+                }
+
+                if (!tokenScanner.MoveNext())
+                {
+                    throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
+                }
+
+                if (tokenScanner.CurrentToken is NameToken characterName)
+                {
+                    builder.AddBaseFontCharacter(inputCode.Bytes, characterName.Data.Name);
+                }
+                else if (tokenScanner.CurrentToken is HexToken characterCode)
+                {
+                    builder.AddBaseFontCharacter(inputCode.Bytes, characterCode.Bytes);
+                }
+                else
+                {
+                    throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
+                }
+            }
+        }
+    }
+}
--- a/src/UglyToad.Pdf/Fonts/Parser/Parts/BaseFontRangeParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/BaseFontRangeParser.cs
@@ -1,4 +1,4 @@
-namespace UglyToad.Pdf.Fonts.Parser
+namespace UglyToad.Pdf.Fonts.Parser.Parts
 {
    using System;
    using System.Collections.Generic;
@@ -7,9 +7,9 @@
    using Tokenization.Scanner;
    using Tokenization.Tokens;

-    internal class BaseFontRangeParser
+    internal class BaseFontRangeParser : ICidFontPartParser<NumericToken>
    {
-        public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder)
+        public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
        {
            for (var i = 0; i < numeric.Int; i++)
            {
--- a/src/UglyToad.Pdf/Fonts/Parser/Parts/CidCharacterParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/CidCharacterParser.cs
@@ -0,0 +1,36 @@
+namespace UglyToad.Pdf.Fonts.Parser.Parts
+{
+    using System;
+    using System.Collections.Generic;
+    using Cmap;
+    using Tokenization.Scanner;
+    using Tokenization.Tokens;
+
+    /// <summary>
+    /// Reads CID character entries, each a hex source code followed by an integer destination code, and stores the mappings on the builder.
+    /// </summary>
+    internal class CidCharacterParser : ICidFontPartParser<NumericToken>
+    {
+        public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
+        {
+            var mappings = new List<CidCharacterMapping>();
+
+            for (var entry = 0; entry < numeric.Int; entry++)
+            {
+                if (!scanner.TryReadToken(out HexToken source))
+                {
+                    throw new InvalidOperationException("The first token in a line for Cid Characters should be a hex, instead it was: " + scanner.CurrentToken);
+                }
+
+                if (!scanner.TryReadToken(out NumericToken destination))
+                {
+                    throw new InvalidOperationException("The destination token in a line for Cid Character should be an integer, instead it was: " + scanner.CurrentToken);
+                }
+
+                mappings.Add(new CidCharacterMapping(source.Bytes.ToInt(source.Bytes.Count), destination.Int));
+            }
+
+            builder.CidCharacterMappings = mappings;
+        }
+    }
+}
--- a/src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontNameParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontNameParser.cs
@@ -0,0 +1,128 @@
+namespace UglyToad.Pdf.Fonts.Parser.Parts
+{
+    using System;
+    using System.Globalization;
+    using Cmap;
+    using Cos;
+    using Tokenization.Scanner;
+    using Tokenization.Tokens;
+
+    /// <summary>
+    /// Handles the named entries of a CID font definition (WMode, CMapName, CMapVersion, CMapType and CIDSystemInfo) by storing their values on the builder.
+    /// </summary>
+    internal class CidFontNameParser : ICidFontPartParser<NameToken>
+    {
+        /// <summary>
+        /// Reads the value following <paramref name="nameToken"/>; a value is only stored when a token of the expected type could be read and names not listed here are ignored.
+        /// </summary>
+        public void Parse(NameToken nameToken, ITokenScanner scanner, CharacterMapBuilder builder,
+            bool isLenientParsing)
+        {
+            switch (nameToken.Data.Name)
+            {
+                case "WMode":
+                    if (scanner.TryReadToken(out NumericToken writingMode))
+                    {
+                        builder.WMode = writingMode.Int;
+                    }
+                    break;
+                case "CMapName":
+                    if (scanner.TryReadToken(out NameToken mapName))
+                    {
+                        builder.Name = mapName.Data.Name;
+                    }
+                    break;
+                case "CMapVersion":
+                    // The version is accepted either as a number or as a string.
+                    if (scanner.MoveNext())
+                    {
+                        switch (scanner.CurrentToken)
+                        {
+                            case NumericToken number:
+                                builder.Version = number.Data.ToString(NumberFormatInfo.InvariantInfo);
+                                break;
+                            case StringToken text:
+                                builder.Version = text.Data;
+                                break;
+                        }
+                    }
+                    break;
+                case "CMapType":
+                    if (scanner.TryReadToken(out NumericToken mapType))
+                    {
+                        builder.Type = mapType.Int;
+                    }
+                    break;
+                // These keys are only supported as entries of the CIDSystemInfo dictionary.
+                case "Registry":
+                    throw new NotImplementedException("Registry should be in a dictionary");
+                case "Ordering":
+                    throw new NotImplementedException("Ordering should be in a dictionary");
+                case "Supplement":
+                    throw new NotImplementedException("Supplement should be in a dictionary");
+                case "CIDSystemInfo":
+                    if (scanner.TryReadToken(out DictionaryToken dictionary))
+                    {
+                        builder.CharacterIdentifierSystemInfo = GetCharacterIdentifier(dictionary, isLenientParsing);
+                    }
+                    break;
+            }
+        }
+
+        /// <summary>
+        /// Builds the system info from the registry, ordering and supplement entries of the dictionary.
+        /// </summary>
+        /// <param name="dictionary">The CIDSystemInfo dictionary.</param>
+        /// <param name="isLenientParsing">When true a missing or wrongly typed entry is replaced by a default rather than causing an exception.</param>
+        /// <returns>The system info built from the three entries, or from their defaults.</returns>
+        /// <exception cref="InvalidOperationException">An entry is missing or has the wrong type and parsing is not lenient.</exception>
+        private static CharacterIdentifierSystemInfo GetCharacterIdentifier(DictionaryToken dictionary, bool isLenientParsing)
+        {
+            // Registry must be a string, the lenient default is "Adobe".
+            var registryString = dictionary.TryGetByName(CosName.REGISTRY, out var registry) ? registry as StringToken : null;
+            if (registryString == null)
+            {
+                if (!isLenientParsing)
+                {
+                    throw new InvalidOperationException(GetErrorMessage("registry", dictionary));
+                }
+
+                registryString = new StringToken("Adobe");
+            }
+
+            // Ordering must be a string, the lenient default is empty.
+            var orderingString = dictionary.TryGetByName(CosName.ORDERING, out var ordering) ? ordering as StringToken : null;
+            if (orderingString == null)
+            {
+                if (!isLenientParsing)
+                {
+                    throw new InvalidOperationException(GetErrorMessage("ordering", dictionary));
+                }
+
+                orderingString = new StringToken("");
+            }
+
+            // Supplement must be numeric, the lenient default is zero.
+            var supplementNumeric = dictionary.TryGetByName(CosName.SUPPLEMENT, out var supplement) ? supplement as NumericToken : null;
+            if (supplementNumeric == null)
+            {
+                if (!isLenientParsing)
+                {
+                    throw new InvalidOperationException(GetErrorMessage("supplement", dictionary));
+                }
+
+                supplementNumeric = new NumericToken(0);
+            }
+
+            return new CharacterIdentifierSystemInfo(registryString.Data, orderingString.Data, supplementNumeric.Int);
+        }
+
+        /// <summary>
+        /// Creates the message used when an entry of the CIDSystemInfo dictionary cannot be used.
+        /// </summary>
+        private static string GetErrorMessage(string missingKey, DictionaryToken dictionary)
+        {
+            return $"No {missingKey} found in the CIDSystemInfo dictionary: " + dictionary;
+        }
+    }
+}
--- a/src/UglyToad.Pdf/Fonts/Parser/Parts/CidRangeParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/CidRangeParser.cs
@@ -0,0 +1,46 @@
+namespace UglyToad.Pdf.Fonts.Parser.Parts
+{
+    using System;
+    using System.Collections.Generic;
+    using Cmap;
+    using Tokenization.Scanner;
+    using Tokenization.Tokens;
+
+    /// <summary>
+    /// Reads CID range entries, each a start hex code, an end hex code and an integer mapped code, and stores the ranges on the builder.
+    /// </summary>
+    internal class CidRangeParser : ICidFontPartParser<NumericToken>
+    {
+        public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
+        {
+            var ranges = new List<CidRange>();
+
+            for (var i = 0; i < numeric.Int; i++)
+            {
+                if (!scanner.TryReadToken(out HexToken startHexToken))
+                {
+                    throw new InvalidOperationException("The first token in a line for Cid Range should be a hex, instead it was: " + scanner.CurrentToken);
+                }
+
+                if (!scanner.TryReadToken(out HexToken endHexToken))
+                {
+                    throw new InvalidOperationException("The second token in a line for Cid Range should be a hex, instead it was: " + scanner.CurrentToken);
+                }
+
+                if (!scanner.TryReadToken(out NumericToken mappedCode))
+                {
+                    throw new InvalidOperationException("The mapped code token in a line for Cid Range should be an integer, instead it was: " + scanner.CurrentToken);
+                }
+
+                var start = HexToken.ConvertHexBytesToInt(startHexToken);
+                var end = HexToken.ConvertHexBytesToInt(endHexToken);
+
+                // NOTE(review): the (char) casts narrow each code to 16 bits, confirm codes longer than 2 bytes cannot occur here.
+                var range = new CidRange((char)start, (char)end, mappedCode.Int);
+                ranges.Add(range);
+            }
+
+            builder.CidRanges = ranges;
+        }
+    }
+}
--- a/src/UglyToad.Pdf/Fonts/Parser/Parts/CodespaceRangeParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/CodespaceRangeParser.cs
@@ -0,0 +1,42 @@
+namespace UglyToad.Pdf.Fonts.Parser.Parts
+{
+    using System;
+    using System.Collections.Generic;
+    using Cmap;
+    using Tokenization.Scanner;
+    using Tokenization.Tokens;
+
+    internal class CodespaceRangeParser : ICidFontPartParser<NumericToken>
+    {
+        public void Parse(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing)
+        {
+            /*
+             * Reads each codespace range entry, a pair of hex tokens giving the start and end of the range, for example:
+             3 begincodespacerange
+                <00>    <80>
+                <8140>  <9ffc>
+                <a0>    <de>
+             endcodespacerange
+             */
+            // The count comes from the file so it is not used as a capacity: the List constructor rejects negative values and a huge value would allocate up front.
+            var ranges = new List<CodespaceRange>();
+
+            for (var i = 0; i < numeric.Int; i++)
+            {
+                if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken start))
+                {
+                    throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
+                }
+
+                if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken end))
+                {
+                    throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
+                }
+
+                ranges.Add(new CodespaceRange(start.Bytes, end.Bytes));
+            }
+
+            builder.CodespaceRanges = ranges;
+        }
+    }
+}
--- a/src/UglyToad.Pdf/Fonts/Parser/Parts/ICidFontPartParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/ICidFontPartParser.cs
@@ -0,0 +1,17 @@
+namespace UglyToad.Pdf.Fonts.Parser.Parts
+{
+    using Cmap;
+    using Tokenization.Scanner;
+
+    /// <summary>Provides parsing for a certain operator type in a CID font definition.</summary>
+    /// <typeparam name="TToken">The type of the token preceding the operation we wish to parse.</typeparam>
+    internal interface ICidFontPartParser<in TToken>
+    {
+        /// <summary>Parse the definition for this part of the CID font and write the results to the <see cref="CharacterMapBuilder"/>.</summary>
+        /// <param name="previous">The token preceding the operation, implementations taking a numeric token use it as the number of entries to read.</param>
+        /// <param name="tokenScanner">The scanner from which the tokens belonging to this part are read.</param>
+        /// <param name="builder">The builder which receives the parsed values.</param>
+        /// <param name="isLenientParsing">Whether lenient parsing was requested, not every implementation uses it.</param>
+        void Parse(TToken previous, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing);
+    }
+}
--- a/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs
+++ b/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs
@@ -70,5 +70,19 @@ namespace UglyToad.Pdf.Tokenization.Tokens
            Bytes = bytes;
            Data = builder.ToString();
        }
+
+        /// <summary>Combines the bytes of the token, most significant first, into an integer; an empty token gives 0 and only the last four bytes of a longer token contribute.</summary>
+        public static int ConvertHexBytesToInt(HexToken token)
+        {
+            var bytes = token.Bytes;
+            // Fold in every byte so that tokens of any length, not just 1 or 2 bytes, are converted.
+            var value = 0;
+            for (var i = 0; i < bytes.Count; i++)
+            {
+                value = (value << 8) | bytes[i];
+            }
+
+            return value;
+        }
    }
 }