add initial port of base font range parsing

2025-11-28 17:47:12 +08:00 · 2017-11-19 15:29:05 +00:00
parent 2e5aa37c85
commit 0fd433240b
6 changed files with 202 additions and 3 deletions
--- a/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs
+++ b/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs
@@ -52,6 +52,8 @@

        public IReadOnlyList<CodespaceRange> CodespaceRanges { get; set; }

+        public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; set; }
+
        public Dictionary<int, string> BaseFontCharacterMap { get; } = new Dictionary<int, string>();

        public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, IReadOnlyList<byte> value)
--- a/src/UglyToad.Pdf/Fonts/Cmap/CidCharacterMapping.cs
+++ b/src/UglyToad.Pdf/Fonts/Cmap/CidCharacterMapping.cs
@@ -0,0 +1,14 @@
+namespace UglyToad.Pdf.Fonts.Cmap
+{
+    /// <summary>An immutable pair of a source code and the destination code it is mapped to.</summary>
+    public class CidCharacterMapping
+    {
+        public int Source { get; }
+        public int Destination { get; }
+        public CidCharacterMapping(int source, int destination)
+        {
+            this.Destination = destination;
+            this.Source = source;
+        }
+    }
+}
--- a/src/UglyToad.Pdf/Fonts/Parser/BaseFontRangeParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/BaseFontRangeParser.cs
@@ -0,0 +1,116 @@
+namespace UglyToad.Pdf.Fonts.Parser
+{
+    using System;
+    using System.Collections.Generic;
+    using System.Linq;
+    using Cmap;
+    using Tokenization.Scanner;
+    using Tokenization.Tokens;
+
+    /// <summary>Parses the entries of a beginbfrange section into a <see cref="CharacterMapBuilder"/>.</summary>
+    internal class BaseFontRangeParser
+    {
+        /// <summary>
+        /// Reads <c>numeric.Int</c> entries, each a low source code, a high source code and a destination,
+        /// and adds a mapping for every code from low to high inclusive.
+        /// </summary>
+        /// <param name="numeric">The number of entries to read.</param>
+        /// <param name="scanner">The scanner the tokens of each entry are read from.</param>
+        /// <param name="builder">Receives every mapping through <c>AddBaseFontCharacter</c>.</param>
+        /// <exception cref="InvalidOperationException">An entry does not consist of the expected tokens.</exception>
+        /// <exception cref="NotImplementedException">The destination of an entry is a number.</exception>
+        public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder)
+        {
+            for (var i = 0; i < numeric.Int; i++)
+            {
+                if (!scanner.TryReadToken(out HexToken lowSourceCode))
+                {
+                    throw new InvalidOperationException("The low source code of a base font range should be a hex, instead it was: " + scanner.CurrentToken);
+                }
+
+                if (!scanner.TryReadToken(out HexToken highSourceCode))
+                {
+                    throw new InvalidOperationException("The high source code of a base font range should be a hex, instead it was: " + scanner.CurrentToken);
+                }
+
+                if (!scanner.MoveNext())
+                {
+                    throw new InvalidOperationException("The input ended before the destination of a base font range was read.");
+                }
+
+                List<byte> destinationBytes;
+                ArrayToken destinationArray = null;
+                switch (scanner.CurrentToken)
+                {
+                    case ArrayToken arrayToken when arrayToken.Data.Count > 0:
+                        // The first code of the range maps to element zero; an empty array reaches the default branch.
+                        destinationArray = arrayToken;
+                        destinationBytes = ((HexToken)arrayToken.Data[0]).Bytes.ToList();
+                        break;
+                    case HexToken hexToken:
+                        destinationBytes = hexToken.Bytes.ToList();
+                        break;
+                    case NumericToken _:
+                        throw new NotImplementedException("From the spec it seems this possible but the meaning is unclear...");
+                    default:
+                        throw new InvalidOperationException("Unexpected destination for a base font range: " + scanner.CurrentToken);
+                }
+
+                var startCode = new List<byte>(lowSourceCode.Bytes);
+                var endCode = highSourceCode.Bytes;
+                var arrayIndex = 0;
+                bool done;
+
+                do
+                {
+                    // Decided before the increment so that the high code itself is still added.
+                    done = Compare(startCode, endCode) >= 0;
+
+                    builder.AddBaseFontCharacter(startCode, destinationBytes);
+                    Increment(startCode, startCode.Count - 1);
+
+                    if (destinationArray == null)
+                    {
+                        Increment(destinationBytes, destinationBytes.Count - 1);
+                    }
+                    else if (++arrayIndex < destinationArray.Data.Count)
+                    {
+                        destinationBytes = ((HexToken)destinationArray.Data[arrayIndex]).Bytes.ToList();
+                    }
+                } while (!done);
+            }
+        }
+
+        /// <summary>
+        /// Adds one to data[position], carrying towards index zero; index zero wraps to 0 and a negative position does nothing.
+        /// </summary>
+        private static void Increment(IList<byte> data, int position)
+        {
+            for (var index = position; index >= 0; index--)
+            {
+                // unchecked: the wrap from 255 to 0 is the carry, also when compiled with overflow checking.
+                data[index] = unchecked((byte)(data[index] + 1));
+                if (data[index] != 0)
+                {
+                    return;
+                }
+            }
+        }
+
+        /// <summary>Compares two codes byte by byte from index zero, looking only at their common length.</summary>
+        /// <returns>-1, 0 or 1 when first is lower than, equal to or higher than second.</returns>
+        private static int Compare(IReadOnlyList<byte> first, IReadOnlyList<byte> second)
+        {
+            var length = Math.Min(first.Count, second.Count);
+            for (var i = 0; i < length; i++)
+            {
+                if (first[i] != second[i])
+                {
+                    return first[i] < second[i] ? -1 : 1;
+                }
+            }
+
+            return 0;
+        }
+    }
+}
--- a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs
@@ -56,10 +56,33 @@
                            }
                            break;
                        case "beginbfrange":
+                            {
+                                if (previousToken is NumericToken numeric)
+                                {
+                                    var parser = new BaseFontRangeParser();
+                                    parser.Parse(numeric, scanner, builder);
+                                }
+                                else
+                                {
+                                    throw new InvalidOperationException("Unexpected token preceding start of base font character ranges: " + previousToken);
+                                }
+                            }
                            break;
                        case "begincidchar":
-                            break;
-                        case "begingcidrange":
+                            {
+                                if (previousToken is NumericToken numeric)
+                                {
+                                    var characters = ParseCidCharacters(numeric, scanner);
+
+                                    builder.CidCharacterMappings = characters;
+                                }
+                                else
+                                {
+                                    throw new InvalidOperationException("Unexpected token preceding start of Cid character mapping: " + previousToken);
+                                }
+                                break;
+                            }
+                        case "begincidrange":
                            break;
                    }
                }
@@ -134,7 +157,32 @@
            }
        }

-        private static void ParseName(NameToken nameToken, CoreTokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
+        /// <summary>
+        /// Reads <c>numeric.Int</c> pairs of a hex source code and an integer destination after a begincidchar operator.
+        /// </summary>
+        private static IReadOnlyList<CidCharacterMapping> ParseCidCharacters(NumericToken numeric, ITokenScanner scanner)
+        {
+            var mappings = new List<CidCharacterMapping>();
+
+            while (mappings.Count < numeric.Int)
+            {
+                if (!scanner.TryReadToken(out HexToken code))
+                {
+                    throw new InvalidOperationException("The first token in a line for Cid Characters should be a hex, instead it was: " + scanner.CurrentToken);
+                }
+
+                if (!scanner.TryReadToken(out NumericToken cid))
+                {
+                    throw new InvalidOperationException("The destination token in a line for Cid Character should be an integer, instead it was: " + scanner.CurrentToken);
+                }
+
+                mappings.Add(new CidCharacterMapping(code.Bytes.ToInt(code.Bytes.Count), cid.Int));
+            }
+
+            return mappings;
+        }
+
+        private static void ParseName(NameToken nameToken, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
        {
            switch (nameToken.Data.Name)
            {
--- a/src/UglyToad.Pdf/Tokenization/Scanner/CoreTokenScanner.cs
+++ b/src/UglyToad.Pdf/Tokenization/Scanner/CoreTokenScanner.cs
@@ -31,6 +31,23 @@
        private readonly List<byte> currentBuffer = new List<byte>();
        
        public IToken CurrentToken { get; private set; }
+        /// <summary>
+        /// Moves to the next token and hands it back when it is a <typeparamref name="T"/>; the scanner advances either way.
+        /// </summary>
+        /// <typeparam name="T">The token type the caller expects next.</typeparam>
+        /// <param name="token">The token that was read when it is a <typeparamref name="T"/>, otherwise null.</param>
+        /// <returns>False when no further token could be read or the token read is of another type.</returns>
+        public bool TryReadToken<T>(out T token) where T : class, IToken
+        {
+            if (MoveNext() && CurrentToken is T typed)
+            {
+                token = typed;
+                return true;
+            }
+
+            token = null;
+            return false;
+        }

        private bool hasBytePreRead;

--- a/src/UglyToad.Pdf/Tokenization/Scanner/ITokenScanner.cs
+++ b/src/UglyToad.Pdf/Tokenization/Scanner/ITokenScanner.cs
@@ -7,5 +7,7 @@
        bool MoveNext();

        IToken CurrentToken { get; }
+
+        bool TryReadToken<T>(out T token) where T : class, IToken;
    }
 }