port type 1 lexer from pdf box and add test data

2025-10-15 19:54:52 +08:00 · 2018-10-23 20:02:20 +01:00
parent c8c32eab24
commit df0b60c2e1
9 changed files with 528 additions and 6 deletions
--- a/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMBX10.pfa
+++ b/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMBX10.pfa
--- a/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMBX12.pfa
+++ b/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMBX12.pfa
--- a/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMCSC10.pfa
+++ b/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMCSC10.pfa
--- a/src/UglyToad.PdfPig.Tests/Fonts/Type1/Type1FontParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Fonts/Type1/Type1FontParserTests.cs
@@ -3,10 +3,8 @@
    using System;
    using System.IO;
    using System.Linq;
-    using System.Text;
    using PdfPig.Fonts.Type1.Parser;
    using PdfPig.IO;
-    using PdfPig.Util;
    using Xunit;

    public class Type1FontParserTests
@@ -30,6 +28,14 @@
            parser.Parse(new ByteArrayInputBytes(bytes), 0, 0);
        }

+        // Smoke test: parsing CMBX10.pfa must complete without throwing; nothing else is asserted.
+        [Fact]
+        public void CanReadCharStrings()
+        {
+            var input = new ByteArrayInputBytes(GetFileBytes("CMBX10.pfa"));
+
+            parser.Parse(input, 0, 0);
+        }
+
        [Fact]
        public void CanReadAsciiPart()
        {
--- a/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs
@@ -28,6 +28,17 @@
            }
        }

+        // The first letter on page 1 of the test document must report a non-zero glyph rectangle height.
+        [Fact]
+        public void LettersHaveHeight()
+        {
+            using (var document = PdfDocument.Open(GetFilename()))
+            {
+                var firstLetter = document.GetPage(1).Letters[0];
+
+                Assert.NotEqual(0, firstLetter.GlyphRectangle.Height);
+            }
+        }
+
        [Fact]
        public void HasCorrectNumberOfPages()
        {
--- a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1EncryptedPortionParser.cs
+++ b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1EncryptedPortionParser.cs
@@ -2,6 +2,7 @@
 {
    using System.Collections.Generic;
    using System.Linq;
+    using IO;
    using PdfPig.Parser.Parts;
    using Tokenization.Tokens;
    using Util;
@@ -11,7 +12,7 @@
        private const ushort EexecEncryptionKey = 55665;
        private const int EexecRandomBytes = 4;

-        public void Parse(IReadOnlyList<byte> bytes)
+        public IReadOnlyList<byte> Parse(IReadOnlyList<byte> bytes)
        {
            if (!IsBinary(bytes))
            {
@@ -20,7 +21,23 @@

            var decrypted = Decrypt(bytes, EexecEncryptionKey, EexecRandomBytes);

+            // line 461 of type1parser.java
            var str = OtherEncodings.BytesAsLatin1String(decrypted.ToArray());
+
+            var tokenizer = new Type1Tokenizer(new ByteArrayInputBytes(decrypted));
+            while (tokenizer.CurrentToken != null)
+            {
+                tokenizer.GetNext();
+            }
+
+            /*
+             * After the 4 random bytes follow the /Private dictionary and the /CharStrings dictionary.
+             * The first defines a number of technical terms involving character construction, and contains also an array of subroutines used in character paths.
+             * The second contains the character descriptions themselves.
+             * Both the subroutines and the character descriptions are yet again encrypted in a fashion similar to the entire binary segment, but now with an initial value of R = 4330 instead of 55665.
+             */
+
+            return decrypted;
        }

        /// <summary>
@@ -89,6 +106,16 @@

        private static IReadOnlyList<byte> Decrypt(IReadOnlyList<byte> bytes, int key, int randomBytes)
        {
+            /*
+             * We start with three constants R = 55665, c1 = 52845 and c2 = 22719.
+             * Then we apply to the entire binary array c[i] of length n the decryption procedure:
+             * for i in [0, n):
+             *    p[i] = c[i]^(R >> 8)
+             *    R = ((c[i] + R)*c1 + c2) & ((1 << 16) - 1)
+             *
+             * Here ^ means bitwise xor, i.e. addition of the corresponding bits modulo 2.
+             * The encryption key R changes as the procedure is carried out.
+             */
            if (randomBytes == -1)
            {
                return bytes;
--- a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs
+++ b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs
@@ -5,6 +5,7 @@
    using Exceptions;
    using Geometry;
    using IO;
+    using PdfPig.Parser.Parts;
    using Tokenization;
    using Tokenization.Scanner;
    using Tokenization.Tokens;
@@ -14,7 +15,8 @@
        private const string ClearToMark = "cleartomark";

        private const int PfbFileIndicator = 0x80;
-        
+        private const int EexecKey = 55665;
+
        private readonly Type1EncryptedPortionParser encryptedPortionParser;

        public Type1FontParser(Type1EncryptedPortionParser encryptedPortionParser)
@@ -146,7 +148,7 @@
            var matrix = GetFontMatrix(dictionaries);
            var boundingBox = GetBoundingBox(dictionaries);

-            encryptedPortionParser.Parse(eexecPortion);
+            var binaryPortion = encryptedPortionParser.Parse(eexecPortion);

            return new Type1Font(name, encoding, matrix, boundingBox ?? new PdfRectangle());
        }
@@ -349,7 +351,6 @@

            return new ArrayToken(result);
        }
-
        private static Dictionary<int, string> GetEncoding(IReadOnlyList<DictionaryToken> dictionaries)
        {
            var result = new Dictionary<int, string>();
--- a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1Token.cs
+++ b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1Token.cs
@@ -0,0 +1,90 @@
+namespace UglyToad.PdfPig.Fonts.Type1.Parser
+{
+    using System;
+    using System.Collections.Generic;
+
+    /// <summary>
+    /// A token carrying raw bytes rather than text. Only <see cref="Type1Token.TokenType.Charstring"/> tokens may carry data.
+    /// </summary>
+    internal class Type1DataToken : Type1Token
+    {
+        /// <summary>
+        /// The raw bytes of the token, never null.
+        /// </summary>
+        public IReadOnlyList<byte> Data { get; }
+
+        /// <summary>
+        /// Create a new data token.
+        /// </summary>
+        /// <param name="type">Must be <see cref="Type1Token.TokenType.Charstring"/>.</param>
+        /// <param name="data">The bytes of the token.</param>
+        /// <exception cref="ArgumentException"><paramref name="type"/> is not Charstring.</exception>
+        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
+        public Type1DataToken(TokenType type, IReadOnlyList<byte> data) : base(type)
+        {
+            if (type != TokenType.Charstring)
+            {
+                throw new ArgumentException($"Invalid token type for type 1 token receiving bytes, expected Charstring, got {type}.");
+            }
+
+            // Reject null here so that ToString (which reads Data.Count) can never fail later.
+            Data = data ?? throw new ArgumentNullException(nameof(data));
+        }
+
+        public override string ToString()
+        {
+            return $"Token[type = {Type}, data = {Data.Count} bytes]";
+        }
+    }
+
+    /// <summary>
+    /// A token whose value is held as text; numeric and boolean values are converted on demand.
+    /// </summary>
+    internal class Type1TextToken : Type1Token
+    {
+        /// <summary>
+        /// The text of the token.
+        /// </summary>
+        public string Text { get; }
+
+        /// <summary>
+        /// Create a token from a single character.
+        /// </summary>
+        public Type1TextToken(char c, TokenType type) : this(c.ToString(), type) { }
+
+        /// <summary>
+        /// Create a token from a string.
+        /// </summary>
+        public Type1TextToken(string text, TokenType type) : base(type)
+        {
+            Text = text;
+        }
+
+        /// <summary>
+        /// Interpret the text as an integer. Text which is not a plain integer is parsed
+        /// as a float and truncated.
+        /// </summary>
+        public int AsInt()
+        {
+            // Parse integers directly: a round trip through float loses precision above 2^24.
+            if (int.TryParse(Text, System.Globalization.NumberStyles.Integer,
+                System.Globalization.CultureInfo.InvariantCulture, out var value))
+            {
+                return value;
+            }
+
+            return (int)AsFloat();
+        }
+
+        /// <summary>
+        /// Interpret the text as a floating point number.
+        /// </summary>
+        /// <exception cref="FormatException">The text is not a valid number.</exception>
+        public float AsFloat()
+        {
+            // Font data always uses '.' as the decimal separator, so never use the current culture.
+            return float.Parse(Text, System.Globalization.CultureInfo.InvariantCulture);
+        }
+
+        /// <summary>
+        /// True if the text is "true" (ignoring case), false for any other text.
+        /// </summary>
+        public bool AsBool()
+        {
+            return string.Equals(Text, "true", StringComparison.OrdinalIgnoreCase);
+        }
+
+        public override string ToString()
+        {
+            return $"Token[type={Type}, text={Text}]";
+        }
+    }
+
+    /// <summary>
+    /// Base class for tokens read from a Type 1 font; it carries only the kind of the token.
+    /// </summary>
+    internal class Type1Token
+    {
+        /// <summary>
+        /// The kind of token this instance represents.
+        /// </summary>
+        public TokenType Type { get; }
+
+        /// <summary>
+        /// Create a token of the given <paramref name="type"/>.
+        /// </summary>
+        public Type1Token(TokenType type)
+        {
+            Type = type;
+        }
+
+        /// <summary>
+        /// The kinds of token which can be produced.
+        /// </summary>
+        public enum TokenType
+        {
+            None,
+            String,
+            Name,
+            Literal,
+            Real,
+            Integer,
+            /// <summary>
+            /// An array must begin with either '[' or '{'.
+            /// NOTE(review): Type1Tokenizer emits StartProc, not StartArray, for '{' - confirm which is intended.
+            /// </summary>
+            StartArray,
+            /// <summary>
+            /// An array must end with either ']' or '}'.
+            /// NOTE(review): Type1Tokenizer emits EndProc, not EndArray, for '}' - confirm which is intended.
+            /// </summary>
+            EndArray,
+            StartProc,
+            EndProc,
+            StartDict,
+            EndDict,
+            Charstring
+        }
+    }
+}
--- a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1Tokenizer.cs
+++ b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1Tokenizer.cs
@@ -0,0 +1,387 @@
+namespace UglyToad.PdfPig.Fonts.Type1.Parser
+{
+    using System;
+    using System.Text;
+    using IO;
+    using PdfPig.Parser.Parts;
+
+    internal class Type1Tokenizer
+    {
+        private readonly StringBuilder commentBuffer = new StringBuilder();
+        private readonly StringBuilder literalBuffer = new StringBuilder();
+        private readonly StringBuilder stringBuffer = new StringBuilder();
+
+        private readonly IInputBytes bytes;
+
+        private int openParens;
+        private Type1Token previousToken;
+
+        public Type1Token CurrentToken { get; private set; }
+
+        /// <summary>
+        /// Create a tokenizer over the given input and immediately read the first token into <see cref="CurrentToken"/>.
+        /// </summary>
+        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
+        public Type1Tokenizer(IInputBytes bytes)
+        {
+            // Fail with a clear error here instead of a NullReferenceException while reading the first token.
+            this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
+            CurrentToken = ReadNextToken();
+        }
+
+        /// <summary>
+        /// Reads the next token, stores it in <see cref="CurrentToken"/> and returns it.
+        /// The result is null once no more tokens can be read.
+        /// </summary>
+        public Type1Token GetNext()
+        {
+            var next = ReadNextToken();
+            CurrentToken = next;
+            return next;
+        }
+
+        /// <summary>
+        /// Reads the next token from the input, skipping comments, whitespace and null bytes.
+        /// </summary>
+        /// <returns>The next token, or null when the input is exhausted (or a string is left unterminated).</returns>
+        /// <exception cref="InvalidOperationException">
+        /// A ')' appears outside a string, or RD / -| is not preceded by an integer token giving the charstring length.
+        /// </exception>
+        private Type1Token ReadNextToken()
+        {
+            // Remember the token before this one: RD / -| needs the preceding integer as the data length.
+            previousToken = CurrentToken;
+
+            while (bytes.MoveNext())
+            {
+                var b = bytes.CurrentByte;
+                var c = (char)b;
+
+                switch (c)
+                {
+                    case '%':
+                        // Comments carry no information for the parser, consume and discard.
+                        ReadComment();
+                        break;
+                    case '(':
+                        return ReadString();
+                    case ')':
+                        throw new InvalidOperationException("Encountered an end of string ')' outside of string.");
+                    case '[':
+                        return new Type1TextToken(c, Type1Token.TokenType.StartArray);
+                    case ']':
+                        return new Type1TextToken(c, Type1Token.TokenType.EndArray);
+                    case '{':
+                        return new Type1TextToken(c, Type1Token.TokenType.StartProc);
+                    case '}':
+                        return new Type1TextToken(c, Type1Token.TokenType.EndProc);
+                    case '/':
+                        {
+                            var name = ReadLiteral();
+                            return new Type1TextToken(name, Type1Token.TokenType.Literal);
+                        }
+                    case '<':
+                        {
+                            var following = bytes.Peek();
+                            if (following == '<')
+                            {
+                                bytes.MoveNext();
+                                return new Type1TextToken("<<", Type1Token.TokenType.StartDict);
+                            }
+
+                            return new Type1TextToken(c, Type1Token.TokenType.Name);
+                        }
+                    case '>':
+                        {
+                            var following = bytes.Peek();
+                            if (following == '>')
+                            {
+                                bytes.MoveNext();
+                                return new Type1TextToken(">>", Type1Token.TokenType.EndDict);
+                            }
+
+                            return new Type1TextToken(c, Type1Token.TokenType.Name);
+                        }
+                    default:
+                        {
+                            // Whitespace and null bytes only separate tokens, move on to the next byte.
+                            if (ReadHelper.IsWhitespace(b) || b == 0)
+                            {
+                                break;
+                            }
+
+                            if (TryReadNumber(out var number))
+                            {
+                                return number;
+                            }
+
+                            var name = ReadLiteral(c);
+                            if (name == null)
+                            {
+                                throw new InvalidOperationException($"The binary portion of the type 1 font was invalid at position {bytes.CurrentOffset}.");
+                            }
+
+                            if (name.Equals("RD") || name.Equals("-|"))
+                            {
+                                // The pattern match also covers RD / -| appearing as the very first token (no previous token).
+                                if (previousToken is Type1TextToken lengthToken
+                                    && lengthToken.Type == Type1Token.TokenType.Integer)
+                                {
+                                    return ReadCharString(lengthToken.AsInt());
+                                }
+
+                                throw new InvalidOperationException($"Expected integer token before {name} at offset {bytes.CurrentOffset}.");
+                            }
+
+                            return new Type1TextToken(name, Type1Token.TokenType.Name);
+                        }
+                }
+            }
+
+            return null;
+        }
+
+        /// <summary>
+        /// Reads a PostScript string; the opening '(' has already been consumed by the caller.
+        /// Balanced nested parentheses are kept as part of the string. Line breaks and the escapes
+        /// \n and \r are all stored as '\n'. Unrecognised escapes are dropped.
+        /// </summary>
+        /// <returns>The string token, or null if the input ends before the closing ')'.</returns>
+        private Type1TextToken ReadString()
+        {
+            stringBuffer.Clear();
+            openParens = 0;
+
+            while (bytes.MoveNext())
+            {
+                var c = (char)bytes.CurrentByte;
+
+                switch (c)
+                {
+                    case '(':
+                        openParens++;
+                        stringBuffer.Append('(');
+                        break;
+                    case ')':
+                        if (openParens == 0)
+                        {
+                            // The unbalanced ')' terminates the string.
+                            return new Type1TextToken(stringBuffer.ToString(), Type1Token.TokenType.String);
+                        }
+                        stringBuffer.Append(')');
+                        openParens--;
+                        break;
+                    case '\\':
+                        if (!bytes.MoveNext())
+                        {
+                            // Input ended directly after the backslash: the string is unterminated.
+                            return null;
+                        }
+
+                        var escaped = (char)bytes.CurrentByte;
+                        switch (escaped)
+                        {
+                            case 'n':
+                            case 'r': stringBuffer.Append("\n"); break;
+                            case 't': stringBuffer.Append('\t'); break;
+                            case 'b': stringBuffer.Append('\b'); break;
+                            case 'f': stringBuffer.Append('\f'); break;
+                            case '\\': stringBuffer.Append('\\'); break;
+                            case '(': stringBuffer.Append('('); break;
+                            case ')': stringBuffer.Append(')'); break;
+                            default:
+                                if (escaped >= '0' && escaped <= '7')
+                                {
+                                    // Octal escape: one to three octal digits. Only consume following
+                                    // bytes which really are octal digits so that short forms such as
+                                    // "\0)" do not swallow the end of the string.
+                                    var code = escaped - '0';
+                                    for (var i = 0; i < 2; i++)
+                                    {
+                                        var next = bytes.Peek();
+                                        if (!next.HasValue || next.Value < '0' || next.Value > '7')
+                                        {
+                                            break;
+                                        }
+
+                                        bytes.MoveNext();
+                                        code = (code * 8) + (next.Value - '0');
+                                    }
+
+                                    stringBuffer.Append((char)code);
+                                }
+                                break;
+                        }
+                        break;
+                    case '\r':
+                    case '\n':
+                        stringBuffer.Append("\n");
+                        break;
+                    default:
+                        stringBuffer.Append(c);
+                        break;
+                }
+            }
+
+            return null;
+        }
+
+        /// <summary>
+        /// Tries to read a number starting at the current byte (the byte most recently consumed by the caller).
+        /// Accepted forms: an optionally signed integer, a real with digits after the '.', optionally followed
+        /// by 'E', an optional '-' and an exponent, or a radix number of the form base#digits.
+        /// </summary>
+        /// <param name="numberToken">The Integer or Real token on success, otherwise null.</param>
+        /// <returns>
+        /// True if a number was read; the input is then positioned on the last character of the number.
+        /// False otherwise; the input is then restored to where it was on entry.
+        /// </returns>
+        private bool TryReadNumber(out Type1TextToken numberToken)
+        {
+            numberToken = null;
+
+            var startPosition = bytes.CurrentOffset;
+            var reachedEnd = false;
+
+            // Returns the next character, or '\0' (which ends every number form) once the input is exhausted.
+            char GetNext()
+            {
+                if (bytes.MoveNext())
+                {
+                    return (char)bytes.CurrentByte;
+                }
+
+                reachedEnd = true;
+                return '\0';
+            }
+
+            // Steps back over the character which terminated the number so the caller reads it next.
+            // Nothing was consumed if the number was terminated by the end of the input.
+            void Unread()
+            {
+                if (!reachedEnd)
+                {
+                    bytes.Seek(bytes.CurrentOffset - 1);
+                }
+            }
+
+            // Restores the entry position and reports failure.
+            bool Fail()
+            {
+                bytes.Seek(startPosition);
+                return false;
+            }
+
+            var sb = new StringBuilder();
+            StringBuilder radix = null;
+            var hasDigit = false;
+
+            // The caller has already consumed the first character of the candidate number, start from it.
+            var c = (char)bytes.CurrentByte;
+
+            // optional + or -
+            if (c == '+' || c == '-')
+            {
+                sb.Append(c);
+                c = GetNext();
+            }
+
+            // optional digits
+            while (char.IsDigit(c))
+            {
+                sb.Append(c);
+                hasDigit = true;
+                c = GetNext();
+            }
+
+            if (c == '.')
+            {
+                sb.Append(c);
+                c = GetNext();
+            }
+            else if (c == '#')
+            {
+                // PostScript radix number takes the form base#number
+                radix = sb;
+                sb = new StringBuilder();
+                c = GetNext();
+            }
+            else if (!hasDigit)
+            {
+                return Fail();
+            }
+            else
+            {
+                // integer
+                Unread();
+
+                numberToken = new Type1TextToken(sb.ToString(), Type1Token.TokenType.Integer);
+                return true;
+            }
+
+            // required digit
+            if (!char.IsDigit(c))
+            {
+                return Fail();
+            }
+
+            // remaining digits
+            while (char.IsDigit(c))
+            {
+                sb.Append(c);
+                c = GetNext();
+            }
+
+            // optional E
+            if (c == 'E')
+            {
+                sb.Append(c);
+                c = GetNext();
+
+                // optional minus
+                if (c == '-')
+                {
+                    sb.Append(c);
+                    c = GetNext();
+                }
+
+                // required digit
+                if (!char.IsDigit(c))
+                {
+                    return Fail();
+                }
+
+                // remaining digits
+                while (char.IsDigit(c))
+                {
+                    sb.Append(c);
+                    c = GetNext();
+                }
+            }
+
+            if (radix == null)
+            {
+                Unread();
+                numberToken = new Type1TextToken(sb.ToString(), Type1Token.TokenType.Real);
+                return true;
+            }
+
+            // Convert.ToInt32(string, int) only supports bases 2, 8, 10 and 16. Anything else, or digits
+            // invalid for the base, is not a number this tokenizer can represent.
+            if (!int.TryParse(radix.ToString(), System.Globalization.NumberStyles.Integer,
+                    System.Globalization.CultureInfo.InvariantCulture, out var numberBase)
+                || (numberBase != 2 && numberBase != 8 && numberBase != 10 && numberBase != 16))
+            {
+                return Fail();
+            }
+
+            int number;
+            try
+            {
+                number = Convert.ToInt32(sb.ToString(), numberBase);
+            }
+            catch (FormatException)
+            {
+                return Fail();
+            }
+            catch (OverflowException)
+            {
+                return Fail();
+            }
+
+            Unread();
+            numberToken = new Type1TextToken(number.ToString(System.Globalization.CultureInfo.InvariantCulture), Type1Token.TokenType.Integer);
+            return true;
+        }
+
+        /// <summary>
+        /// Collects characters up to, but not including, the next whitespace or delimiter character
+        /// or the end of the input.
+        /// </summary>
+        /// <param name="previousCharacter">An already consumed character to place at the start of the result.</param>
+        /// <returns>The collected text, or null if it is empty.</returns>
+        private string ReadLiteral(char? previousCharacter = null)
+        {
+            const string delimiters = "()<>[]{}/%";
+
+            literalBuffer.Clear();
+            if (previousCharacter.HasValue)
+            {
+                literalBuffer.Append(previousCharacter);
+            }
+
+            while (true)
+            {
+                var upcoming = bytes.Peek();
+                if (!upcoming.HasValue)
+                {
+                    break;
+                }
+
+                var character = (char)upcoming.Value;
+                if (char.IsWhiteSpace(character) || delimiters.IndexOf(character) >= 0)
+                {
+                    break;
+                }
+
+                literalBuffer.Append(character);
+
+                // Consume the character which was just appended.
+                if (!bytes.MoveNext())
+                {
+                    break;
+                }
+            }
+
+            return literalBuffer.Length == 0 ? null : literalBuffer.ToString();
+        }
+
+        /// <summary>
+        /// Reads a comment; the leading '%' has already been consumed by the caller.
+        /// The comment runs up to the first end of line character, which is consumed but not returned.
+        /// </summary>
+        /// <returns>The text of the comment without the end of line character.</returns>
+        private string ReadComment()
+        {
+            commentBuffer.Clear();
+
+            while (bytes.MoveNext())
+            {
+                var c = (char)bytes.CurrentByte;
+                if (ReadHelper.IsEndOfLine(c))
+                {
+                    // A comment ends at the end of its line; continuing here would swallow the rest of the input.
+                    break;
+                }
+
+                commentBuffer.Append(c);
+            }
+
+            return commentBuffer.ToString();
+        }
+
+        /// <summary>
+        /// Reads the binary data which follows RD / -|: a single separator byte followed by
+        /// <paramref name="length"/> bytes of data.
+        /// </summary>
+        /// <param name="length">The number of data bytes to read.</param>
+        /// <returns>A Charstring token containing the bytes read.</returns>
+        /// <exception cref="InvalidOperationException">
+        /// <paramref name="length"/> is negative or the input ends before all bytes were read.
+        /// </exception>
+        private Type1DataToken ReadCharString(int length)
+        {
+            if (length < 0)
+            {
+                throw new InvalidOperationException($"Invalid charstring length {length} at offset {bytes.CurrentOffset}.");
+            }
+
+            // Skip preceding space.
+            bytes.MoveNext();
+
+            var data = new byte[length];
+            for (var i = 0; i < length; i++)
+            {
+                // Without this check a truncated font would silently produce a corrupt charstring.
+                if (!bytes.MoveNext())
+                {
+                    throw new InvalidOperationException($"The input ended after {i} of the {length} bytes expected for a charstring.");
+                }
+
+                data[i] = bytes.CurrentByte;
+            }
+
+            return new Type1DataToken(Type1Token.TokenType.Charstring, data);
+        }
+    }
+}