add support for parsing pfb files in type 1 fonts and an extra integration test

2025-10-15 03:34:52 +08:00 · 2018-04-12 22:34:38 +01:00
parent 7af2b1bcb9
commit e063ac45fe
8 changed files with 170 additions and 25 deletions
--- a/src/UglyToad.PdfPig.Tests/Fonts/Type1/Raleway-Black.pfb
+++ b/src/UglyToad.PdfPig.Tests/Fonts/Type1/Raleway-Black.pfb
--- a/src/UglyToad.PdfPig.Tests/Fonts/Type1/Type1FontParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Fonts/Type1/Type1FontParserTests.cs
@@ -18,16 +18,16 @@
        {
            var bytes = GetFileBytes("AdobeUtopia.pfa");
-            parser.Parse(new ByteArrayInputBytes(bytes));
+            parser.Parse(new ByteArrayInputBytes(bytes),0, 0);
        }
        [Fact]
-        public void CanReadBinaryEncryptedPortion()
+        public void CanReadBinaryEncryptedPortionOfFullPfb()
        {
            // Parses a complete PFB file, including its binary (encrypted) portion.
-            //var bytes = GetFileBytes("cmbx8.pfb");
+            var bytes = GetFileBytes("Raleway-Black.pfb");
-            //parser.Parse(new ByteArrayInputBytes(bytes));
+            parser.Parse(new ByteArrayInputBytes(bytes), 0, 0);
        }
        [Fact]
@@ -35,7 +35,7 @@
        {
            var bytes = StringBytesTestConverter.Convert(Cmbx12, false);
-            parser.Parse(bytes.Bytes);
+            parser.Parse(bytes.Bytes, 0, 0);
        }
        private const string Cmbx12 = @"%!PS-AdobeFont-1.1: CMBX12 1.0
--- a/src/UglyToad.PdfPig.Tests/Integration/Documents/Pig
+++ b/src/UglyToad.PdfPig.Tests/Integration/Documents/Pig
--- a/src/UglyToad.PdfPig.Tests/Integration/PigReproductionPowerpointTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/PigReproductionPowerpointTests.cs
@@ -0,0 +1,36 @@
 namespace UglyToad.PdfPig.Tests.Integration
 {
    using System;
    using System.IO;
    using Xunit;

    /// <summary>
    /// Integration tests which open the "Pig Reproduction Powerpoint.pdf" test document
    /// and check its text content and page count.
    /// </summary>
    public class PigReproductionPowerpointTests
    {
        /// <summary>
        /// Builds the full path to the test document, located in the Integration/Documents
        /// folder three levels above the application base directory.
        /// </summary>
        private static string GetFilename()
        {
            var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
            var relativeFolder = Path.Combine(baseDirectory, "..", "..", "..", "Integration", "Documents");
            var absoluteFolder = Path.GetFullPath(relativeFolder);

            return Path.Combine(absoluteFolder, "Pig Reproduction Powerpoint.pdf");
        }

        /// <summary>
        /// The text of the first page should contain a known sentence.
        /// </summary>
        [Fact]
        public void CanReadContent()
        {
            var path = GetFilename();

            using (var document = PdfDocument.Open(path))
            {
                var firstPageText = document.GetPage(1).Text;

                Assert.Contains("Pigs per sow per year: 18 to 27", firstPageText);
            }
        }

        /// <summary>
        /// The document should report 35 pages.
        /// </summary>
        [Fact]
        public void HasCorrectNumberOfPages()
        {
            var path = GetFilename();

            using (var document = PdfDocument.Open(path))
            {
                var pageCount = document.NumberOfPages;

                Assert.Equal(35, pageCount);
            }
        }
    }
 }
--- a/src/UglyToad.PdfPig.Tests/UglyToad.PdfPig.Tests.csproj
+++ b/src/UglyToad.PdfPig.Tests/UglyToad.PdfPig.Tests.csproj
@@ -11,12 +11,14 @@
    <None Remove="Fonts\TrueType\google-simple-doc.ttf" />
    <None Remove="Fonts\TrueType\Roboto-Regular.ttf" />
    <None Remove="Fonts\Type1\AdobeUtopia.pfa" />
    <None Remove="Fonts\Type1\Raleway-Black.pfb" />
    <None Remove="Integration\Documents\FarmerMac.pdf" />
    <None Remove="Integration\Documents\Font Size Test - from google chrome print pdf.pdf" />
    <None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
    <None Remove="Integration\Documents\ICML03-081.pdf" />
    <None Remove="Integration\Documents\Judgement Document.pdf" />
    <None Remove="Integration\Documents\Multiple Page - from Mortality Statistics.pdf" />
    <None Remove="Integration\Documents\Pig Reproduction Powerpoint.pdf" />
    <None Remove="Integration\Documents\Single Page Form Content - from itext 1_1.pdf" />
    <None Remove="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf" />
    <None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
@@ -35,6 +37,9 @@
    <EmbeddedResource Include="Fonts\Type1\AdobeUtopia.pfa">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </EmbeddedResource>
    <EmbeddedResource Include="Fonts\Type1\Raleway-Black.pfb">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </EmbeddedResource>
    <Content Include="Integration\Documents\FarmerMac.pdf">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </Content>
@@ -53,6 +58,9 @@
    <Content Include="Integration\Documents\Multiple Page - from Mortality Statistics.pdf">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </Content>
    <Content Include="Integration\Documents\Pig Reproduction Powerpoint.pdf">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </Content>
    <Content Include="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </Content>
--- a/src/UglyToad.PdfPig/Fonts/Parser/Handlers/Type1FontHandler.cs
+++ b/src/UglyToad.PdfPig/Fonts/Parser/Handlers/Type1FontHandler.cs
@@ -103,18 +103,17 @@
            try
            {
-                var stream = pdfScanner.Get(descriptor.FontFile.ObjectKey.Data).Data as StreamToken;
+                if (!(pdfScanner.Get(descriptor.FontFile.ObjectKey.Data).Data is StreamToken stream))
                if (stream == null)
                {
                    return null;
                }
                var length1 = stream.StreamDictionary.Get<NumericToken>(NameToken.Length1, pdfScanner);
                var length2 = stream.StreamDictionary.Get<NumericToken>(NameToken.Length2, pdfScanner);
                var bytes = stream.Decode(filterProvider);
-
+                
-                var text = OtherEncodings.BytesAsLatin1String(bytes);
+                var font = type1FontParser.Parse(new ByteArrayInputBytes(bytes), length1.Int, length2.Int);
                var font = type1FontParser.Parse(new ByteArrayInputBytes(bytes));
                return font;
            }
--- a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs
+++ b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs
@@ -12,6 +12,9 @@
    internal class Type1FontParser
    {
        private const string ClearToMark = "cleartomark";
        private const int PfbFileIndicator = 0x80;
        private readonly Type1EncryptedPortionParser encryptedPortionParser;
        public Type1FontParser(Type1EncryptedPortionParser encryptedPortionParser)
@@ -19,8 +22,27 @@
            this.encryptedPortionParser = encryptedPortionParser;
        }
-        public Type1Font Parse(IInputBytes inputBytes)
+        /// <summary>
        /// Parses an embedded Adobe Type 1 font file.
        /// </summary>
        /// <param name="inputBytes">The bytes of the font program.</param>
        /// <param name="length1">The length in bytes of the clear text portion of the font program.</param>
        /// <param name="length2">The length in bytes of the encrypted portion of the font program.</param>
        /// <returns>The parsed type 1 font.</returns>
        public Type1Font Parse(IInputBytes inputBytes, int length1, int length2)
        {
            var isEntirePfbFile = inputBytes.Peek() == PfbFileIndicator;
            IReadOnlyList<byte> eexecPortion = new byte[0];
            if (isEntirePfbFile)
            {
                var (ascii, binary) = ReadPfbHeader(inputBytes);
                eexecPortion = binary;
                inputBytes = new ByteArrayInputBytes(ascii);
            }
            var scanner = new CoreTokenScanner(inputBytes);
            if (!scanner.TryReadToken(out CommentToken comment) || !comment.Data.StartsWith("!"))
@@ -53,11 +75,10 @@
            var nameTokenizer = new Type1NameTokenizer();
            scanner.RegisterCustomTokenizer((byte)'{', arrayTokenizer);
            scanner.RegisterCustomTokenizer((byte)'/', nameTokenizer);
-
+            
            var eexecPortion = new List<byte>();
            try
            {
                var tempEexecPortion = new List<byte>();
                var tokenSet = new PreviousTokenSet();
                tokenSet.Add(scanner.CurrentToken);
                while (scanner.MoveNext())
@@ -80,7 +101,7 @@
                                    {
                                        for (int i = 0; i < offset; i++)
                                        {
-                                            eexecPortion.Add((byte)ClearToMark[i]);
+                                            tempEexecPortion.Add((byte)ClearToMark[i]);
                                        }
                                    }
@@ -97,7 +118,7 @@
                                    continue;
                                }
-                                eexecPortion.Add(inputBytes.CurrentByte);
+                                tempEexecPortion.Add(inputBytes.CurrentByte);
                            }
                        }
                        else
@@ -108,6 +129,11 @@
                    tokenSet.Add(scanner.CurrentToken);
                }
                if (!isEntirePfbFile)
                {
                    eexecPortion = tempEexecPortion;
                }
            }
            finally
            {
@@ -124,6 +150,65 @@
            return new Type1Font(name, encoding, matrix, boundingBox);
        }
        /// <summary>
        /// Where an entire PFB file has been embedded in the PDF we read the header first.
        /// </summary>
        /// <remarks>
        /// Reads two consecutive records from the input. Each record starts with the 0x80 marker byte,
        /// followed by a record type byte and a 4 byte little-endian length, followed by that many bytes of data.
        /// The first record must have type 0x01 and the second type 0x02.
        /// </remarks>
        /// <param name="bytes">The input positioned immediately before the first 0x80 marker byte.</param>
        /// <returns>The data of the first (ascii) record and the data of the second (binary) record.</returns>
        /// <exception cref="InvalidOperationException">
        /// A record marker or record type is not the expected value, a record length is negative
        /// or the input ends before a record has been read completely.
        /// </exception>
        private static (byte[] ascii, byte[] binary) ReadPfbHeader(IInputBytes bytes)
        {
            // Advances the input by one byte and fails loudly on truncated data rather than
            // silently re-reading the previous byte.
            byte ReadByte()
            {
                if (!bytes.MoveNext())
                {
                    throw new InvalidOperationException("Unexpected end of input while reading the PFB file.");
                }

                return bytes.CurrentByte;
            }

            // Reads the 6 byte record header and returns the length of the record data.
            int ReadSize(byte recordType)
            {
                var marker = ReadByte();
                if (marker != PfbFileIndicator)
                {
                    throw new InvalidOperationException($"File does not start with 0x80, which indicates a full PFB file. Instead got: {marker}");
                }

                var actualType = ReadByte();
                if (actualType != recordType)
                {
                    throw new InvalidOperationException($"Encountered unexpected header type in the PFB file: {actualType}");
                }

                // The length is stored least significant byte first.
                int size = ReadByte();
                size += ReadByte() << 8;
                size += ReadByte() << 16;
                size += ReadByte() << 24;

                // A most significant byte of 0x80 or above wraps the signed int negative,
                // which would otherwise surface as an unrelated array allocation failure.
                if (size < 0)
                {
                    throw new InvalidOperationException($"Encountered invalid record length in the PFB file: {size}");
                }

                return size;
            }

            // Reads a whole record: header followed by its data bytes.
            byte[] ReadRecord(byte recordType)
            {
                var size = ReadSize(recordType);
                var data = new byte[size];

                for (var i = 0; i < size; i++)
                {
                    data[i] = ReadByte();
                }

                return data;
            }

            var asciiPart = ReadRecord(0x01);
            var binaryPart = ReadRecord(0x02);

            return (asciiPart, binaryPart);
        }
        private static void HandleOperator(OperatorToken token, ISeekableTokenScanner scanner, PreviousTokenSet set, List<DictionaryToken> dictionaries)
        {
            switch (token.Data)
@@ -266,8 +351,8 @@
                {
                    for (var i = 0; i < encodingArray.Data.Count; i += 2)
                    {
-                        var code = (NumericToken) encodingArray.Data[i];
+                        var code = (NumericToken)encodingArray.Data[i];
-                        var name = (NameToken) encodingArray.Data[i + 1];
+                        var name = (NameToken)encodingArray.Data[i + 1];
                        result[code.Int] = name.Data;
                    }
@@ -298,10 +383,10 @@
            {
                if (dictionary.TryGet(NameToken.FontBbox, out var token) && token is ArrayToken array && array.Data.Count == 4)
                {
-                    var x1 = (NumericToken) array.Data[0];
+                    var x1 = (NumericToken)array.Data[0];
-                    var y1 = (NumericToken) array.Data[1];
+                    var y1 = (NumericToken)array.Data[1];
-                    var x2 = (NumericToken) array.Data[2];
+                    var x2 = (NumericToken)array.Data[2];
-                    var y2 = (NumericToken) array.Data[3];
+                    var y2 = (NumericToken)array.Data[3];
                    return new PdfRectangle(x1.Data, y1.Data, x2.Data, y2.Data);
                }
@@ -309,7 +394,7 @@
            return null;
        }
-        
+
        private class PreviousTokenSet
        {
            private readonly IToken[] tokens = new IToken[3];
--- a/src/UglyToad.PdfPig/Tokenization/Tokens/DictionaryToken.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Tokens/DictionaryToken.cs
@@ -3,6 +3,8 @@
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using Parser.Parts;
    using Scanner;
    using Util.JetBrains.Annotations;
    internal class DictionaryToken : IDataToken<IReadOnlyDictionary<string, IToken>>
@@ -39,6 +41,21 @@
        {
            Data = data;
        }
        /// <summary>
        /// Gets the entry with the given name as a token of type <typeparamref name="T"/>.
        /// </summary>
        /// <typeparam name="T">The expected type of the token.</typeparam>
        /// <param name="name">The name of the dictionary entry.</param>
        /// <param name="scanner">The scanner passed to <c>DirectObjectFinder</c> when the entry is an indirect reference.</param>
        /// <returns>
        /// The entry itself when it is already of type <typeparamref name="T"/>, otherwise the result of
        /// <c>DirectObjectFinder.Get</c> for an entry which is an indirect reference.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The entry is missing, or is neither of type <typeparamref name="T"/> nor an indirect reference.
        /// </exception>
        public T Get<T>(NameToken name, IPdfTokenScanner scanner) where T : IToken
        {
            // Entry present and already of the requested type: nothing to resolve.
            if (TryGet(name, out var token) && token is T direct)
            {
                return direct;
            }

            // Otherwise the only acceptable alternative is an indirect reference to resolve.
            if (token is IndirectReferenceToken reference)
            {
                return DirectObjectFinder.Get<T>(reference, scanner);
            }

            throw new InvalidOperationException($"Dictionary does not contain token with name {name} of type {typeof(T).Name}.");
        }
        public bool TryGet(NameToken name, out IToken token)
        {