add support for parsing pfb files in type 1 fonts and an extra integration test

This commit is contained in:
Eliot Jones
2018-04-12 22:34:38 +01:00
parent 7af2b1bcb9
commit e063ac45fe
8 changed files with 170 additions and 25 deletions

View File

@@ -18,16 +18,16 @@
{
var bytes = GetFileBytes("AdobeUtopia.pfa");
parser.Parse(new ByteArrayInputBytes(bytes));
parser.Parse(new ByteArrayInputBytes(bytes),0, 0);
}
[Fact]
public void CanReadBinaryEncryptedPortion()
public void CanReadBinaryEncryptedPortionOfFullPfb()
{
// TODO: support reading in these pfb files
//var bytes = GetFileBytes("cmbx8.pfb");
var bytes = GetFileBytes("Raleway-Black.pfb");
//parser.Parse(new ByteArrayInputBytes(bytes));
parser.Parse(new ByteArrayInputBytes(bytes), 0, 0);
}
[Fact]
@@ -35,7 +35,7 @@
{
var bytes = StringBytesTestConverter.Convert(Cmbx12, false);
parser.Parse(bytes.Bytes);
parser.Parse(bytes.Bytes, 0, 0);
}
private const string Cmbx12 = @"%!PS-AdobeFont-1.1: CMBX12 1.0

View File

@@ -0,0 +1,36 @@
namespace UglyToad.PdfPig.Tests.Integration
{
    using System;
    using System.IO;
    using Xunit;

    /// <summary>
    /// Integration tests which open the "Pig Reproduction Powerpoint.pdf" sample document.
    /// </summary>
    public class PigReproductionPowerpointTests
    {
        private const string DocumentName = "Pig Reproduction Powerpoint.pdf";

        /// <summary>
        /// Builds the absolute path of the sample document. The documents folder is located
        /// three levels above the application base directory, under Integration/Documents.
        /// </summary>
        private static string GetFilename()
        {
            var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
            var relativeFolder = Path.Combine(baseDirectory, "..", "..", "..", "Integration", "Documents");
            var absoluteFolder = Path.GetFullPath(relativeFolder);

            return Path.Combine(absoluteFolder, DocumentName);
        }

        /// <summary>
        /// The text of the first page contains a known sentence.
        /// </summary>
        [Fact]
        public void CanReadContent()
        {
            using (var pdf = PdfDocument.Open(GetFilename()))
            {
                var firstPage = pdf.GetPage(1);

                Assert.Contains("Pigs per sow per year: 18 to 27", firstPage.Text);
            }
        }

        /// <summary>
        /// The document reports the expected page count.
        /// </summary>
        [Fact]
        public void HasCorrectNumberOfPages()
        {
            using (var pdf = PdfDocument.Open(GetFilename()))
            {
                var pageCount = pdf.NumberOfPages;

                Assert.Equal(35, pageCount);
            }
        }
    }
}

View File

@@ -11,12 +11,14 @@
<None Remove="Fonts\TrueType\google-simple-doc.ttf" />
<None Remove="Fonts\TrueType\Roboto-Regular.ttf" />
<None Remove="Fonts\Type1\AdobeUtopia.pfa" />
<None Remove="Fonts\Type1\Raleway-Black.pfb" />
<None Remove="Integration\Documents\FarmerMac.pdf" />
<None Remove="Integration\Documents\Font Size Test - from google chrome print pdf.pdf" />
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
<None Remove="Integration\Documents\ICML03-081.pdf" />
<None Remove="Integration\Documents\Judgement Document.pdf" />
<None Remove="Integration\Documents\Multiple Page - from Mortality Statistics.pdf" />
<None Remove="Integration\Documents\Pig Reproduction Powerpoint.pdf" />
<None Remove="Integration\Documents\Single Page Form Content - from itext 1_1.pdf" />
<None Remove="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf" />
<None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
@@ -35,6 +37,9 @@
<EmbeddedResource Include="Fonts\Type1\AdobeUtopia.pfa">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</EmbeddedResource>
<EmbeddedResource Include="Fonts\Type1\Raleway-Black.pfb">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</EmbeddedResource>
<Content Include="Integration\Documents\FarmerMac.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
@@ -53,6 +58,9 @@
<Content Include="Integration\Documents\Multiple Page - from Mortality Statistics.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Pig Reproduction Powerpoint.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>

View File

@@ -103,18 +103,17 @@
try
{
var stream = pdfScanner.Get(descriptor.FontFile.ObjectKey.Data).Data as StreamToken;
if (stream == null)
if (!(pdfScanner.Get(descriptor.FontFile.ObjectKey.Data).Data is StreamToken stream))
{
return null;
}
var length1 = stream.StreamDictionary.Get<NumericToken>(NameToken.Length1, pdfScanner);
var length2 = stream.StreamDictionary.Get<NumericToken>(NameToken.Length2, pdfScanner);
var bytes = stream.Decode(filterProvider);
var text = OtherEncodings.BytesAsLatin1String(bytes);
var font = type1FontParser.Parse(new ByteArrayInputBytes(bytes));
var font = type1FontParser.Parse(new ByteArrayInputBytes(bytes), length1.Int, length2.Int);
return font;
}

View File

@@ -12,6 +12,9 @@
internal class Type1FontParser
{
private const string ClearToMark = "cleartomark";
private const int PfbFileIndicator = 0x80;
private readonly Type1EncryptedPortionParser encryptedPortionParser;
public Type1FontParser(Type1EncryptedPortionParser encryptedPortionParser)
@@ -19,8 +22,27 @@
this.encryptedPortionParser = encryptedPortionParser;
}
public Type1Font Parse(IInputBytes inputBytes)
/// <summary>
/// Parses an embedded Adobe Type 1 font file.
/// </summary>
/// <param name="inputBytes">The bytes of the font program.</param>
/// <param name="length1">The length in bytes of the clear text portion of the font program.</param>
/// <param name="length2">The length in bytes of the encrypted portion of the font program.</param>
/// <returns>The parsed type 1 font.</returns>
public Type1Font Parse(IInputBytes inputBytes, int length1, int length2)
{
var isEntirePfbFile = inputBytes.Peek() == PfbFileIndicator;
IReadOnlyList<byte> eexecPortion = new byte[0];
if (isEntirePfbFile)
{
var (ascii, binary) = ReadPfbHeader(inputBytes);
eexecPortion = binary;
inputBytes = new ByteArrayInputBytes(ascii);
}
var scanner = new CoreTokenScanner(inputBytes);
if (!scanner.TryReadToken(out CommentToken comment) || !comment.Data.StartsWith("!"))
@@ -54,10 +76,9 @@
scanner.RegisterCustomTokenizer((byte)'{', arrayTokenizer);
scanner.RegisterCustomTokenizer((byte)'/', nameTokenizer);
var eexecPortion = new List<byte>();
try
{
var tempEexecPortion = new List<byte>();
var tokenSet = new PreviousTokenSet();
tokenSet.Add(scanner.CurrentToken);
while (scanner.MoveNext())
@@ -80,7 +101,7 @@
{
for (int i = 0; i < offset; i++)
{
eexecPortion.Add((byte)ClearToMark[i]);
tempEexecPortion.Add((byte)ClearToMark[i]);
}
}
@@ -97,7 +118,7 @@
continue;
}
eexecPortion.Add(inputBytes.CurrentByte);
tempEexecPortion.Add(inputBytes.CurrentByte);
}
}
else
@@ -108,6 +129,11 @@
tokenSet.Add(scanner.CurrentToken);
}
if (!isEntirePfbFile)
{
eexecPortion = tempEexecPortion;
}
}
finally
{
@@ -124,6 +150,65 @@
return new Type1Font(name, encoding, matrix, boundingBox);
}
/// <summary>
/// Where an entire PFB file has been embedded in the PDF we read the header first.
/// </summary>
/// <remarks>
/// Two segments are read in order: one with record type 0x01 followed by one with record type 0x02.
/// Each segment is preceded by a 6 byte header made up of the 0x80 marker byte, the record type byte
/// and a 4 byte little-endian length. Anything after the second segment is left unread.
/// </remarks>
/// <param name="bytes">The input, positioned so that the next byte moved to is the first 0x80 marker.</param>
/// <returns>The content of the first (ascii) segment and of the second (binary) segment.</returns>
/// <exception cref="InvalidOperationException">
/// A segment header does not have the expected marker or record type, a segment length cannot be
/// represented, or the input ends before both segments have been read in full.
/// </exception>
private static (byte[] ascii, byte[] binary) ReadPfbHeader(IInputBytes bytes)
{
    // Advance by exactly one byte. A failed move is reported immediately instead of carrying on
    // with whatever value CurrentByte happens to hold, which would silently corrupt the result.
    byte ReadByte()
    {
        if (!bytes.MoveNext())
        {
            throw new InvalidOperationException("Unexpected end of input while reading the PFB file.");
        }

        return bytes.CurrentByte;
    }

    // Reads a segment header of the expected record type and returns the segment length in bytes.
    int ReadSize(byte recordType)
    {
        var marker = ReadByte();
        if (marker != PfbFileIndicator)
        {
            throw new InvalidOperationException($"File does not start with 0x80, which indicates a full PFB file. Instead got: {marker}");
        }

        var actualType = ReadByte();
        if (actualType != recordType)
        {
            throw new InvalidOperationException($"Encountered unexpected header type in the PFB file: {actualType}");
        }

        // Accumulate in a long: shifting a byte >= 0x80 left by 24 as an int would produce a
        // negative length and a confusing failure when the buffer is allocated.
        long size = ReadByte();
        size |= (long)ReadByte() << 8;
        size |= (long)ReadByte() << 16;
        size |= (long)ReadByte() << 24;

        if (size > int.MaxValue)
        {
            throw new InvalidOperationException($"The PFB segment length is too large to read: {size}");
        }

        return (int)size;
    }

    // Reads the header for the given record type followed by the segment content it describes.
    byte[] ReadSegment(byte recordType)
    {
        var segment = new byte[ReadSize(recordType)];

        for (var i = 0; i < segment.Length; i++)
        {
            segment[i] = ReadByte();
        }

        return segment;
    }

    // The order matters: the input is consumed sequentially, ascii segment first.
    var asciiPart = ReadSegment(0x01);
    var binaryPart = ReadSegment(0x02);

    return (asciiPart, binaryPart);
}
private static void HandleOperator(OperatorToken token, ISeekableTokenScanner scanner, PreviousTokenSet set, List<DictionaryToken> dictionaries)
{
switch (token.Data)

View File

@@ -3,6 +3,8 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Parser.Parts;
using Scanner;
using Util.JetBrains.Annotations;
internal class DictionaryToken : IDataToken<IReadOnlyDictionary<string, IToken>>
@@ -40,6 +42,21 @@
Data = data;
}
/// <summary>
/// Gets the token stored under <paramref name="name"/> as a <typeparamref name="T"/>.
/// If the stored token is not a <typeparamref name="T"/> but is an indirect reference,
/// the reference is resolved through <see cref="DirectObjectFinder"/> using the scanner.
/// </summary>
/// <typeparam name="T">The type of token expected.</typeparam>
/// <param name="name">The name of the dictionary entry to read.</param>
/// <param name="scanner">The scanner used to resolve an indirect reference.</param>
/// <returns>The token of the requested type.</returns>
/// <exception cref="InvalidOperationException">
/// The entry is neither a <typeparamref name="T"/> nor an indirect reference.
/// </exception>
public T Get<T>(NameToken name, IPdfTokenScanner scanner) where T : IToken
{
    // Direct hit: the entry exists and already has the requested type.
    if (TryGet(name, out var found) && found is T match)
    {
        return match;
    }

    // Otherwise the only acceptable alternative is an indirect reference to resolve.
    if (found is IndirectReferenceToken reference)
    {
        return DirectObjectFinder.Get<T>(reference, scanner);
    }

    throw new InvalidOperationException($"Dictionary does not contain token with name {name} of type {typeof(T).Name}.");
}
public bool TryGet(NameToken name, out IToken token)
{
if (name == null)