add very hacky parsing for type 1 font files in order to read the encoding

This commit is contained in:
Eliot Jones
2018-01-14 18:59:03 +00:00
parent 615ee88a46
commit 4443cde229
13 changed files with 627 additions and 17 deletions

View File

@@ -0,0 +1,95 @@
namespace UglyToad.PdfPig.Tests.Fonts.Type1
{
using PdfPig.Fonts.Type1.Parser;
using Xunit;
public class Type1FontParserTests
{
private readonly Type1FontParser parser = new Type1FontParser();
/// <summary>
/// Smoke test: parsing the CMBX12 Type 1 font program must complete without throwing.
/// </summary>
[Fact]
public void CanRead()
{
    var input = StringBytesTestConverter.Convert(Cmbx12, false).Bytes;

    parser.Parse(input);
}
private const string Cmbx12 = @"%!PS-AdobeFont-1.1: CMBX12 1.0
%%CreationDate: 1991 Aug 20 16:34:54
% Copyright (C) 1997 American Mathematical Society. All Rights Reserved.
11 dict begin
/FontInfo 7 dict dup begin
/version (1.0) readonly def
/Notice (Copyright (C) 1997 American Mathematical Society. All Rights Reserved) readonly def
/FullName (CMBX12) readonly def
/FamilyName (Computer Modern) readonly def
/Weight (Bold) readonly def
/ItalicAngle 0 def
/isFixedPitch false def
end readonly def
/FontName /WDKAAR+CMBX12 def
/PaintType 0 def
/FontType 1 def
/FontMatrix [0.001 0 0 0.001 0 0] readonly def
/Encoding 256 array
0 1 255 {1 index exch /.notdef put} for
dup 12 /fi put
dup 46 /period put
dup 49 /one put
dup 50 /two put
dup 51 /three put
dup 52 /four put
dup 53 /five put
dup 65 /A put
dup 66 /B put
dup 67 /C put
dup 69 /E put
dup 73 /I put
dup 77 /M put
dup 78 /N put
dup 80 /P put
dup 82 /R put
dup 83 /S put
dup 84 /T put
dup 97 /a put
dup 98 /b put
dup 99 /c put
dup 100 /d put
dup 101 /e put
dup 102 /f put
dup 103 /g put
dup 104 /h put
dup 105 /i put
dup 107 /k put
dup 108 /l put
dup 109 /m put
dup 110 /n put
dup 111 /o put
dup 112 /p put
dup 114 /r put
dup 115 /s put
dup 116 /t put
dup 117 /u put
dup 118 /v put
dup 120 /x put
dup 121 /y put
readonly def
/FontBBox{-53 -251 1139 750}readonly def
/UniqueID 5000769 def
currentdict end
currentfile eexec
ÙÖoc;„j—¶†©~E£ÐªÙÖoc;„j—¶†©~E£Ð7Ԑ×1¼Iu`“ÂõÎ>ä9Á?î\ºlüýÄ6Ag_Â_²ÂGÄ´/³0¨;2j~þªÙÖoc;„j—¶†©~E£ÐªÙÖoc;„j—¶†©~E£ÐªÙÖoc;„j—¶†
©~E£ÐªÙÖoc;„j—¶†©~E£ÐªÙÖoc;„j—¶†©~E£ÐªÙÖoc;„j—¶†©~E£Ðª7Ԑ×1¼Iu`“ÂõÎ>ä9Á?î\ºlüýÄ6Ag_Â_²ÂGÄ´/³0¨;2j~þ
ÙÖoc;„j—¶†©~E£ÐªÙÖoc;„j—¶†©~E£ÐªÙÖoc;„j—¶†©~E£ÐªÙÖoc;„j—¶†©~E£ÐªÙÖoc;„j—¶†©~E£ÐªÙÖoc;„j—¶†©~E£Ðª7Ԑ×1¼Iu`“ÂõÎ>ä9Á?î\ºlüýÄ6Ag_Â_²ÂGÄ´/³0¨;2j~þ
×1¼Iu`“ÂõÎ>ä9Á?î\ºlüýÄ6Ag_Â_²ÂGÄ´/³0¨;2j~þv7Ԑ×1¼Iu`“ÂõÎ>ä9Á?î\ºlüýÄ6Ag_Â_²ÂGÄ´/³0¨;2j~þ000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
cleartomark";
}
}

View File

@@ -0,0 +1,40 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using System;
using System.IO;
using Xunit;
/// <summary>
/// Integration tests which open ICML03-081.pdf from the Integration/Documents folder.
/// </summary>
public class LaTexTests
{
    /// <summary>
    /// Builds the absolute path to the test document, resolved three levels above the application base directory.
    /// </summary>
    private static string GetFilename()
    {
        var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
        var relativeFolder = Path.Combine(baseDirectory, "..", "..", "..", "Integration", "Documents");

        return Path.Combine(Path.GetFullPath(relativeFolder), "ICML03-081.pdf");
    }

    /// <summary>
    /// The text of the first two pages should contain the expected fragments.
    /// </summary>
    [Fact]
    public void CanReadContent()
    {
        using (var document = PdfDocument.Open(GetFilename()))
        {
            var firstPage = document.GetPage(1);

            Assert.Contains("TacklingthePoorAssumptionsofNaiveBayesTextClassifiers", firstPage.Text);

            var secondPage = document.GetPage(2);

            Assert.Contains("isθc={θc1,θc2,...,θcn},", secondPage.Text);
        }
    }

    /// <summary>
    /// The document should report eight pages.
    /// </summary>
    [Fact]
    public void HasCorrectNumberOfPages()
    {
        using (var document = PdfDocument.Open(GetFilename()))
        {
            var pageCount = document.NumberOfPages;

            Assert.Equal(8, pageCount);
        }
    }
}
}

View File

@@ -10,10 +10,7 @@
[Fact]
public void Tests()
{
using (var document = PdfDocument.Open(@"C:\Users\eliot\Downloads\ICML03-081.pdf"))
{
var page = document.GetPage(1);
}
}
}
}

View File

@@ -13,6 +13,7 @@
<None Remove="Integration\Documents\FarmerMac.pdf" />
<None Remove="Integration\Documents\Font Size Test - from google chrome print pdf.pdf" />
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
<None Remove="Integration\Documents\ICML03-081.pdf" />
<None Remove="Integration\Documents\Judgement Document.pdf" />
<None Remove="Integration\Documents\Multiple Page - from Mortality Statistics.pdf" />
<None Remove="Integration\Documents\Single Page Form Content - from itext 1_1.pdf" />
@@ -39,6 +40,9 @@
<Content Include="Integration\Documents\Font Size Test - from google chrome print pdf.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\ICML03-081.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Judgement Document.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>

View File

@@ -1,6 +1,5 @@
namespace UglyToad.PdfPig.Fonts.Parser.Handlers
{
using System;
using Cmap;
using ContentStream;
using Cos;
@@ -13,6 +12,8 @@
using Simple;
using Tokenization.Scanner;
using Tokenization.Tokens;
using Type1;
using Type1.Parser;
internal class Type1FontHandler : IFontHandler
{
@@ -22,10 +23,12 @@
private readonly FontDescriptorFactory fontDescriptorFactory;
private readonly IEncodingReader encodingReader;
private readonly IPdfObjectScanner scanner;
private readonly Type1FontParser type1FontParser;
public Type1FontHandler(IPdfObjectParser pdfObjectParser, CMapCache cMapCache, IFilterProvider filterProvider,
FontDescriptorFactory fontDescriptorFactory, IEncodingReader encodingReader,
IPdfObjectScanner scanner)
IPdfObjectScanner scanner,
Type1FontParser type1FontParser)
{
this.pdfObjectParser = pdfObjectParser;
this.cMapCache = cMapCache;
@@ -33,6 +36,7 @@
this.fontDescriptorFactory = fontDescriptorFactory;
this.encodingReader = encodingReader;
this.scanner = scanner;
this.type1FontParser = type1FontParser;
}
public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
@@ -60,7 +64,7 @@
var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfObjectParser, fontDescriptorFactory, dictionary, reader, isLenientParsing);
ParseType1Font(descriptor, isLenientParsing);
var font = ParseType1Font(descriptor, isLenientParsing);
var name = FontDictionaryAccessHelper.GetName(pdfObjectParser, dictionary, descriptor, reader, isLenientParsing);
@@ -79,19 +83,24 @@
Encoding encoding = encodingReader.Read(dictionary, reader, isLenientParsing, descriptor);
return new Type1Font(name, firstCharacter, lastCharacter, widths, descriptor, encoding, toUnicodeCMap);
if (encoding == null && font?.Encoding.Count > 0)
{
encoding = new BuiltInEncoding(font.Encoding);
}
return new Type1FontSimple(name, firstCharacter, lastCharacter, widths, descriptor, encoding, toUnicodeCMap);
}
private void ParseType1Font(FontDescriptor descriptor, bool isLenientParsing)
private Type1Font ParseType1Font(FontDescriptor descriptor, bool isLenientParsing)
{
if (descriptor?.FontFile == null)
{
return;
return null;
}
if (descriptor.FontFile.ObjectKey.ObjectNumber == 0)
{
return;
return null;
}
try
@@ -100,14 +109,16 @@
if (stream == null)
{
return;
return null;
}
var raw = new PdfRawStream(stream);
var bytes = raw.Decode(filterProvider);
// TODO: parse
var font = type1FontParser.Parse(new ByteArrayInputBytes(bytes));
return font;
}
catch
{
@@ -116,6 +127,8 @@
throw;
}
}
return null;
}
}
}

View File

@@ -11,7 +11,7 @@
/// <summary>
/// TODO: implement this properly if you find a Type 1 font in the wild.
/// </summary>
internal class Type1Font : IFont
internal class Type1FontSimple : IFont
{
private readonly int firstChar;
private readonly int lastChar;
@@ -25,7 +25,7 @@
public bool IsVertical { get; } = false;
public Type1Font(CosName name, int firstChar, int lastChar, decimal[] widths, FontDescriptor fontDescriptor, Encoding encoding, CMap toUnicodeCMap)
public Type1FontSimple(CosName name, int firstChar, int lastChar, decimal[] widths, FontDescriptor fontDescriptor, Encoding encoding, CMap toUnicodeCMap)
{
this.firstChar = firstChar;
this.lastChar = lastChar;

View File

@@ -1,10 +1,299 @@
namespace UglyToad.PdfPig.Fonts.Type1.Parser
{
using System;
using System.Collections.Generic;
using Cos;
using Exceptions;
using Geometry;
using IO;
using Tokenization;
using Tokenization.Scanner;
using Tokenization.Tokens;
internal class Type1FontParser
{
public void Parse()
/// <summary>
/// Reads a Type 1 font program and extracts its name, encoding, font matrix and bounding box.
/// </summary>
/// <param name="inputBytes">The bytes of the font program.</param>
/// <returns>The information gathered from the dictionaries found in the program.</returns>
/// <exception cref="InvalidFontFormatException">The first token is not a comment starting with '!'.</exception>
public Type1Font Parse(IInputBytes inputBytes)
{
    var scanner = new CoreTokenScanner(inputBytes);

    if (!scanner.TryReadToken(out CommentToken header) || !header.Data.StartsWith("!"))
    {
        throw new InvalidFontFormatException("The Type1 program did not start with '%!'.");
    }

    // The font name is only taken from the header when it splits into exactly three parts.
    var headerParts = header.Data.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries);
    var name = headerParts.Length == 3 ? headerParts[1] : "Unknown";

    // Consume the remaining leading comments; they are collected but not used further here.
    var leadingComments = new List<string>();
    while (scanner.MoveNext())
    {
        if (!(scanner.CurrentToken is CommentToken leadingComment))
        {
            break;
        }

        leadingComments.Add(leadingComment.Data);
    }

    var dictionaries = new List<DictionaryToken>();

    // Type 1 treats '{' and '/' differently, so custom tokenizers are registered for the duration of the scan.
    var arrayTokenizer = new Type1ArrayTokenizer();
    var nameTokenizer = new Type1NameTokenizer();

    scanner.RegisterCustomTokenizer((byte)'{', arrayTokenizer);
    scanner.RegisterCustomTokenizer((byte)'/', nameTokenizer);

    try
    {
        // The token which ended the comment loop is the first entry in the history.
        var history = new PreviousTokenSet();
        history.Add(scanner.CurrentToken);

        while (scanner.MoveNext())
        {
            if (scanner.CurrentToken is OperatorToken operatorToken)
            {
                HandleOperator(operatorToken, inputBytes, scanner, history, dictionaries);
            }

            // Handling an operator may advance the scanner, record wherever it ended up.
            history.Add(scanner.CurrentToken);
        }
    }
    finally
    {
        scanner.DeregisterCustomTokenizer(arrayTokenizer);
        scanner.DeregisterCustomTokenizer(nameTokenizer);
    }

    return new Type1Font(name,
        GetEncoding(dictionaries),
        GetFontMatrix(dictionaries),
        GetBoundingBox(dictionaries));
}
/// <summary>
/// Reacts to the operators the parser cares about: 'dict' starts reading a dictionary and
/// 'currentfile' followed by 'eexec' skips the rest of the input. All other operators are ignored.
/// </summary>
/// <param name="token">The operator which was just read.</param>
/// <param name="bytes">The underlying input, used to skip the encrypted portion.</param>
/// <param name="scanner">The scanner positioned on <paramref name="token"/>.</param>
/// <param name="set">The most recently read tokens, index 0 being the token before the operator.</param>
/// <param name="dictionaries">Receives each dictionary which is read.</param>
/// <exception cref="InvalidFontFormatException">The 'dict' operator was not preceded by a number.</exception>
private void HandleOperator(OperatorToken token, IInputBytes bytes, ISeekableTokenScanner scanner, PreviousTokenSet set, List<DictionaryToken> dictionaries)
{
    switch (token.Data)
    {
        case "dict":
            // The operand of 'dict' is the token immediately before it. Report a format problem
            // rather than failing with an invalid cast or null reference when it is not a number.
            if (!(set[0] is NumericToken size))
            {
                throw new InvalidFontFormatException("Expected the 'dict' operator to be preceded by a number.");
            }

            var dictionary = ReadDictionary(size.Int, scanner);

            dictionaries.Add(dictionary);
            break;
        case "currentfile":
            if (!scanner.MoveNext() || scanner.CurrentToken != OperatorToken.Eexec)
            {
                return;
            }

            // For now we will not read this stuff.
            SkipEncryptedContent(bytes);
            break;
        default:
            return;
    }
}
/// <summary>
/// Moves the input to its end so that nothing following 'eexec' is tokenized.
/// </summary>
/// <param name="bytes">The input to exhaust.</param>
private void SkipEncryptedContent(IInputBytes bytes)
{
    var lastOffset = bytes.Length - 1;
    bytes.Seek(lastOffset);

    // Keep advancing until the input reports there is nothing left.
    bool advanced;
    do
    {
        advanced = bytes.MoveNext();
    } while (advanced);
}
/// <summary>
/// Reads up to <paramref name="keys"/> entries of the form '/Name value ... def' following the next 'begin' operator.
/// </summary>
/// <param name="keys">The number which preceded the 'dict' operator, used as the maximum number of entries to read.</param>
/// <param name="scanner">The scanner positioned on the 'dict' operator.</param>
/// <returns>The entries read so far; reading stops early if a key is missing or a nested 'dict' has no numeric operand.</returns>
private static DictionaryToken ReadDictionary(int keys, ISeekableTokenScanner scanner)
{
    var entries = new Dictionary<IToken, IToken>();

    // The last value-like token seen; it is deliberately shared across all keys of this dictionary.
    IToken lastValue = null;

    // Skip the operators "dup" etc to reach "begin".
    while (scanner.MoveNext())
    {
        if (scanner.CurrentToken is OperatorToken candidate && candidate.Data == "begin")
        {
            break;
        }
    }

    for (var read = 0; read < keys; read++)
    {
        if (!scanner.TryReadToken(out NameToken key))
        {
            return new DictionaryToken(entries);
        }

        // The encoding has its own layout and is read separately.
        if (key.Data.Equals(CosName.ENCODING))
        {
            entries[key] = ReadEncoding(scanner);
            continue;
        }

        while (scanner.MoveNext())
        {
            IToken current = scanner.CurrentToken;

            if (current == OperatorToken.Def)
            {
                // 'def' ends the entry, its value is whatever was seen last.
                entries[key] = lastValue;
                break;
            }

            if (current == OperatorToken.Dict)
            {
                // A nested dictionary, its size operand must be the previous token.
                if (!(lastValue is NumericToken numeric))
                {
                    return new DictionaryToken(entries);
                }

                lastValue = ReadDictionary(numeric.Int, scanner);
                continue;
            }

            // 'readonly' and 'end' never count as the value of an entry.
            var isIgnored = current == OperatorToken.Readonly
                            || (current is OperatorToken op && op.Data == "end");

            if (!isIgnored)
            {
                lastValue = current;
            }
        }
    }

    return new DictionaryToken(entries);
}
/// <summary>
/// Reads an encoding of the form 'N array ... for dup code /name put ... readonly def'.
/// </summary>
/// <param name="scanner">The scanner positioned on the '/Encoding' name.</param>
/// <returns>
/// A flat array of alternating code and name tokens. The array is empty when the encoding does not
/// use the 'N array' form (for example a named encoding); in that case the rest of the definition is
/// still consumed up to 'def' so the caller can continue with the next dictionary entry.
/// </returns>
/// <exception cref="InvalidFontFormatException">An entry is not of the form 'dup code /name put'.</exception>
private static ArrayToken ReadEncoding(ISeekableTokenScanner scanner)
{
    var result = new List<IToken>();

    // Treat encoding differently, it's what we came here for!
    if (!scanner.TryReadToken(out NumericToken _)
        || !scanner.TryReadToken(out OperatorToken arrayOperatorToken)
        || arrayOperatorToken.Data != "array")
    {
        // Not an explicit array. Finish the definition, otherwise the dangling 'def'
        // would be read as the next key by the caller and end its dictionary early.
        SkipToDef(scanner);
        return new ArrayToken(result);
    }

    while (scanner.MoveNext() && (!(scanner.CurrentToken is OperatorToken forOperator) || forOperator.Data != "for"))
    {
        // skip these operators for now, they're probably important...
    }

    if (scanner.CurrentToken != OperatorToken.For)
    {
        SkipToDef(scanner);
        return new ArrayToken(result);
    }

    while (scanner.MoveNext() && scanner.CurrentToken != OperatorToken.Def && scanner.CurrentToken != OperatorToken.Readonly)
    {
        if (scanner.CurrentToken != OperatorToken.Dup)
        {
            throw new InvalidFontFormatException("Expected the array for encoding to begin with 'dup'.");
        }

        // Check the token types explicitly so malformed entries are reported as format errors
        // instead of surfacing as invalid cast or null reference exceptions.
        scanner.MoveNext();
        if (!(scanner.CurrentToken is NumericToken number))
        {
            throw new InvalidFontFormatException("Expected a numeric character code after 'dup' in the encoding array.");
        }

        scanner.MoveNext();
        if (!(scanner.CurrentToken is NameToken name))
        {
            throw new InvalidFontFormatException("Expected a name after the character code in the encoding array.");
        }

        if (!scanner.TryReadToken(out OperatorToken put) || put != OperatorToken.Put)
        {
            throw new InvalidFontFormatException("Expected the array entry to end with 'put'.");
        }

        result.Add(number);
        result.Add(name);
    }

    SkipToDef(scanner);

    return new ArrayToken(result);
}

/// <summary>
/// Advances the scanner until it is positioned on 'def' or the input is exhausted.
/// </summary>
private static void SkipToDef(ISeekableTokenScanner scanner)
{
    while (scanner.CurrentToken != OperatorToken.Def && scanner.MoveNext())
    {
        // skip
    }
}
/// <summary>
/// Builds the code to name map from the first dictionary which holds an encoding array.
/// </summary>
/// <param name="dictionaries">The dictionaries read from the font program.</param>
/// <returns>The map from character code to name, empty when no dictionary has an encoding array.</returns>
private static Dictionary<int, string> GetEncoding(IReadOnlyList<DictionaryToken> dictionaries)
{
    var codeToName = new Dictionary<int, string>();

    foreach (var candidate in dictionaries)
    {
        if (!candidate.TryGetByName(CosName.ENCODING, out var token) || !(token is ArrayToken encodingArray))
        {
            continue;
        }

        // The array holds alternating (code, name) tokens.
        var entries = encodingArray.Data;
        var index = 0;
        while (index < entries.Count)
        {
            var code = (NumericToken)entries[index];
            var glyphName = (NameToken)entries[index + 1];

            codeToName[code.Int] = glyphName.Data.Name;

            index += 2;
        }

        // Only the first encoding found is used.
        break;
    }

    return codeToName;
}
/// <summary>
/// Finds the first font matrix entry which is an array.
/// </summary>
/// <param name="dictionaries">The dictionaries read from the font program.</param>
/// <returns>The font matrix array or <see langword="null"/> if no dictionary has one.</returns>
private static ArrayToken GetFontMatrix(IReadOnlyList<DictionaryToken> dictionaries)
{
    for (var i = 0; i < dictionaries.Count; i++)
    {
        var found = dictionaries[i].TryGetByName(CosName.FONT_MATRIX, out var token);

        if (found && token is ArrayToken matrix)
        {
            return matrix;
        }
    }

    return null;
}
/// <summary>
/// Builds a rectangle from the first font bounding box entry which is an array of exactly four tokens.
/// </summary>
/// <param name="dictionaries">The dictionaries read from the font program.</param>
/// <returns>The bounding box or <see langword="null"/> if no dictionary has a suitable entry.</returns>
private static PdfRectangle GetBoundingBox(IReadOnlyList<DictionaryToken> dictionaries)
{
    foreach (var candidate in dictionaries)
    {
        if (!candidate.TryGetByName(CosName.FONT_BBOX, out var token))
        {
            continue;
        }

        if (!(token is ArrayToken box) || box.Data.Count != 4)
        {
            continue;
        }

        // The four values are passed to the rectangle in the order they appear in the array.
        var values = box.Data;
        var first = (NumericToken)values[0];
        var second = (NumericToken)values[1];
        var third = (NumericToken)values[2];
        var fourth = (NumericToken)values[3];

        return new PdfRectangle(first.Data, second.Data, third.Data, fourth.Data);
    }

    return null;
}
/// <summary>
/// Remembers the three most recently added tokens; index 0 is the newest and index 2 the oldest.
/// </summary>
private class PreviousTokenSet
{
    private const int Capacity = 3;

    private readonly IToken[] tokens = new IToken[Capacity];

    /// <summary>
    /// Gets a remembered token, counting back from the most recently added one.
    /// </summary>
    public IToken this[int index]
    {
        get { return tokens[Capacity - 1 - index]; }
    }

    /// <summary>
    /// Adds a token as the newest entry, dropping the oldest.
    /// </summary>
    public void Add(IToken token)
    {
        // Shift everything one place towards the start to free the final slot.
        for (var i = 1; i < Capacity; i++)
        {
            tokens[i - 1] = tokens[i];
        }

        tokens[Capacity - 1] = token;
    }
}
}
}

View File

@@ -0,0 +1,32 @@
namespace UglyToad.PdfPig.Fonts.Type1
{
using System.Collections.Generic;
using Cos;
using Geometry;
using Tokenization.Tokens;
using Util.JetBrains.Annotations;
/// <summary>
/// Holds the values read from a Type 1 font file: its name, encoding, font matrix and bounding box.
/// </summary>
internal class Type1Font
{
    /// <summary>
    /// Creates a new <see cref="Type1Font"/> from the values read by the parser.
    /// </summary>
    /// <param name="name">The name of the font.</param>
    /// <param name="encoding">The map from character code to name.</param>
    /// <param name="fontMatrix">The font matrix array, may be <see langword="null"/>.</param>
    /// <param name="boundingBox">The font bounding box, may be <see langword="null"/>.</param>
    public Type1Font(string name, IReadOnlyDictionary<int, string> encoding, ArrayToken fontMatrix, PdfRectangle boundingBox)
    {
        this.Name = name;
        this.Encoding = encoding;
        this.FontMatrix = fontMatrix;
        this.BoundingBox = boundingBox;
    }

    /// <summary>
    /// The name of the font.
    /// </summary>
    public string Name { get; }

    /// <summary>
    /// The map from character code to name.
    /// </summary>
    public IReadOnlyDictionary<int, string> Encoding { get; }

    /// <summary>
    /// The font matrix array if one was provided.
    /// </summary>
    [CanBeNull]
    public ArrayToken FontMatrix { get; }

    /// <summary>
    /// The font bounding box if one was provided.
    /// </summary>
    [CanBeNull]
    public PdfRectangle BoundingBox { get; }
}
}

View File

@@ -13,6 +13,7 @@
using Fonts.Parser.Handlers;
using Fonts.Parser.Parts;
using Fonts.TrueType.Parser;
using Fonts.Type1.Parser;
using Graphics;
using IO;
using Logging;
@@ -90,7 +91,7 @@
filterProvider,
pdfObjectParser),
new TrueTypeFontHandler(pdfObjectParser, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser, encodingReader),
new Type1FontHandler(pdfObjectParser, cMapCache, filterProvider, fontDescriptorFactory, encodingReader, pdfScanner),
new Type1FontHandler(pdfObjectParser, cMapCache, filterProvider, fontDescriptorFactory, encodingReader, pdfScanner, new Type1FontParser()),
new Type3FontHandler(pdfObjectParser, cMapCache, filterProvider, encodingReader));
var dynamicParser = container.Get<DynamicParser>();

View File

@@ -11,6 +11,13 @@
public static readonly OperatorToken EndObject = new OperatorToken("endobj");
public static readonly OperatorToken StartStream = new OperatorToken("stream");
public static readonly OperatorToken EndStream = new OperatorToken("endstream");
public static readonly OperatorToken Eexec = new OperatorToken("eexec");
public static readonly OperatorToken Def = new OperatorToken("def");
public static readonly OperatorToken Dict = new OperatorToken("dict");
public static readonly OperatorToken Readonly = new OperatorToken("readonly");
public static readonly OperatorToken Dup = new OperatorToken("dup");
public static readonly OperatorToken For = new OperatorToken("for");
public static readonly OperatorToken Put = new OperatorToken("put");
public string Data { get; }
@@ -39,6 +46,20 @@
return StartStream;
case "endstream":
return EndStream;
case "eexec":
return Eexec;
case "def":
return Def;
case "dict":
return Dict;
case "readonly":
return Readonly;
case "dup":
return Dup;
case "for":
return For;
case "put":
return Put;
default:
return new OperatorToken(data);
}

View File

@@ -0,0 +1,76 @@
namespace UglyToad.PdfPig.Tokenization
{
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using IO;
using Tokens;
/// <summary>
/// Tokenizes the content between '{' and the first following '}' into a flat <see cref="ArrayToken"/>.
/// </summary>
/// <remarks>
/// Nested braces are not supported: reading stops at the first '}' encountered.
/// Parts are separated on whitespace, so a string containing whitespace is not kept together.
/// </remarks>
internal class Type1ArrayTokenizer : ITokenizer
{
    // The PostScript whitespace characters: space, tab, carriage return, line feed, form feed and null.
    private static readonly char[] Separators = { ' ', '\t', '\r', '\n', '\f', '\0' };

    // Numbers may be signed, have a fractional part and use an exponent.
    private const NumberStyles NumberFormat = NumberStyles.AllowLeadingSign
                                              | NumberStyles.AllowDecimalPoint
                                              | NumberStyles.AllowExponent;

    public bool ReadsNextByte { get; } = false;

    /// <summary>
    /// Reads from the input until '}' or the end of the input and converts each part to a token.
    /// </summary>
    /// <param name="currentByte">The byte which triggered this tokenizer, must be '{'.</param>
    /// <param name="inputBytes">The input positioned on <paramref name="currentByte"/>.</param>
    /// <param name="token">The resulting <see cref="ArrayToken"/>, or <see langword="null"/> if the current byte is not '{'.</param>
    /// <returns><see langword="true"/> if an array was read.</returns>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
    {
        token = null;

        if (currentByte != '{')
        {
            return false;
        }

        var builder = new StringBuilder();

        while (inputBytes.MoveNext())
        {
            if (inputBytes.CurrentByte == '}')
            {
                break;
            }

            builder.Append((char) inputBytes.CurrentByte);
        }

        // Split on every whitespace character, the content may contain tabs or span several lines.
        var parts = builder.ToString().Split(Separators, StringSplitOptions.RemoveEmptyEntries);

        var tokens = new List<IToken>();

        foreach (var part in parts)
        {
            var first = part[0];

            if (char.IsNumber(first) || first == '-' || first == '+' || first == '.')
            {
                // The font data is culture independent so always parse with the invariant culture
                // and accept a decimal point, otherwise values such as 0.001 are not numbers.
                if (decimal.TryParse(part, NumberFormat, CultureInfo.InvariantCulture, out var value))
                {
                    tokens.Add(new NumericToken(value));
                }
                else
                {
                    tokens.Add(OperatorToken.Create(part));
                }

                continue;
            }

            if (first == '/')
            {
                tokens.Add(new NameToken(part.Substring(1)));
                continue;
            }

            if (first == '(' && part[part.Length - 1] == ')')
            {
                // The parentheses are passed through as part of the string data.
                tokens.Add(new StringToken(part));
                continue;
            }

            tokens.Add(OperatorToken.Create(part));
        }

        token = new ArrayToken(tokens);

        return true;
    }
}
}

View File

@@ -0,0 +1,42 @@
namespace UglyToad.PdfPig.Tokenization
{
using System.Text;
using IO;
using Parser.Parts;
using Tokens;
/// <summary>
/// Tokenizes a name which starts with '/' and runs until whitespace or a PostScript delimiter.
/// </summary>
internal class Type1NameTokenizer : ITokenizer
{
    public bool ReadsNextByte { get; } = true;

    /// <summary>
    /// Reads the characters following '/' up to the next whitespace, delimiter or the end of the input.
    /// </summary>
    /// <param name="currentByte">The byte which triggered this tokenizer, must be '/'.</param>
    /// <param name="inputBytes">The input positioned on <paramref name="currentByte"/>.</param>
    /// <param name="token">The resulting <see cref="NameToken"/>, or <see langword="null"/> if the current byte is not '/'.</param>
    /// <returns><see langword="true"/> if a name was read.</returns>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
    {
        token = null;

        if (currentByte != '/')
        {
            return false;
        }

        var builder = new StringBuilder();

        while (inputBytes.MoveNext())
        {
            if (ReadHelper.IsWhitespace(inputBytes.CurrentByte) || IsDelimiter(inputBytes.CurrentByte))
            {
                break;
            }

            builder.Append((char)inputBytes.CurrentByte);
        }

        token = new NameToken(builder.ToString());

        return true;
    }

    /// <summary>
    /// Whether the byte is one of the PostScript special characters which end a name.
    /// Closing delimiters and the comment character are included so that a name directly
    /// followed by ']', '}', ')', '>' or '%' does not absorb that character.
    /// </summary>
    private static bool IsDelimiter(byte value)
    {
        switch (value)
        {
            case (byte)'(':
            case (byte)')':
            case (byte)'<':
            case (byte)'>':
            case (byte)'[':
            case (byte)']':
            case (byte)'{':
            case (byte)'}':
            case (byte)'/':
            case (byte)'%':
                return true;
            default:
                return false;
        }
    }
}
}