port type 1 lexer from pdf box and add test data

This commit is contained in:
Eliot Jones
2018-10-23 20:02:20 +01:00
parent c8c32eab24
commit df0b60c2e1
9 changed files with 528 additions and 6 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -3,10 +3,8 @@
using System;
using System.IO;
using System.Linq;
using System.Text;
using PdfPig.Fonts.Type1.Parser;
using PdfPig.IO;
using PdfPig.Util;
using Xunit;
public class Type1FontParserTests
@@ -30,6 +28,14 @@
parser.Parse(new ByteArrayInputBytes(bytes), 0, 0);
}
// Smoke test: parsing the CMBX10 PFA test font must complete without throwing (there are no assertions).
[Fact]
public void CanReadCharStrings()
{
    var input = new ByteArrayInputBytes(GetFileBytes("CMBX10.pfa"));

    parser.Parse(input, 0, 0);
}
[Fact]
public void CanReadAsciiPart()
{

View File

@@ -28,6 +28,17 @@
}
}
// The first letter on page 1 must report a non-zero glyph rectangle height.
[Fact]
public void LettersHaveHeight()
{
    using (var document = PdfDocument.Open(GetFilename()))
    {
        var firstLetter = document.GetPage(1).Letters[0];

        Assert.NotEqual(0, firstLetter.GlyphRectangle.Height);
    }
}
[Fact]
public void HasCorrectNumberOfPages()
{

View File

@@ -2,6 +2,7 @@
{
using System.Collections.Generic;
using System.Linq;
using IO;
using PdfPig.Parser.Parts;
using Tokenization.Tokens;
using Util;
@@ -11,7 +12,7 @@
private const ushort EexecEncryptionKey = 55665;
private const int EexecRandomBytes = 4;
public void Parse(IReadOnlyList<byte> bytes)
public IReadOnlyList<byte> Parse(IReadOnlyList<byte> bytes)
{
if (!IsBinary(bytes))
{
@@ -20,7 +21,23 @@
var decrypted = Decrypt(bytes, EexecEncryptionKey, EexecRandomBytes);
// line 461 of type1parser.java
var str = OtherEncodings.BytesAsLatin1String(decrypted.ToArray());
var tokenizer = new Type1Tokenizer(new ByteArrayInputBytes(decrypted));
while (tokenizer.CurrentToken != null)
{
tokenizer.GetNext();
}
/*
* After 4 random characters follows the /Private dictionary and the /CharString dictionary.
* The first defines a number of technical terms involving character construction, and contains also an array of subroutines used in character paths.
* The second contains the character descriptions themselves.
* Both the subroutines and the character descriptions are yet again encrypted in a fashion similar to the entire binary segment, but now with an initial value of R = 4330 instead of 55665.
*/
return decrypted;
}
/// <summary>
@@ -89,6 +106,16 @@
private static IReadOnlyList<byte> Decrypt(IReadOnlyList<byte> bytes, int key, int randomBytes)
{
/*
* We start with three constants R = 55665, c1 = 52845 and c2 = 22719.
* Then we apply to the entire binary array c[i] of length n the decryption procedure:
* for i in [0, n):
* p[i] = c[i]^(R >> 8)
* R = ((c[i] + R)*c1 + c2) & ((1 << 16) - 1)
*
* Here ^ means xor addition, in which one interprets the bits modulo 2.
* The encryption key R changes as the procedure is carried out.
*/
if (randomBytes == -1)
{
return bytes;

View File

@@ -5,6 +5,7 @@
using Exceptions;
using Geometry;
using IO;
using PdfPig.Parser.Parts;
using Tokenization;
using Tokenization.Scanner;
using Tokenization.Tokens;
@@ -14,7 +15,8 @@
private const string ClearToMark = "cleartomark";
private const int PfbFileIndicator = 0x80;
private const int EexecKey = 55665;
private readonly Type1EncryptedPortionParser encryptedPortionParser;
public Type1FontParser(Type1EncryptedPortionParser encryptedPortionParser)
@@ -146,7 +148,7 @@
var matrix = GetFontMatrix(dictionaries);
var boundingBox = GetBoundingBox(dictionaries);
encryptedPortionParser.Parse(eexecPortion);
var binaryPortion = encryptedPortionParser.Parse(eexecPortion);
return new Type1Font(name, encoding, matrix, boundingBox ?? new PdfRectangle());
}
@@ -349,7 +351,6 @@
return new ArrayToken(result);
}
private static Dictionary<int, string> GetEncoding(IReadOnlyList<DictionaryToken> dictionaries)
{
var result = new Dictionary<int, string>();

View File

@@ -0,0 +1,90 @@
namespace UglyToad.PdfPig.Fonts.Type1.Parser
{
using System;
using System.Collections.Generic;
/// <summary>
/// A <see cref="Type1Token"/> whose payload is raw bytes rather than text.
/// The only token type accepted is <see cref="Type1Token.TokenType.Charstring"/>.
/// </summary>
internal class Type1DataToken : Type1Token
{
    /// <summary>
    /// The bytes carried by this token, exactly as supplied to the constructor.
    /// </summary>
    public IReadOnlyList<byte> Data { get; }

    /// <summary>
    /// Creates a data token.
    /// </summary>
    /// <param name="type">Must be <see cref="Type1Token.TokenType.Charstring"/>.</param>
    /// <param name="data">The bytes for the token.</param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="type"/> is anything other than Charstring.</exception>
    public Type1DataToken(TokenType type, IReadOnlyList<byte> data) : base(type)
    {
        var isCharstring = type == TokenType.Charstring;

        if (!isCharstring)
        {
            throw new ArgumentException($"Invalid token type for type 1 token receiving bytes, expected Charstring, got {type}.");
        }

        Data = data;
    }

    /// <summary>
    /// Describes the token by its type and the number of bytes it holds.
    /// </summary>
    public override string ToString() => $"Token[type = {Type}, data = {Data.Count} bytes]";
}
/// <summary>
/// A <see cref="Type1Token"/> whose value is held as text, with helpers to interpret that text
/// as a number or a boolean.
/// </summary>
internal class Type1TextToken : Type1Token
{
    /// <summary>
    /// The raw text of the token.
    /// </summary>
    public string Text { get; }

    /// <summary>
    /// Creates a token whose text is the single character <paramref name="c"/>.
    /// </summary>
    public Type1TextToken(char c, TokenType type) : this(c.ToString(), type) { }

    /// <summary>
    /// Creates a token with the given text and type.
    /// </summary>
    public Type1TextToken(string text, TokenType type) : base(type)
    {
        Text = text;
    }

    /// <summary>
    /// Interprets <see cref="Text"/> as an integer.
    /// </summary>
    /// <returns>
    /// The exact value when the text is a plain integer; otherwise the value of <see cref="AsFloat"/>
    /// truncated towards zero.
    /// </returns>
    /// <exception cref="FormatException">Thrown when the text is not numeric.</exception>
    public int AsInt()
    {
        // Parse integers directly: a float only has 24 bits of mantissa, so routing every value
        // through AsFloat() silently corrupts integers above 16,777,216.
        if (int.TryParse(
            Text,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out var exact))
        {
            return exact;
        }

        return (int)AsFloat();
    }

    /// <summary>
    /// Interprets <see cref="Text"/> as a floating point number.
    /// </summary>
    /// <exception cref="FormatException">Thrown when the text is not numeric.</exception>
    public float AsFloat()
    {
        // Font programs always use '.' as the decimal separator, so parsing must not depend on the
        // culture of the machine running the code.
        return float.Parse(Text, System.Globalization.CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Returns true when <see cref="Text"/> is "true", ignoring case; false for any other text.
    /// </summary>
    public bool AsBool()
    {
        return string.Equals(Text, "true", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Describes the token by its type and text.
    /// </summary>
    public override string ToString()
    {
        return $"Token[type={Type}, text={Text}]";
    }
}
/// <summary>
/// Base type for tokens read from a Type 1 font program. On its own it records only the kind of token.
/// </summary>
internal class Type1Token
{
    /// <summary>
    /// The kind of token this instance represents.
    /// </summary>
    public TokenType Type { get; }

    /// <summary>
    /// Creates a token of the given kind.
    /// </summary>
    public Type1Token(TokenType type) => Type = type;

    /// <summary>
    /// The kinds of token which can be produced.
    /// </summary>
    public enum TokenType
    {
        None,
        String,
        Name,
        Literal,
        Real,
        Integer,
        /// <summary>
        /// Start of an array. Type1Tokenizer emits this for '['; it reports '{' as <see cref="StartProc"/>.
        /// </summary>
        StartArray,
        /// <summary>
        /// End of an array. Type1Tokenizer emits this for ']'; it reports '}' as <see cref="EndProc"/>.
        /// </summary>
        EndArray,
        StartProc,
        EndProc,
        StartDict,
        EndDict,
        Charstring
    }
}
}

View File

@@ -0,0 +1,387 @@
namespace UglyToad.PdfPig.Fonts.Type1.Parser
{
using System;
using System.Text;
using IO;
using PdfPig.Parser.Parts;
internal class Type1Tokenizer
{
// Scratch buffers; each is cleared and reused by ReadComment, ReadLiteral and ReadString respectively.
private readonly StringBuilder commentBuffer = new StringBuilder();
private readonly StringBuilder literalBuffer = new StringBuilder();
private readonly StringBuilder stringBuffer = new StringBuilder();
// The input being tokenized.
private readonly IInputBytes bytes;
// Number of currently unmatched '(' seen by ReadString; it is never reset between strings.
private int openParens;
// The value CurrentToken held before the most recent read. ReadNextToken consults it when it meets
// "RD" or "-|" to obtain the length of the charstring which follows.
private Type1Token previousToken;
/// <summary>
/// The most recently read token, or null when no further token could be read.
/// </summary>
public Type1Token CurrentToken { get; private set; }
/// <summary>
/// Creates a tokenizer over <paramref name="bytes"/> and immediately reads the first token
/// into <see cref="CurrentToken"/>.
/// </summary>
public Type1Tokenizer(IInputBytes bytes)
{
    this.bytes = bytes;

    var first = ReadNextToken();
    CurrentToken = first;
}
/// <summary>
/// Reads the next token, stores it in <see cref="CurrentToken"/> and returns it.
/// </summary>
/// <returns>The token which was read, or null when no further token could be read.</returns>
public Type1Token GetNext()
{
    var next = ReadNextToken();
    CurrentToken = next;

    return next;
}
/// <summary>
/// Scans forward from the current position and returns the next token.
/// Comments, whitespace and zero bytes are skipped.
/// </summary>
/// <returns>The next token, or null when the input ends before another token is found.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown for a ')' outside a string, for a name which cannot be read, or when "RD" / "-|" is not
/// directly preceded by an integer token giving the charstring length.
/// </exception>
private Type1Token ReadNextToken()
{
    // Remember the last token handed out: "RD" and "-|" need it to know how many bytes to read.
    previousToken = CurrentToken;

    while (bytes.MoveNext())
    {
        var b = bytes.CurrentByte;
        var c = (char)b;

        switch (c)
        {
            case '%':
                // Comments produce no token; discard the text and keep scanning.
                ReadComment();
                break;
            case '(':
                return ReadString();
            case ')':
                throw new InvalidOperationException("Encountered an end of string ')' outside of string.");
            case '[':
                return new Type1TextToken(c, Type1Token.TokenType.StartArray);
            case ']':
                return new Type1TextToken(c, Type1Token.TokenType.EndArray);
            case '{':
                return new Type1TextToken(c, Type1Token.TokenType.StartProc);
            case '}':
                return new Type1TextToken(c, Type1Token.TokenType.EndProc);
            case '/':
                {
                    var name = ReadLiteral();
                    return new Type1TextToken(name, Type1Token.TokenType.Literal);
                }
            case '<':
                {
                    var following = bytes.Peek();
                    if (following == '<')
                    {
                        bytes.MoveNext();
                        return new Type1TextToken("<<", Type1Token.TokenType.StartDict);
                    }

                    // A lone '<' is reported as a name.
                    return new Type1TextToken(c, Type1Token.TokenType.Name);
                }
            case '>':
                {
                    var following = bytes.Peek();
                    if (following == '>')
                    {
                        bytes.MoveNext();
                        return new Type1TextToken(">>", Type1Token.TokenType.EndDict);
                    }

                    // A lone '>' is reported as a name.
                    return new Type1TextToken(c, Type1Token.TokenType.Name);
                }
            default:
                {
                    if (ReadHelper.IsWhitespace(b) || b == 0)
                    {
                        // Separator between tokens: leave the switch and move on to the next byte.
                        break;
                    }

                    if (TryReadNumber(out var number))
                    {
                        return number;
                    }

                    var name = ReadLiteral(c);

                    if (name == null)
                    {
                        throw new InvalidOperationException($"The binary portion of the type 1 font was invalid at position {bytes.CurrentOffset}.");
                    }

                    if (name.Equals("RD") || name.Equals("-|"))
                    {
                        // The integer directly before RD / -| is the byte length of the charstring.
                        // The pattern match also covers "no previous token", which used to surface
                        // as a NullReferenceException rather than the intended error below.
                        if (previousToken is Type1TextToken lengthToken
                            && lengthToken.Type == Type1Token.TokenType.Integer)
                        {
                            return ReadCharString(lengthToken.AsInt());
                        }

                        throw new InvalidOperationException($"Expected integer token before {name} at offset {bytes.CurrentOffset}.");
                    }

                    return new Type1TextToken(name, Type1Token.TokenType.Name);
                }
        }
    }

    // Input exhausted without finding another token.
    return null;
}
/// <summary>
/// Reads a parenthesised string; the opening '(' has already been consumed by the caller.
/// Nested parentheses are kept in the text, backslash escapes are decoded and every '\r' or '\n'
/// is stored as '\n'.
/// </summary>
/// <returns>A String token, or null when the input ends before the closing ')'.</returns>
private Type1TextToken ReadString()
{
    // Advances one byte (ignoring whether the move succeeded) and returns the current byte as a char.
    char ReadChar()
    {
        bytes.MoveNext();
        return (char)bytes.CurrentByte;
    }

    stringBuffer.Clear();

    while (bytes.MoveNext())
    {
        var current = (char)bytes.CurrentByte;

        if (current == '(')
        {
            openParens++;
            stringBuffer.Append('(');
            continue;
        }

        if (current == ')')
        {
            if (openParens == 0)
            {
                // Unnested ')' terminates the string.
                return new Type1TextToken(stringBuffer.ToString(), Type1Token.TokenType.String);
            }

            stringBuffer.Append(')');
            openParens--;
            continue;
        }

        if (current == '\r' || current == '\n')
        {
            stringBuffer.Append('\n');
            continue;
        }

        if (current != '\\')
        {
            stringBuffer.Append(current);
            continue;
        }

        // Escape sequences: \n \r \t \b \f \\ \( \) and octal \ddd. Anything else is dropped.
        var escaped = ReadChar();

        if (escaped == 'n' || escaped == 'r')
        {
            stringBuffer.Append('\n');
        }
        else if (escaped == 't')
        {
            stringBuffer.Append('\t');
        }
        else if (escaped == 'b')
        {
            stringBuffer.Append('\b');
        }
        else if (escaped == 'f')
        {
            stringBuffer.Append('\f');
        }
        else if (escaped == '\\' || escaped == '(' || escaped == ')')
        {
            stringBuffer.Append(escaped);
        }
        else if (char.IsDigit(escaped))
        {
            // Octal escapes always consume exactly three characters.
            var second = ReadChar();
            var third = ReadChar();
            var octal = new string(new[] { escaped, second, third });

            stringBuffer.Append((char)Convert.ToInt32(octal, 8));
        }
    }

    return null;
}
/// <summary>
/// Attempts to read an integer, a real, or a PostScript radix number (base#number) starting at the
/// current byte. The caller has already advanced onto the first character of the candidate, so
/// scanning starts from <c>bytes.CurrentByte</c>.
/// </summary>
/// <param name="numberToken">
/// On success an Integer token (plain integers and radix numbers, the latter converted to decimal text)
/// or a Real token; null on failure.
/// </param>
/// <returns>
/// True when a number was read, leaving the input positioned on its last character. False when the
/// text is not a number, in which case the input is restored to where it was on entry.
/// </returns>
private bool TryReadNumber(out Type1TextToken numberToken)
{
    numberToken = null;

    // Position on entry, i.e. just after the first character of the candidate number.
    var startPosition = bytes.CurrentOffset;

    // Set once the input runs out, so that we neither spin on a stale CurrentByte nor
    // step back over a terminator which was never consumed.
    var reachedEnd = false;

    char GetNext()
    {
        if (!bytes.MoveNext())
        {
            reachedEnd = true;
            return '\0';
        }

        return (char)bytes.CurrentByte;
    }

    bool Fail()
    {
        bytes.Seek(startPosition);
        return false;
    }

    // The character which ended the number belongs to the next token, so step back over it.
    void UnreadTerminator()
    {
        if (!reachedEnd)
        {
            bytes.Seek(bytes.CurrentOffset - 1);
        }
    }

    var sb = new StringBuilder();
    StringBuilder radix = null;
    var hasDigit = false;

    // Start with the character the caller stopped on; fetching a fresh one here would silently
    // drop the first digit (or sign) of every number.
    var c = (char)bytes.CurrentByte;

    // optional + or -
    if (c == '+' || c == '-')
    {
        sb.Append(c);
        c = GetNext();
    }

    // optional digits
    while (char.IsDigit(c))
    {
        sb.Append(c);
        c = GetNext();
        hasDigit = true;
    }

    if (c == '.')
    {
        // optional .
        sb.Append(c);
        c = GetNext();
    }
    else if (c == '#' && hasDigit)
    {
        // PostScript radix number takes the form base#number, the base must have at least one digit.
        radix = sb;
        sb = new StringBuilder();
        c = GetNext();
    }
    else if (sb.Length == 0 || !hasDigit)
    {
        // Not a number.
        return Fail();
    }
    else
    {
        // Plain integer.
        UnreadTerminator();
        numberToken = new Type1TextToken(sb.ToString(), Type1Token.TokenType.Integer);
        return true;
    }

    // required digit
    if (!char.IsDigit(c))
    {
        return Fail();
    }

    sb.Append(c);
    c = GetNext();

    // optional digits
    while (char.IsDigit(c))
    {
        sb.Append(c);
        c = GetNext();
    }

    // optional E
    if (c == 'E')
    {
        sb.Append(c);
        c = GetNext();

        // optional minus
        if (c == '-')
        {
            sb.Append(c);
            c = GetNext();
        }

        // required digit
        if (!char.IsDigit(c))
        {
            return Fail();
        }

        sb.Append(c);
        c = GetNext();

        // optional digits
        while (char.IsDigit(c))
        {
            sb.Append(c);
            c = GetNext();
        }
    }

    UnreadTerminator();

    if (radix != null)
    {
        // NOTE: Convert.ToInt32 only supports bases 2, 8, 10 and 16 and throws for any other base.
        var number = Convert.ToInt32(sb.ToString(), int.Parse(radix.ToString()));
        numberToken = new Type1TextToken(number.ToString(), Type1Token.TokenType.Integer);
    }
    else
    {
        numberToken = new Type1TextToken(sb.ToString(), Type1Token.TokenType.Real);
    }

    return true;
}
/// <summary>
/// Reads a run of regular characters, stopping (without consuming) at whitespace, at any of
/// ( ) &lt; &gt; [ ] { } / % or at the end of the input.
/// </summary>
/// <param name="previousCharacter">An already consumed character to place at the start of the result, if any.</param>
/// <returns>The characters read, or null when there were none.</returns>
private string ReadLiteral(char? previousCharacter = null)
{
    const string Delimiters = "()<>[]{}/%";

    literalBuffer.Clear();

    if (previousCharacter != null)
    {
        literalBuffer.Append(previousCharacter.Value);
    }

    while (true)
    {
        // Look before consuming so the delimiter is left for the next token.
        var peeked = bytes.Peek();

        if (peeked == null)
        {
            break;
        }

        var next = (char)peeked.Value;

        if (char.IsWhiteSpace(next) || Delimiters.IndexOf(next) >= 0)
        {
            break;
        }

        literalBuffer.Append(next);

        if (!bytes.MoveNext())
        {
            break;
        }
    }

    return literalBuffer.Length == 0 ? null : literalBuffer.ToString();
}
/// <summary>
/// Reads the text of a comment; the leading '%' has already been consumed by the caller.
/// Reading stops after the first end-of-line character, or at the end of the input.
/// </summary>
/// <returns>The comment text without the terminating end-of-line character.</returns>
private string ReadComment()
{
    commentBuffer.Clear();

    while (bytes.MoveNext())
    {
        var c = (char)bytes.CurrentByte;

        if (ReadHelper.IsEndOfLine(c))
        {
            // A comment runs only to the end of its line. Carrying on past it (as 'continue' did)
            // swallowed every remaining byte of the input into the comment.
            break;
        }

        commentBuffer.Append(c);
    }

    return commentBuffer.ToString();
}
/// <summary>
/// Reads the binary charstring data which follows an "RD" or "-|" operator.
/// </summary>
/// <param name="length">The number of data bytes to read.</param>
/// <returns>A Charstring token holding exactly <paramref name="length"/> bytes.</returns>
/// <exception cref="InvalidOperationException">Thrown when the input ends before all bytes were read.</exception>
private Type1DataToken ReadCharString(int length)
{
    // Skip the single separator byte between the operator and the data.
    bytes.MoveNext();

    var data = new byte[length];

    for (var i = 0; i < length; i++)
    {
        // Without this check a truncated input silently fills the rest of the charstring with
        // copies of the last byte, because CurrentByte stops changing once MoveNext fails.
        if (!bytes.MoveNext())
        {
            throw new InvalidOperationException($"Unexpected end of input while reading charstring: expected {length} bytes but only {i} were available.");
        }

        data[i] = bytes.CurrentByte;
    }

    return new Type1DataToken(Type1Token.TokenType.Charstring, data);
}
}
}