diff --git a/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMBX10.pfa b/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMBX10.pfa new file mode 100644 index 00000000..db940c29 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMBX10.pfa differ diff --git a/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMBX12.pfa b/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMBX12.pfa new file mode 100644 index 00000000..f1299b51 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMBX12.pfa differ diff --git a/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMCSC10.pfa b/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMCSC10.pfa new file mode 100644 index 00000000..6001f74f Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Fonts/Type1/CMCSC10.pfa differ diff --git a/src/UglyToad.PdfPig.Tests/Fonts/Type1/Type1FontParserTests.cs b/src/UglyToad.PdfPig.Tests/Fonts/Type1/Type1FontParserTests.cs index 88a86698..d2ac5622 100644 --- a/src/UglyToad.PdfPig.Tests/Fonts/Type1/Type1FontParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Fonts/Type1/Type1FontParserTests.cs @@ -3,10 +3,8 @@ using System; using System.IO; using System.Linq; - using System.Text; using PdfPig.Fonts.Type1.Parser; using PdfPig.IO; - using PdfPig.Util; using Xunit; public class Type1FontParserTests @@ -30,6 +28,14 @@ parser.Parse(new ByteArrayInputBytes(bytes), 0, 0); } + [Fact] + public void CanReadCharStrings() + { + var bytes = GetFileBytes("CMBX10.pfa"); + + parser.Parse(new ByteArrayInputBytes(bytes), 0, 0); + } + [Fact] public void CanReadAsciiPart() { diff --git a/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs b/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs index 8e913674..0d6e7a7f 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs @@ -28,6 +28,17 @@ } } + [Fact] + public void LettersHaveHeight() + { + using (var document = PdfDocument.Open(GetFilename())) + { + var page = document.GetPage(1); + + Assert.NotEqual(0, 
page.Letters[0].GlyphRectangle.Height); + } + } + [Fact] public void HasCorrectNumberOfPages() { diff --git a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1EncryptedPortionParser.cs b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1EncryptedPortionParser.cs index 46aca41f..f04b7a68 100644 --- a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1EncryptedPortionParser.cs +++ b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1EncryptedPortionParser.cs @@ -2,6 +2,7 @@ { using System.Collections.Generic; using System.Linq; + using IO; using PdfPig.Parser.Parts; using Tokenization.Tokens; using Util; @@ -11,7 +12,7 @@ private const ushort EexecEncryptionKey = 55665; private const int EexecRandomBytes = 4; - public void Parse(IReadOnlyList bytes) + public IReadOnlyList Parse(IReadOnlyList bytes) { if (!IsBinary(bytes)) { @@ -20,7 +21,23 @@ var decrypted = Decrypt(bytes, EexecEncryptionKey, EexecRandomBytes); + // line 461 of type1parser.java var str = OtherEncodings.BytesAsLatin1String(decrypted.ToArray()); + + var tokenizer = new Type1Tokenizer(new ByteArrayInputBytes(decrypted)); + while (tokenizer.CurrentToken != null) + { + tokenizer.GetNext(); + } + + /* + * After 4 random characters follows the /Private dictionary and the /CharString dictionary. + * The first defines a number of technical terms involving character construction, and contains also an array of subroutines used in character paths. + * The second contains the character descriptions themselves. + * Both the subroutines and the character descriptions are yet again encrypted in a fashion similar to the entire binary segment, but now with an initial value of R = 4330 instead of 55665. + */ + + return decrypted; } /// @@ -89,6 +106,16 @@ private static IReadOnlyList Decrypt(IReadOnlyList bytes, int key, int randomBytes) { + /* + * We start with three constants R = 55665, c1 = 52845 and c2 = 22719. 
+ * Then we apply to the entire binary array c[i] of length n the decryption procedure: + * for i in [0, n): + * p[i] = c[i]^(R >> 8) + * R = ((c[i] + R)*c1 + c2) & ((1 << 16) - 1) + * + * Here ^ means xor addition, in which one interprets the bits modulo 2. + * The encryption key R changes as the procedure is carried out. + */ if (randomBytes == -1) { return bytes; diff --git a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs index 6edc69af..8fa7a520 100644 --- a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs +++ b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs @@ -5,6 +5,7 @@ using Exceptions; using Geometry; using IO; + using PdfPig.Parser.Parts; using Tokenization; using Tokenization.Scanner; using Tokenization.Tokens; @@ -14,7 +15,8 @@ private const string ClearToMark = "cleartomark"; private const int PfbFileIndicator = 0x80; - + private const int EexecKey = 55665; + private readonly Type1EncryptedPortionParser encryptedPortionParser; public Type1FontParser(Type1EncryptedPortionParser encryptedPortionParser) @@ -146,7 +148,7 @@ var matrix = GetFontMatrix(dictionaries); var boundingBox = GetBoundingBox(dictionaries); - encryptedPortionParser.Parse(eexecPortion); + var binaryPortion = encryptedPortionParser.Parse(eexecPortion); return new Type1Font(name, encoding, matrix, boundingBox ?? 
new PdfRectangle());
         }
@@ -349,7 +351,6 @@
             return new ArrayToken(result);
         }
 
-
         private static Dictionary GetEncoding(IReadOnlyList dictionaries)
         {
             var result = new Dictionary();
diff --git a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1Token.cs b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1Token.cs
new file mode 100644
index 00000000..1339bd05
--- /dev/null
+++ b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1Token.cs
@@ -0,0 +1,130 @@
+namespace UglyToad.PdfPig.Fonts.Type1.Parser
+{
+    using System;
+    using System.Collections.Generic;
+    using System.Globalization;
+
+    /// <summary>
+    /// A token which carries raw bytes rather than text; only <see cref="Type1Token.TokenType.Charstring"/> is accepted.
+    /// </summary>
+    internal class Type1DataToken : Type1Token
+    {
+        /// <summary>
+        /// The raw bytes of the token.
+        /// </summary>
+        public IReadOnlyList<byte> Data { get; }
+
+        /// <summary>
+        /// Creates a data token for <paramref name="data"/>.
+        /// </summary>
+        /// <exception cref="ArgumentException"><paramref name="type"/> is not <see cref="Type1Token.TokenType.Charstring"/>.</exception>
+        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
+        public Type1DataToken(TokenType type, IReadOnlyList<byte> data) : base(type)
+        {
+            if (type != TokenType.Charstring)
+            {
+                throw new ArgumentException($"Invalid token type for type 1 token receiving bytes, expected Charstring, got {type}.");
+            }
+
+            Data = data ?? throw new ArgumentNullException(nameof(data));
+        }
+
+        public override string ToString()
+        {
+            return $"Token[type = {Type}, data = {Data.Count} bytes]";
+        }
+    }
+
+    /// <summary>
+    /// A token which carries text: names, literals, strings, numbers and delimiters.
+    /// </summary>
+    internal class Type1TextToken : Type1Token
+    {
+        /// <summary>
+        /// The text of the token.
+        /// </summary>
+        public string Text { get; }
+
+        public Type1TextToken(char c, TokenType type) : this(c.ToString(), type) { }
+        public Type1TextToken(string text, TokenType type) : base(type)
+        {
+            Text = text;
+        }
+
+        /// <summary>
+        /// Converts <see cref="Text"/> to an integer, truncating any fractional part.
+        /// </summary>
+        public int AsInt()
+        {
+            // Parse integers directly: a round trip through float loses precision above 2^24.
+            if (int.TryParse(Text, NumberStyles.Integer, CultureInfo.InvariantCulture, out var result))
+            {
+                return result;
+            }
+
+            // Otherwise treat the text as a real number and truncate it.
+            return (int)AsFloat();
+        }
+
+        /// <summary>
+        /// Converts <see cref="Text"/> to a float.
+        /// </summary>
+        public float AsFloat()
+        {
+            // Number tokens always use '.' as the decimal separator, so parse independently of the current culture.
+            return float.Parse(Text, CultureInfo.InvariantCulture);
+        }
+
+        /// <summary>
+        /// Returns true when <see cref="Text"/> is "true", ignoring case.
+        /// </summary>
+        public bool AsBool()
+        {
+            return string.Equals(Text, "true", StringComparison.OrdinalIgnoreCase);
+        }
+
+        public override string ToString()
+        {
+            return $"Token[type={Type}, text={Text}]";
+        }
+    }
+
+    /// <summary>
+    /// Base class for tokens read from a Type 1 font program; records the kind of token.
+    /// </summary>
+    internal class Type1Token
+    {
+        /// <summary>
+        /// The kind of this token.
+        /// </summary>
+        public TokenType Type { get; }
+
+        public Type1Token(TokenType type)
+        {
+            Type = type;
+        }
+
+        public enum TokenType
+        {
+            None,
+            String,
+            Name,
+            Literal,
+            Real,
+            Integer,
+            ///
+            /// An array must begin with either '[' or '{'.
+            ///
+            StartArray,
+            ///
+            /// An array must end with either ']' or '}'.
+            ///
+            EndArray,
+            StartProc,
+            EndProc,
+            StartDict,
+            EndDict,
+            Charstring
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1Tokenizer.cs b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1Tokenizer.cs
new file mode 100644
index 00000000..7888adc0
--- /dev/null
+++ b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1Tokenizer.cs
@@ -0,0 +1,452 @@
+namespace UglyToad.PdfPig.Fonts.Type1.Parser
+{
+    using System;
+    using System.Globalization;
+    using System.Text;
+    using IO;
+    using PdfPig.Parser.Parts;
+
+    /// <summary>
+    /// Splits the decrypted portion of a Type 1 font into <see cref="Type1Token"/>s.
+    /// </summary>
+    internal class Type1Tokenizer
+    {
+        private readonly StringBuilder commentBuffer = new StringBuilder();
+        private readonly StringBuilder literalBuffer = new StringBuilder();
+        private readonly StringBuilder stringBuffer = new StringBuilder();
+
+        private readonly IInputBytes bytes;
+
+        private Type1Token previousToken;
+
+        /// <summary>
+        /// The most recently read token, null once no further token could be read.
+        /// </summary>
+        public Type1Token CurrentToken { get; private set; }
+
+        /// <summary>
+        /// Creates a tokenizer over <paramref name="bytes"/> and reads the first token.
+        /// </summary>
+        public Type1Tokenizer(IInputBytes bytes)
+        {
+            this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
+            CurrentToken = ReadNextToken();
+        }
+
+        /// <summary>
+        /// Reads the next token, stores it in <see cref="CurrentToken"/> and returns it.
+        /// </summary>
+        public Type1Token GetNext()
+        {
+            CurrentToken = ReadNextToken();
+            return CurrentToken;
+        }
+
+        /// <summary>
+        /// Reads one token starting at the next byte, skipping comments, whitespace and NUL bytes.
+        /// </summary>
+        /// <returns>The token, or null when no further token could be read.</returns>
+        private Type1Token ReadNextToken()
+        {
+            previousToken = CurrentToken;
+
+            while (bytes.MoveNext())
+            {
+                var b = bytes.CurrentByte;
+                var c = (char)b;
+
+                switch (c)
+                {
+                    case '%':
+                        // Comments produce no token: discard the text and resume on the next line.
+                        ReadComment();
+                        break;
+                    case '(':
+                        return ReadString();
+                    case ')':
+                        throw new InvalidOperationException("Encountered an end of string ')' outside of string.");
+                    case '[':
+                        return new Type1TextToken(c, Type1Token.TokenType.StartArray);
+                    case ']':
+                        return new Type1TextToken(c, Type1Token.TokenType.EndArray);
+                    case '{':
+                        return new Type1TextToken(c, Type1Token.TokenType.StartProc);
+                    case '}':
+                        return new Type1TextToken(c, Type1Token.TokenType.EndProc);
+                    case '/':
+                        {
+                            var name = ReadLiteral();
+                            return new Type1TextToken(name, Type1Token.TokenType.Literal);
+                        }
+                    case '<':
+                        {
+                            var following = bytes.Peek();
+                            if (following == '<')
+                            {
+                                bytes.MoveNext();
+                                return new Type1TextToken("<<", Type1Token.TokenType.StartDict);
+                            }
+
+                            return new Type1TextToken(c, Type1Token.TokenType.Name);
+                        }
+                    case '>':
+                        {
+                            var following = bytes.Peek();
+                            if (following == '>')
+                            {
+                                bytes.MoveNext();
+                                return new Type1TextToken(">>", Type1Token.TokenType.EndDict);
+                            }
+
+                            return new Type1TextToken(c, Type1Token.TokenType.Name);
+                        }
+                    default:
+                        {
+                            // Whitespace and NUL bytes only separate tokens.
+                            if (ReadHelper.IsWhitespace(b) || b == 0)
+                            {
+                                break;
+                            }
+
+                            if (TryReadNumber(out var number))
+                            {
+                                return number;
+                            }
+
+                            var name = ReadLiteral(c);
+                            if (name == null)
+                            {
+                                throw new InvalidOperationException($"The binary portion of the type 1 font was invalid at position {bytes.CurrentOffset}.");
+                            }
+
+                            if (name.Equals("RD") || name.Equals("-|"))
+                            {
+                                // previousToken is null when the operator is the very first token read.
+                                if (previousToken is Type1TextToken lengthToken
+                                    && lengthToken.Type == Type1Token.TokenType.Integer)
+                                {
+                                    return ReadCharString(lengthToken.AsInt());
+                                }
+
+                                throw new InvalidOperationException($"Expected integer token before {name} at offset {bytes.CurrentOffset}.");
+                            }
+
+                            return new Type1TextToken(name, Type1Token.TokenType.Name);
+                        }
+                }
+            }
+
+            return null;
+        }
+
+        /// <summary>
+        /// Reads a parenthesised string; the opening '(' has already been consumed.
+        /// </summary>
+        /// <returns>The string token, or null when the input ends before the closing ')'.</returns>
+        private Type1TextToken ReadString()
+        {
+            char GetNext()
+            {
+                bytes.MoveNext();
+                return (char)bytes.CurrentByte;
+            }
+
+            stringBuffer.Clear();
+
+            // Depth of nested, unescaped parentheses; local so a truncated string cannot leak state into the next one.
+            var openParens = 0;
+
+            while (bytes.MoveNext())
+            {
+                var c = (char)bytes.CurrentByte;
+
+                // string context
+                switch (c)
+                {
+                    case '(':
+                        openParens++;
+                        stringBuffer.Append('(');
+                        break;
+                    case ')':
+                        if (openParens == 0)
+                        {
+                            // end of string
+                            return new Type1TextToken(stringBuffer.ToString(), Type1Token.TokenType.String);
+                        }
+                        stringBuffer.Append(')');
+                        openParens--;
+                        break;
+                    case '\\':
+                        // escapes: \n \r \t \b \f \\ \( \)
+                        char c1 = GetNext();
+                        switch (c1)
+                        {
+                            case 'n':
+                            case 'r': stringBuffer.Append("\n"); break;
+                            case 't': stringBuffer.Append('\t'); break;
+                            case 'b': stringBuffer.Append('\b'); break;
+                            case 'f': stringBuffer.Append('\f'); break;
+                            case '\\': stringBuffer.Append('\\'); break;
+                            case '(': stringBuffer.Append('('); break;
+                            case ')': stringBuffer.Append(')'); break;
+                        }
+                        // octal \ddd
+                        if (char.IsDigit(c1))
+                        {
+                            var rawOctal = new string(new[] { c1, GetNext(), GetNext() });
+                            var code = Convert.ToInt32(rawOctal, 8);
+                            stringBuffer.Append((char)code);
+                        }
+                        break;
+                    case '\r':
+                    case '\n':
+                        stringBuffer.Append("\n");
+                        break;
+                    default:
+                        stringBuffer.Append(c);
+                        break;
+                }
+            }
+
+            return null;
+        }
+
+        /// <summary>
+        /// Tries to read an integer, real or radix (base#number) number which starts at the current byte.
+        /// </summary>
+        /// <param name="numberToken">The number read, or null when the text is not a number.</param>
+        /// <returns>True when a number was read; otherwise the read position is restored and false is returned.</returns>
+        private bool TryReadNumber(out Type1TextToken numberToken)
+        {
+            var reachedEnd = false;
+
+            // Returns NUL at the end of the input so that the digit loops below always terminate.
+            char GetNext()
+            {
+                if (!bytes.MoveNext())
+                {
+                    reachedEnd = true;
+                    return '\0';
+                }
+
+                return (char)bytes.CurrentByte;
+            }
+
+            // Gives back the byte which terminated the number; nothing was consumed if the input ended.
+            void Unread()
+            {
+                if (!reachedEnd)
+                {
+                    bytes.Seek(bytes.CurrentOffset - 1);
+                }
+            }
+
+            numberToken = null;
+
+            var currentPosition = bytes.CurrentOffset;
+
+            var sb = new StringBuilder();
+            StringBuilder radix = null;
+
+            // The caller has already consumed the first character of the candidate number, start from it.
+            var c = (char)bytes.CurrentByte;
+            var hasDigit = false;
+
+            // optional + or -
+            if (c == '+' || c == '-')
+            {
+                sb.Append(c);
+                c = GetNext();
+            }
+
+            // optional digits
+            while (char.IsDigit(c))
+            {
+                sb.Append(c);
+                c = GetNext();
+                hasDigit = true;
+            }
+
+            // optional .
+            if (c == '.')
+            {
+                sb.Append(c);
+                c = GetNext();
+            }
+            else if (c == '#' && hasDigit)
+            {
+                // PostScript radix number takes the form base#number
+                radix = sb;
+                sb = new StringBuilder();
+                c = GetNext();
+            }
+            else if (!hasDigit)
+            {
+                // failure
+                bytes.Seek(currentPosition);
+                return false;
+            }
+            else
+            {
+                // integer
+                Unread();
+
+                numberToken = new Type1TextToken(sb.ToString(), Type1Token.TokenType.Integer);
+                return true;
+            }
+
+            // required digit
+            if (char.IsDigit(c))
+            {
+                sb.Append(c);
+                c = GetNext();
+            }
+            else
+            {
+                bytes.Seek(currentPosition);
+                return false;
+            }
+
+            // optional digits
+            while (char.IsDigit(c))
+            {
+                sb.Append(c);
+                c = GetNext();
+            }
+
+            // optional E
+            if (c == 'E')
+            {
+                sb.Append(c);
+                c = GetNext();
+
+                // optional minus
+                if (c == '-')
+                {
+                    sb.Append(c);
+                    c = GetNext();
+                }
+
+                // required digit
+                if (char.IsDigit(c))
+                {
+                    sb.Append(c);
+                    c = GetNext();
+                }
+                else
+                {
+                    bytes.Seek(currentPosition);
+                    return false;
+                }
+
+                // optional digits
+                while (char.IsDigit(c))
+                {
+                    sb.Append(c);
+                    c = GetNext();
+                }
+            }
+
+            Unread();
+            if (radix != null)
+            {
+                var number = Convert.ToInt32(sb.ToString(), int.Parse(radix.ToString(), CultureInfo.InvariantCulture));
+                numberToken = new Type1TextToken(number.ToString(CultureInfo.InvariantCulture), Type1Token.TokenType.Integer);
+            }
+            else
+            {
+                numberToken = new Type1TextToken(sb.ToString(), Type1Token.TokenType.Real);
+            }
+
+            return true;
+        }
+
+        /// <summary>
+        /// Reads name characters up to, but not including, the next whitespace or delimiter character.
+        /// </summary>
+        /// <param name="previousCharacter">A character of the name which the caller has already consumed, if any.</param>
+        /// <returns>The text read, or null when it is empty.</returns>
+        private string ReadLiteral(char? previousCharacter = null)
+        {
+            literalBuffer.Clear();
+            if (previousCharacter.HasValue)
+            {
+                literalBuffer.Append(previousCharacter.Value);
+            }
+
+            do
+            {
+                var b = bytes.Peek();
+                if (!b.HasValue)
+                {
+                    break;
+                }
+
+                var c = (char)b;
+
+                if (char.IsWhiteSpace(c) || c == '(' || c == ')' || c == '<' || c == '>' ||
+                    c == '[' || c == ']' || c == '{' || c == '}' || c == '/' || c == '%')
+                {
+                    break;
+                }
+
+                literalBuffer.Append(c);
+            } while (bytes.MoveNext());
+
+            var literal = literalBuffer.ToString();
+            return literal.Length == 0 ? null : literal;
+        }
+
+        /// <summary>
+        /// Reads the remainder of a comment; the leading '%' has already been consumed.
+        /// </summary>
+        /// <returns>The comment text without its terminating end of line character.</returns>
+        private string ReadComment()
+        {
+            commentBuffer.Clear();
+
+            while (bytes.MoveNext())
+            {
+                var c = (char)bytes.CurrentByte;
+                if (ReadHelper.IsEndOfLine(c))
+                {
+                    // A comment ends at the end of its line; carrying on would swallow the rest of the input.
+                    break;
+                }
+
+                commentBuffer.Append(c);
+            }
+
+            return commentBuffer.ToString();
+        }
+
+        /// <summary>
+        /// Reads the binary data which follows an RD or -| operator.
+        /// </summary>
+        /// <param name="length">The number of bytes to read, given by the integer token preceding the operator.</param>
+        /// <exception cref="InvalidOperationException">The length is negative or the input ends early.</exception>
+        private Type1DataToken ReadCharString(int length)
+        {
+            if (length < 0)
+            {
+                throw new InvalidOperationException($"Invalid charstring length {length} at offset {bytes.CurrentOffset}.");
+            }
+
+            // Skip preceding space.
+            bytes.MoveNext();
+
+            var data = new byte[length];
+            for (var i = 0; i < length; i++)
+            {
+                if (!bytes.MoveNext())
+                {
+                    throw new InvalidOperationException($"Expected {length} charstring bytes but the data ended after {i}.");
+                }
+
+                data[i] = bytes.CurrentByte;
+            }
+
+            return new Type1DataToken(Type1Token.TokenType.Charstring, data);
+        }
+    }
+}