From 85d1f84965854e700a6374bb04e6a75a810b778c Mon Sep 17 00:00:00 2001
From: Eliot Jones
Date: Sat, 28 Apr 2018 19:33:50 +0100
Subject: [PATCH] more compact font format parsing

---
 .../CompactFontFormatIndividualFontParser.cs  | 423 ++++++++++++++++++
 .../CompactFontFormatParser.cs                |  18 +-
 .../CompactFontFormatStandardStrings.cs       | 410 +++++++++++++++++
 .../Parser/PdfDocumentFactory.cs              |   2 +-
 4 files changed, 851 insertions(+), 2 deletions(-)
 create mode 100644 src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatIndividualFontParser.cs
 create mode 100644 src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatStandardStrings.cs

diff --git a/src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatIndividualFontParser.cs b/src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatIndividualFontParser.cs
new file mode 100644
index 00000000..e386a66c
--- /dev/null
+++ b/src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatIndividualFontParser.cs
@@ -0,0 +1,423 @@
+namespace UglyToad.PdfPig.Fonts.CompactFontFormat
+{
+    using System;
+    using System.Collections.Generic;
+    using System.Globalization;
+    using System.Text;
+    using Core;
+    using Geometry;
+
+    /// <summary>
+    /// Decodes the top level dictionary of a single font from Compact Font Format (CFF) data.
+    /// </summary>
+    internal class CompactFontFormatIndividualFontParser
+    {
+        /// <summary>
+        /// Decodes the top level dictionary bytes of one font; the result is not yet returned or stored.
+        /// </summary>
+        public void Parse(CompactFontFormatData data, string name, byte[] topDictionaryIndex, string[] stringIndex)
+        {
+            var individualData = new CompactFontFormatData(topDictionaryIndex);
+
+            var dictionary = ReadTopLevelDictionary(individualData, stringIndex);
+        }
+
+        /// <summary>
+        /// Reads operands until an operator byte is met, applies the operator and repeats until the data is exhausted.
+        /// </summary>
+        private static CompactFontFormatFontDictionary ReadTopLevelDictionary(CompactFontFormatData data, string[] stringIndex)
+        {
+            var dictionary = new CompactFontFormatFontDictionary();
+            while (data.CanRead())
+            {
+                var numbers = new List<Operand>();
+
+                var infiniteLoopProtection = 0;
+                while (true)
+                {
+                    infiniteLoopProtection++;
+                    // Avoid the library getting caught in an infinite loop, probably not possible.
+                    // "An operator may be preceded by up to a maximum of 48 operands."
+                    if (infiniteLoopProtection > 256)
+                    {
+                        throw new InvalidOperationException("Got caught in an infinite loop trying to read a CFF dictionary.");
+                    }
+
+                    var byte0 = data.ReadByte();
+
+                    // Operands and operators are distinguished by the first byte, 0 - 21 specify operators
+                    if (byte0 <= 21)
+                    {
+                        ApplyOperator(byte0, numbers, data, stringIndex, dictionary);
+                        break;
+                    }
+
+                    /*
+                     * b0 value     value range         calculation
+                     * 32 - 246     -107 - +107         b0 - 139
+                     * 247 - 250    +108 - +1131        (b0 - 247)*256 + b1 + 108
+                     * 251 - 254    -1131 - -108        -(b0 - 251)*256 - b1 - 108
+                     * 28           -32768 - +32767     b1 << 8 | b2
+                     * 29           -(2^31)-+(2^31-1)   b1 << 24 | b2 << 16 | b3 << 8 | b4
+                     *
+                     * A byte value of 30 defines a real number operand
+                     */
+                    if (byte0 == 28)
+                    {
+                        // The two bytes form a signed 16-bit value, so narrow to short to recover the sign.
+                        int value = unchecked((short)(data.ReadByte() << 8 | data.ReadByte()));
+                        numbers.Add(new Operand(value));
+                    }
+                    else if (byte0 == 29)
+                    {
+                        var value = data.ReadByte() << 24 | data.ReadByte() << 16 |
+                                    data.ReadByte() << 8 | data.ReadByte();
+                        numbers.Add(new Operand(value));
+                    }
+                    else if (byte0 == 30)
+                    {
+                        var realNumber = ReadRealNumber(data);
+                        numbers.Add(new Operand(realNumber));
+                    }
+                    else if (byte0 >= 32 && byte0 <= 246)
+                    {
+                        var value = byte0 - 139;
+                        numbers.Add(new Operand(value));
+                    }
+                    else if (byte0 >= 247 && byte0 <= 250)
+                    {
+                        var value = (byte0 - 247) * 256 + data.ReadByte() + 108;
+                        numbers.Add(new Operand(value));
+                    }
+                    else if (byte0 >= 251 && byte0 <= 254)
+                    {
+                        var value = -(byte0 - 251) * 256 - data.ReadByte() - 108;
+                        numbers.Add(new Operand(value));
+                    }
+                    else
+                    {
+                        throw new InvalidOperationException($"The first dictionary byte was not a valid operator (0 - 21) or operand (28 - 30, 32 - 254). Got {byte0}.");
+                    }
+                }
+            }
+
+            return dictionary;
+        }
+
+        /// <summary>
+        /// Reads a real number stored as two nibbles per byte; nibble 0xf ends the number.
+        /// </summary>
+        private static decimal ReadRealNumber(CompactFontFormatData data)
+        {
+            var sb = new StringBuilder();
+            var done = false;
+            var exponentMissing = false;
+
+            while (!done)
+            {
+                var b = data.ReadByte();
+                var nibble1 = b / 16;
+                var nibble2 = b % 16;
+
+                for (var i = 0; i < 2; i++)
+                {
+                    var nibble = i == 0 ? nibble1 : nibble2;
+
+                    switch (nibble)
+                    {
+                        case 0x0:
+                        case 0x1:
+                        case 0x2:
+                        case 0x3:
+                        case 0x4:
+                        case 0x5:
+                        case 0x6:
+                        case 0x7:
+                        case 0x8:
+                        case 0x9:
+                            sb.Append(nibble);
+                            exponentMissing = false;
+                            break;
+                        case 0xa:
+                            sb.Append(".");
+                            break;
+                        case 0xb:
+                            sb.Append("E");
+                            exponentMissing = true;
+                            break;
+                        case 0xc:
+                            sb.Append("E-");
+                            exponentMissing = true;
+                            break;
+                        case 0xd:
+                            break;
+                        case 0xe:
+                            sb.Append("-");
+                            break;
+                        case 0xf:
+                            done = true;
+                            break;
+                        default:
+                            throw new InvalidOperationException($"Did not expect nibble value: {nibble}.");
+                    }
+                }
+            }
+
+            if (exponentMissing)
+            {
+                // the exponent is missing, just append "0" to avoid an exception
+                // not sure if 0 is the correct value, but it seems to fit
+                // see PDFBOX-1522
+                sb.Append("0");
+            }
+
+            if (sb.Length == 0)
+            {
+                return 0m;
+            }
+
+            // Float style accepts the "E" exponent built above; invariant culture keeps "." as the decimal separator.
+            return decimal.Parse(sb.ToString(), NumberStyles.Float, CultureInfo.InvariantCulture);
+        }
+
+        /// <summary>
+        /// Stores the collected operands in the dictionary property selected by the operator; unhandled operators are ignored.
+        /// Operator 12 is an escape: one more byte is read from the data to select the operator.
+        /// </summary>
+        private static void ApplyOperator(byte byte0, List<Operand> operands, CompactFontFormatData data,
+            string[] stringIndex,
+            CompactFontFormatFontDictionary dictionary)
+        {
+
+            OperandKey key;
+            if (byte0 == 12)
+            {
+                var b1 = data.ReadByte();
+                key = new OperandKey(byte0, b1);
+            }
+            else
+            {
+                key = new OperandKey(byte0);
+            }
+
+            switch (key.Byte0)
+            {
+                case 0:
+                    dictionary.Version = GetString(operands, stringIndex);
+                    break;
+                case 1:
+                    dictionary.Notice = GetString(operands, stringIndex);
+                    break;
+                case 2:
+                    dictionary.FullName = GetString(operands, stringIndex);
+                    break;
+                case 3:
+                    dictionary.FamilyName = GetString(operands, stringIndex);
+                    break;
+                case 4:
+                    dictionary.Weight = GetString(operands, stringIndex);
+                    break;
+                case 5:
+                    dictionary.FontBoundingBox = GetBoundingBox(operands);
+                    break;
+                case 12:
+                    {
+                        if (!key.Byte1.HasValue)
+                        {
+                            throw new InvalidOperationException("A single byte sequence beginning with 12 was found.");
+                        }
+
+                        switch (key.Byte1.Value)
+                        {
+                            case 0:
+                                dictionary.Copyright = GetString(operands, stringIndex);
+                                break;
+                            case 1:
+                                dictionary.IsFixedPitch = operands[0].Decimal == 1;
+                                break;
+                            case 2:
+                                dictionary.ItalicAngle = operands[0].Decimal;
+                                break;
+                            case 3:
+                                dictionary.UnderlinePosition = operands[0].Decimal;
+                                break;
+                            case 4:
+                                dictionary.UnderlineThickness = operands[0].Decimal;
+                                break;
+                            case 5:
+                                dictionary.PaintType = operands[0].Decimal;
+                                break;
+                            case 6:
+                                dictionary.CharstringType = operands[0].Int.Value;
+                                break;
+                            case 7:
+                                break;
+                            case 8:
+                                break;
+                        }
+                    }
+                    break;
+                case 13:
+                    dictionary.UniqueId = operands.Count > 0 ? operands[0].Decimal : 0;
+                    break;
+                case 14:
+                    dictionary.Xuid = ToArray(operands);
+                    break;
+                case 15:
+                    break;
+                case 16:
+                    break;
+                case 17:
+                    break;
+                case 18:
+                    break;
+            }
+        }
+
+        /// <summary>
+        /// Resolves the first operand as a string id: 0 - 390 are standard strings, 391 and above index into stringIndex.
+        /// </summary>
+        private static string GetString(List<Operand> operands, string[] stringIndex)
+        {
+            if (operands.Count == 0)
+            {
+                throw new InvalidOperationException("Cannot read a string from an empty operands array.");
+            }
+
+            if (!operands[0].Int.HasValue)
+            {
+                throw new InvalidOperationException($"The first operand for reading a string was not an integer. Got: {operands[0].Decimal}");
+            }
+
+            var index = operands[0].Int.Value;
+
+            if (index >= 0 && index <= 390)
+            {
+                return CompactFontFormatStandardStrings.GetName(index);
+            }
+
+            var stringIndexIndex = index - 391;
+            if (stringIndexIndex >= 0 && stringIndexIndex < stringIndex.Length)
+            {
+                return stringIndex[stringIndexIndex];
+            }
+
+            return $"SID{index}";
+        }
+
+        /// <summary>
+        /// Builds a rectangle from exactly four operands; any other operand count yields a default rectangle.
+        /// </summary>
+        private static PdfRectangle GetBoundingBox(List<Operand> operands)
+        {
+            if (operands.Count != 4)
+            {
+                return new PdfRectangle();
+            }
+
+            return new PdfRectangle(operands[0].Decimal, operands[1].Decimal,
+                operands[2].Decimal, operands[3].Decimal);
+        }
+
+        /// <summary>
+        /// Copies the decimal value of every operand into a new array, preserving order.
+        /// </summary>
+        private static decimal[] ToArray(List<Operand> operands)
+        {
+            var result = new decimal[operands.Count];
+
+            for (int i = 0; i < result.Length; i++)
+            {
+                result[i] = operands[i].Decimal;
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// A dictionary operand: Int is only set for integer encoded operands, Decimal is always set.
+        /// </summary>
+        private struct Operand
+        {
+            public int? Int { get; }
+
+            public decimal Decimal { get; }
+
+            public Operand(int integer)
+            {
+                Int = integer;
+                Decimal = integer;
+            }
+
+            public Operand(decimal d)
+            {
+                Int = null;
+                Decimal = d;
+            }
+        }
+
+        /// <summary>
+        /// Identifies an operator by its first byte and, for escaped operators, its second byte.
+        /// </summary>
+        private struct OperandKey
+        {
+            public byte Byte0 { get; }
+
+            public byte? Byte1 { get; }
+
+            public OperandKey(byte byte0)
+            {
+                Byte0 = byte0;
+                Byte1 = null;
+            }
+
+            public OperandKey(byte byte0, byte byte1)
+            {
+                Byte0 = byte0;
+                Byte1 = byte1;
+            }
+        }
+    }
+
+    /// <summary>
+    /// Holds the values read from a font's top level dictionary; initializers supply the defaults for absent operators.
+    /// </summary>
+    internal class CompactFontFormatFontDictionary
+    {
+        public string Version { get; set; }
+
+        public string Notice { get; set; }
+
+        public string Copyright { get; set; }
+
+        public string FullName { get; set; }
+
+        public string FamilyName { get; set; }
+
+        public string Weight { get; set; }
+
+        public bool IsFixedPitch { get; set; }
+
+        public decimal ItalicAngle { get; set; }
+
+        public decimal UnderlinePosition { get; set; } = -100;
+
+        public decimal UnderlineThickness { get; set; } = 50;
+
+        public decimal PaintType { get; set; }
+
+        // The CFF specification gives CharstringType a default of 2.
+        public int CharstringType { get; set; } = 2;
+
+        // The CFF specification gives FontMatrix a default of [0.001 0 0 0.001 0 0].
+        public TransformationMatrix FontMatrix { get; set; } = TransformationMatrix.FromValues(0.001m, 0m, 0m, 0.001m, 0, 0);
+
+        public decimal UniqueId { get; set; }
+
+        public PdfRectangle FontBoundingBox { get; set; } = new PdfRectangle(0, 0, 0, 0);
+
+        public decimal[] Xuid { get; set; }
+
+
+    }
+}
diff --git a/src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatParser.cs b/src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatParser.cs
index 2ed35878..77ad38c6 100644
--- a/src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatParser.cs
+++ b/src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatParser.cs
@@ -10,6 +10,13 @@
         private const string TagTtcf = "ttcf";
         private const string TagTtfonly = "\u0000\u0001\u0000\u0000";
 
+        private readonly CompactFontFormatIndividualFontParser individualFontParser;
+
+        public CompactFontFormatParser(CompactFontFormatIndividualFontParser individualFontParser)
+        {
+            this.individualFontParser = individualFontParser;
+        }
+
         public void Parse(CompactFontFormatData data)
         {
             var tag = ReadTag(data);
@@ -29,11 +36,20 @@
 
             var header = ReadHeader(data);
 
-            var names = ReadStringIndex(data);
+            var fontNames = ReadStringIndex(data);
 
             var topLevelDict = ReadDictionaryData(data);
 
             var stringIndex = ReadStringIndex(data);
+
+            var globalSubroutineIndex = ReadDictionaryData(data);
+
+            for (var i = 0; i < fontNames.Length; i++)
+            {
+                var fontName = fontNames[i];
+
+                individualFontParser.Parse(data, fontName, topLevelDict[i], stringIndex);
+            }
         }
 
         private static string ReadTag(CompactFontFormatData data)
diff --git a/src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatStandardStrings.cs b/src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatStandardStrings.cs
new file mode 100644
index 00000000..549dddcb
--- /dev/null
+++ b/src/UglyToad.PdfPig/Fonts/CompactFontFormat/CompactFontFormatStandardStrings.cs
@@ -0,0 +1,410 @@
+namespace UglyToad.PdfPig.Fonts.CompactFontFormat
+{
+    internal static class CompactFontFormatStandardStrings
+    {
+        public static string GetName(int sid)
+        {
+            if (sid < 0 || sid >= StringIdentifierToString.Length)
+            {
+                return null;
+            }
+
+            return StringIdentifierToString[sid];
+        }
+
+        private static readonly string[] StringIdentifierToString =
+        {
+            ".notdef",
+            "space",
+            "exclam",
+            "quotedbl",
+            "numbersign",
+            "dollar",
+            "percent",
+            "ampersand",
+            "quoteright",
+            "parenleft",
+            "parenright",
+            "asterisk",
+            "plus",
+            "comma",
+            "hyphen",
+            "period",
+            "slash",
+            "zero",
+            "one",
+            "two",
+            "three",
+            "four",
+            "five",
+            "six",
+            "seven",
+            "eight",
+            "nine",
+            "colon",
+            "semicolon",
+            "less",
+            "equal",
+            "greater",
+            "question",
+            "at",
+            "A",
+            "B",
+            "C",
+            "D",
+            "E",
+            "F",
+            "G",
+            "H",
+            "I",
+            "J",
+            "K",
+            "L",
+            "M",
+            "N",
+            "O",
+            "P",
+            "Q",
+            "R",
+            "S",
+            "T",
+            "U",
+            "V",
+            "W",
+            "X",
+            "Y",
+            "Z",
+            "bracketleft",
+            "backslash",
+            "bracketright",
+            "asciicircum",
+            "underscore",
+            "quoteleft",
+            "a",
+            "b",
+            "c",
+            "d",
+            "e",
+            "f",
+            "g",
+            "h",
+            "i",
+            "j",
+            "k",
+            "l",
+            "m",
+            "n",
+            "o",
+            "p",
+            "q",
+            "r",
+            "s",
+            "t",
+            "u",
+            "v",
+            "w",
+            "x",
+            "y",
+            "z",
+            "braceleft",
+            "bar",
+            "braceright",
+            "asciitilde",
+            "exclamdown",
+            "cent",
+            "sterling",
+            "fraction",
+            "yen",
+            "florin",
+            "section",
+            "currency",
+            "quotesingle",
+            "quotedblleft",
+            "guillemotleft",
+            "guilsinglleft",
+            "guilsinglright",
+            "fi",
+            "fl",
+            "endash",
+            "dagger",
+            "daggerdbl",
+            "periodcentered",
+            "paragraph",
+            "bullet",
+            "quotesinglbase",
+            "quotedblbase",
+            "quotedblright",
+            "guillemotright",
+            "ellipsis",
+            "perthousand",
+            "questiondown",
+            "grave",
+            "acute",
+            "circumflex",
+            "tilde",
+            "macron",
+            "breve",
+            "dotaccent",
+            "dieresis",
+            "ring",
+            "cedilla",
+            "hungarumlaut",
+            "ogonek",
+            "caron",
+            "emdash",
+            "AE",
+            "ordfeminine",
+            "Lslash",
+            "Oslash",
+            "OE",
+            "ordmasculine",
+            "ae",
+            "dotlessi",
+            "lslash",
+            "oslash",
+            "oe",
+            "germandbls",
+            "onesuperior",
+            "logicalnot",
+            "mu",
+            "trademark",
+            "Eth",
+            "onehalf",
+            "plusminus",
+            "Thorn",
+            "onequarter",
+            "divide",
+            "brokenbar",
+            "degree",
+            "thorn",
+            "threequarters",
+            "twosuperior",
+            "registered",
+            "minus",
+            "eth",
+            "multiply",
+            "threesuperior",
+            "copyright",
+            "Aacute",
+            "Acircumflex",
+            "Adieresis",
+            "Agrave",
+            "Aring",
+            "Atilde",
+            "Ccedilla",
+            "Eacute",
+            "Ecircumflex",
+            "Edieresis",
+            "Egrave",
+            "Iacute",
+            "Icircumflex",
+            "Idieresis",
+            "Igrave",
+            "Ntilde",
+            "Oacute",
+            "Ocircumflex",
+            "Odieresis",
+            "Ograve",
+            "Otilde",
+            "Scaron",
+            "Uacute",
+            "Ucircumflex",
+            "Udieresis",
+            "Ugrave",
+            "Yacute",
+            "Ydieresis",
+            "Zcaron",
+            "aacute",
+            "acircumflex",
+            "adieresis",
+            "agrave",
+            "aring",
+            "atilde",
+            "ccedilla",
+            "eacute",
+            "ecircumflex",
+            "edieresis",
+            "egrave",
+            "iacute",
+            "icircumflex",
+            "idieresis",
+            "igrave",
+            "ntilde",
+            "oacute",
+            "ocircumflex",
+            "odieresis",
+            "ograve",
+            "otilde",
+            "scaron",
+            "uacute",
+            "ucircumflex",
+            "udieresis",
+            "ugrave",
+            "yacute",
+            "ydieresis",
+            "zcaron",
+            "exclamsmall",
+            "Hungarumlautsmall",
+            "dollaroldstyle",
+            "dollarsuperior",
+            "ampersandsmall",
+            "Acutesmall",
+            "parenleftsuperior",
+            "parenrightsuperior",
+            "twodotenleader",
+            "onedotenleader",
+            "zerooldstyle",
+            "oneoldstyle",
+            "twooldstyle",
+            "threeoldstyle",
+            "fouroldstyle",
+            "fiveoldstyle",
+            "sixoldstyle",
+            "sevenoldstyle",
+            "eightoldstyle",
+            "nineoldstyle",
+            "commasuperior",
+            "threequartersemdash",
+            "periodsuperior",
+            "questionsmall",
+            "asuperior",
+            "bsuperior",
+            "centsuperior",
+            "dsuperior",
+            "esuperior",
+            "isuperior",
+            "lsuperior",
+            "msuperior",
+            "nsuperior",
+            "osuperior",
+            "rsuperior",
+            "ssuperior",
+            "tsuperior",
+            "ff",
+            "ffi",
+            "ffl",
+            "parenleftinferior",
+            "parenrightinferior",
+            "Circumflexsmall",
+            "hyphensuperior",
+            "Gravesmall",
+            "Asmall",
+            "Bsmall",
+            "Csmall",
+            "Dsmall",
+            "Esmall",
+            "Fsmall",
+            "Gsmall",
+            "Hsmall",
+            "Ismall",
+            "Jsmall",
+            "Ksmall",
+            "Lsmall",
+            "Msmall",
+            "Nsmall",
+            "Osmall",
+            "Psmall",
+            "Qsmall",
+            "Rsmall",
+            "Ssmall",
+            "Tsmall",
+            "Usmall",
+            "Vsmall",
+            "Wsmall",
+            "Xsmall",
+            "Ysmall",
+            "Zsmall",
+            "colonmonetary",
+            "onefitted",
+            "rupiah",
+            "Tildesmall",
+            "exclamdownsmall",
+            "centoldstyle",
+            "Lslashsmall",
+            "Scaronsmall",
+            "Zcaronsmall",
+            "Dieresissmall",
+            "Brevesmall",
+            "Caronsmall",
+            "Dotaccentsmall",
+            "Macronsmall",
+            "figuredash",
+            "hypheninferior",
+            "Ogoneksmall",
+            "Ringsmall",
+            "Cedillasmall",
+            "questiondownsmall",
+            "oneeighth",
+            "threeeighths",
+            "fiveeighths",
+            "seveneighths",
+            "onethird",
+            "twothirds",
+            "zerosuperior",
+            "foursuperior",
+            "fivesuperior",
+            "sixsuperior",
+            "sevensuperior",
+            "eightsuperior",
+            "ninesuperior",
+            "zeroinferior",
+            "oneinferior",
+            "twoinferior",
+            "threeinferior",
+            "fourinferior",
+            "fiveinferior",
+            "sixinferior",
+            "seveninferior",
+            "eightinferior",
+            "nineinferior",
+            "centinferior",
+            "dollarinferior",
+            "periodinferior",
+            "commainferior",
+            "Agravesmall",
+            "Aacutesmall",
+            "Acircumflexsmall",
+            "Atildesmall",
+            "Adieresissmall",
+            "Aringsmall",
+            "AEsmall",
+            "Ccedillasmall",
+            "Egravesmall",
+            "Eacutesmall",
+            "Ecircumflexsmall",
+            "Edieresissmall",
+            "Igravesmall",
+            "Iacutesmall",
+            "Icircumflexsmall",
+            "Idieresissmall",
+            "Ethsmall",
+            "Ntildesmall",
+            "Ogravesmall",
+            "Oacutesmall",
+            "Ocircumflexsmall",
+            "Otildesmall",
+            "Odieresissmall",
+            "OEsmall",
+            "Oslashsmall",
+            "Ugravesmall",
+            "Uacutesmall",
+            "Ucircumflexsmall",
+            "Udieresissmall",
+            "Yacutesmall",
+            "Thornsmall",
+            "Ydieresissmall",
+            "001.000",
+            "001.001",
+            "001.002",
+            "001.003",
+            "Black",
+            "Bold",
+            "Book",
+            "Light",
+            "Medium",
+            "Regular",
+            "Roman",
+            "Semibold"
+        };
+    }
+}
diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
index 8bf6bc56..ad0e4126 100644
--- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
@@ -108,7 +108,7 @@
                 new TrueTypeFontHandler(log, pdfScanner, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser, encodingReader),
                 new Type1FontHandler(pdfScanner, cMapCache, filterProvider, fontDescriptorFactory, encodingReader,
                     new Type1FontParser(new Type1EncryptedPortionParser()),
-                    new CompactFontFormatParser()),
+                    new CompactFontFormatParser(new CompactFontFormatIndividualFontParser())),
                 new Type3FontHandler(pdfScanner, cMapCache, filterProvider, encodingReader));
 
             var resourceContainer = new ResourceContainer(pdfScanner, fontFactory);