From 4b91300466ad7a2a2236a2b60d545fb173187555 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Mon, 20 Nov 2017 16:42:18 +0000 Subject: [PATCH] split out classes for parsing the cmap format and add assertions to tests. add bytes to int method for hex token and test --- .../Fonts/Parser/CMapParserTests.cs | 40 +- .../Tokenization/Tokens/HexTokenTests.cs | 36 ++ .../Cmap/CharacterIdentifierSystemInfo.cs | 5 + .../Fonts/Cmap/CharacterMapBuilder.cs | 13 +- src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs | 365 +++--------------- src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs | 250 ++---------- .../Parser/Parts/BaseFontCharacterParser.cs | 39 ++ .../Parser/{ => Parts}/BaseFontRangeParser.cs | 6 +- .../Fonts/Parser/Parts/CidCharacterParser.cs | 36 ++ .../Fonts/Parser/Parts/CidFontNameParser.cs | 128 ++++++ .../Fonts/Parser/Parts/CidRangeParser.cs | 46 +++ .../Parser/Parts/CodespaceRangeParser.cs | 42 ++ .../Fonts/Parser/Parts/ICidFontPartParser.cs | 17 + .../Tokenization/Tokens/HexToken.cs | 14 + 14 files changed, 500 insertions(+), 537 deletions(-) create mode 100644 src/UglyToad.Pdf.Tests/Tokenization/Tokens/HexTokenTests.cs create mode 100644 src/UglyToad.Pdf/Fonts/Parser/Parts/BaseFontCharacterParser.cs rename src/UglyToad.Pdf/Fonts/Parser/{ => Parts}/BaseFontRangeParser.cs (95%) create mode 100644 src/UglyToad.Pdf/Fonts/Parser/Parts/CidCharacterParser.cs create mode 100644 src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontNameParser.cs create mode 100644 src/UglyToad.Pdf/Fonts/Parser/Parts/CidRangeParser.cs create mode 100644 src/UglyToad.Pdf/Fonts/Parser/Parts/CodespaceRangeParser.cs create mode 100644 src/UglyToad.Pdf/Fonts/Parser/Parts/ICidFontPartParser.cs diff --git a/src/UglyToad.Pdf.Tests/Fonts/Parser/CMapParserTests.cs b/src/UglyToad.Pdf.Tests/Fonts/Parser/CMapParserTests.cs index c74aa145..f74e020a 100644 --- a/src/UglyToad.Pdf.Tests/Fonts/Parser/CMapParserTests.cs +++ b/src/UglyToad.Pdf.Tests/Fonts/Parser/CMapParserTests.cs @@ -40,11 +40,49 @@ end"; private readonly CMapParser cMapParser = new CMapParser(); [Fact] - public void CanParseCMap() + public void CanParseCidSystemInfoAndOtherInformation() { var input = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false); var cmap = cMapParser.Parse(input.Bytes, false); + + Assert.Equal("Adobe", cmap.Info.Registry); + Assert.Equal("UCS", cmap.Info.Ordering); + Assert.Equal(0, cmap.Info.Supplement); + + Assert.Equal("Adobe-Identity-UCS", cmap.Name); + Assert.Equal(2, cmap.Type); + } + + [Fact] + public void CanParseCodespaceRange() + { + var input = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false); + + var cmap = cMapParser.Parse(input.Bytes, false); + + Assert.Equal(1, cmap.CodespaceRanges.Count); + + Assert.Equal(0, cmap.CodespaceRanges[0].StartInt); + Assert.Equal(65535, cmap.CodespaceRanges[0].EndInt); + Assert.Equal(2, cmap.CodespaceRanges[0].CodeLength); + } + + [Fact] + public void CanParseBaseFontCharacters() + { + var input = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false); + + var cmap = cMapParser.Parse(input.Bytes, false); + + Assert.True(cmap.BaseFontCharacterMap.Count >= 6); + + Assert.Equal(" ", cmap.BaseFontCharacterMap[3]); + Assert.Equal(".", cmap.BaseFontCharacterMap[17]); + Assert.Equal("A", cmap.BaseFontCharacterMap[36]); + Assert.Equal("T", cmap.BaseFontCharacterMap[55]); + Assert.Equal("a", cmap.BaseFontCharacterMap[68]); + Assert.Equal("x", cmap.BaseFontCharacterMap[91]); } } } diff --git a/src/UglyToad.Pdf.Tests/Tokenization/Tokens/HexTokenTests.cs b/src/UglyToad.Pdf.Tests/Tokenization/Tokens/HexTokenTests.cs new file mode 100644 index 00000000..0f6693d6 --- /dev/null +++ b/src/UglyToad.Pdf.Tests/Tokenization/Tokens/HexTokenTests.cs @@ -0,0 +1,36 @@ +namespace UglyToad.Pdf.Tests.Tokenization.Tokens +{ + using Pdf.Tokenization.Tokens; + using Xunit; + + public class HexTokenTests + { + [Theory] + [InlineData("AE", "®")] + [InlineData("61", "a")] + [InlineData("0061", "\0a")] + [InlineData("7465787420736f", "text so")] + public void MapsCorrectlyToString(string input, string expected) + { + var token = new HexToken(input.ToCharArray()); + + Assert.Equal(expected, token.Data); + } + + [Theory] + [InlineData("0003", 3)] + [InlineData("0011", 17)] + [InlineData("0024", 36)] + [InlineData("0037", 55)] + [InlineData("0044", 68)] + [InlineData("005B", 91)] + public void MapsCorrectlyToInt(string input, int expected) + { + var token = new HexToken(input.ToCharArray()); + + var value = HexToken.ConvertHexBytesToInt(token); + + Assert.Equal(expected, value); + } + } +} diff --git a/src/UglyToad.Pdf/Fonts/Cmap/CharacterIdentifierSystemInfo.cs b/src/UglyToad.Pdf/Fonts/Cmap/CharacterIdentifierSystemInfo.cs index 8c3387e4..598babce 100644 --- a/src/UglyToad.Pdf/Fonts/Cmap/CharacterIdentifierSystemInfo.cs +++ b/src/UglyToad.Pdf/Fonts/Cmap/CharacterIdentifierSystemInfo.cs @@ -26,5 +26,10 @@ Ordering = ordering; Supplement = supplement; } + + public override string ToString() + { + return $"{Registry} | {Ordering} | {Supplement}"; + } } } \ No newline at end of file diff --git a/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs b/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs index 362be491..6efa16cd 100644 --- a/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs +++ b/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs @@ -54,6 +54,8 @@ public IReadOnlyList CidCharacterMappings { get; set; } + public IReadOnlyList CidRanges { get; set; } + public Dictionary BaseFontCharacterMap { get; } = new Dictionary(); public void AddBaseFontCharacter(IReadOnlyList bytes, IReadOnlyList value) @@ -68,6 +70,15 @@ BaseFontCharacterMap[code] = value; } + public CMap Build() + { + return new CMap(CharacterIdentifierSystemInfo, Type, WMode, Name, Version, + BaseFontCharacterMap ?? new Dictionary(), + CodespaceRanges ?? new CodespaceRange[0], + CidRanges ?? new CidRange[0], + CidCharacterMappings ?? new CidCharacterMapping[0]); + } + private int GetCodeFromArray(IReadOnlyList data, int length) { int code = 0; @@ -79,7 +90,7 @@ return code; } - private string CreateStringFromBytes(byte[] bytes) + private static string CreateStringFromBytes(byte[] bytes) { return bytes.Length == 1 ? OtherEncodings.BytesAsLatin1String(bytes) diff --git a/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs b/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs index 1a1d891c..184f948d 100644 --- a/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs +++ b/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs @@ -1,11 +1,50 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace UglyToad.Pdf.Fonts.Cmap +namespace UglyToad.Pdf.Fonts.Cmap { + using System; + using System.Collections.Generic; + using Util.JetBrains.Annotations; + public class CMap { + public CharacterIdentifierSystemInfo Info { get; } + + public int Type { get; } + + public int WMode { get; } + + public string Name { get; } + + public string Version { get; } + + [NotNull] + public IReadOnlyDictionary BaseFontCharacterMap { get; } + + [NotNull] + public IReadOnlyList CodespaceRanges { get; } + + [NotNull] + public IReadOnlyList CidRanges { get; } + + [NotNull] + public IReadOnlyList CidCharacterMappings { get; } + + public bool HasCidMappings => CidCharacterMappings.Count > 0 || CidRanges.Count > 0; + + public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0; + + public CMap(CharacterIdentifierSystemInfo info, int type, int wMode, string name, string version, IReadOnlyDictionary baseFontCharacterMap, IReadOnlyList codespaceRanges, IReadOnlyList cidRanges, IReadOnlyList cidCharacterMappings) + { + Info = info; + Type = type; + WMode = wMode; + Name = name; + Version = version; + BaseFontCharacterMap = baseFontCharacterMap ?? throw new ArgumentNullException(nameof(baseFontCharacterMap)); + CodespaceRanges = codespaceRanges ?? throw new ArgumentNullException(nameof(codespaceRanges)); + CidRanges = cidRanges ?? throw new ArgumentNullException(nameof(cidRanges)); + CidCharacterMappings = cidCharacterMappings ?? throw new ArgumentNullException(nameof(cidCharacterMappings)); + } + private int wmode = 0; private string cmapName = null; private string cmapVersion = null; @@ -17,13 +56,7 @@ namespace UglyToad.Pdf.Fonts.Cmap private int minCodeLength = 4; private int maxCodeLength; - - // code lengths - private readonly List codespaceRanges = new List(); - - // Unicode mappings - private readonly Dictionary charToUnicode = new Dictionary(); - + // CID mappings private readonly Dictionary codeToCid = new Dictionary(); private readonly List codeToCidRanges = new List(); @@ -31,44 +64,17 @@ namespace UglyToad.Pdf.Fonts.Cmap private static readonly string SPACE = " "; private int spaceMapping = -1; - /** - * Creates a new instance of CMap. - */ - public CMap() + /// + /// Returns the sequence of Unicode characters for the given character code. + /// + /// Character code + /// Unicode characters(may be more than one, e.g "fi" ligature) + /// if this character map contains an entry for this code, otherwise. + public bool TryConvertToUnicode(int code, out string result) { - } + var found = BaseFontCharacterMap.TryGetValue(code, out result); - /** - * This will tell if this cmap has any CID mappings. - * - * @return true If there are any CID mappings, false otherwise. - */ - public bool hasCIDMappings() - { - return codeToCid.Count > 0 || codeToCidRanges.Count > 0; - } - - /** - * This will tell if this cmap has any Unicode mappings. - * - * @return true If there are any Unicode mappings, false otherwise. - */ - public bool hasUnicodeMappings() - { - return charToUnicode.Count > 0; - } - - /** - * Returns the sequence of Unicode characters for the given character code. - * - * @param code character code - * @return Unicode characters (may be more than one, e.g "fi" ligature) - */ - public string toUnicode(int code) - { - charToUnicode.TryGetValue(code, out var result); - - return result; + return found; } /** @@ -102,27 +108,14 @@ namespace UglyToad.Pdf.Fonts.Cmap // throw new InvalidOperationException("CMap is invalid"); //} - /** - * Returns an int for the given byte array - */ - static int toInt(byte[] data, int dataLen) - { - int code = 0; - for (int i = 0; i < dataLen; ++i) - { - code <<= 8; - code |= (data[i] & 0xFF); - } - return code; - } - + /** * Returns the CID for the given character code. * * @param code character code * @return CID */ - public int toCID(int code) + public int ConvertToCid(int code) { if (codeToCid.TryGetValue(code, out var cid)) { @@ -137,251 +130,11 @@ namespace UglyToad.Pdf.Fonts.Cmap return ch; } } + return 0; } - - /** - * Convert the given part of a byte array to an int. - * @param data the byte array - * @param offset The offset into the byte array. - * @param length The length of the data we are getting. - * @return the resulting int - */ - private int getCodeFromArray(byte[] data, int offset, int length) - { - int code = 0; - for (int i = 0; i < length; i++) - { - code <<= 8; - code |= (data[offset + i] + 256) % 256; - } - return code; - } - - /** - * This will add a character code to Unicode character sequence mapping. - * - * @param codes The character codes to map from. - * @param unicode The Unicode characters to map to. - */ - void addCharMapping(byte[] codes, string unicode) - { - int code = getCodeFromArray(codes, 0, codes.Length); - charToUnicode[code] = unicode; - - // fixme: ugly little hack - if (SPACE.Equals(unicode)) - { - spaceMapping = code; - } - } - - /** - * This will add a CID mapping. - * - * @param code character code - * @param cid CID - */ - void addCIDMapping(int code, int cid) - { - codeToCid[cid] = code; - } - - /** - * This will add a CID Range. - * - * @param from starting charactor of the CID range. - * @param to ending character of the CID range. - * @param cid the cid to be started with. - * - */ - void addCIDRange(char from, char to, int cid) - { - codeToCidRanges.Add(new CidRange(from, to, cid)); - } - - /** - * This will add a codespace range. - * - * @param range A single codespace range. - */ - void addCodespaceRange(CodespaceRange range) - { - codespaceRanges.Add(range); - maxCodeLength = Math.Max(maxCodeLength, range.CodeLength); - minCodeLength = Math.Min(minCodeLength, range.CodeLength); - } - - /** - * Implementation of the usecmap operator. This will - * copy all of the mappings from one cmap to another. - * - * @param cmap The cmap to load mappings from. - */ - private void useCmap(CMap cmap) - { - foreach (CodespaceRange codespaceRange in cmap.codespaceRanges) - { - addCodespaceRange(codespaceRange); - } - charToUnicode.PutAll(cmap.charToUnicode); - codeToCid.PutAll(cmap.codeToCid); - codeToCidRanges.AddRange(cmap.codeToCidRanges); - } - - /** - * Returns the WMode of a CMap. - * - * 0 represents a horizontal and 1 represents a vertical orientation. - * - * @return the wmode - */ - public int getWMode() - { - return wmode; - } - - /** - * Sets the WMode of a CMap. - * - * @param newWMode the new WMode. - */ - public void setWMode(int newWMode) - { - wmode = newWMode; - } - - /** - * Returns the name of the CMap. - * - * @return the CMap name. - */ - public string getName() - { - return cmapName; - } - - /** - * Sets the name of the CMap. - * - * @param name the CMap name. - */ - public void setName(string name) - { - cmapName = name; - } - - /** - * Returns the version of the CMap. - * - * @return the CMap version. - */ - public string getVersion() - { - return cmapVersion; - } - - /** - * Sets the version of the CMap. - * - * @param version the CMap version. - */ - public void setVersion(string version) - { - cmapVersion = version; - } - - /** - * Returns the type of the CMap. - * - * @return the CMap type. - */ - public int getType() - { - return cmapType; - } - - /** - * Sets the type of the CMap. - * - * @param type the CMap type. - */ - public void setType(int type) - { - cmapType = type; - } - - /** - * Returns the registry of the CIDSystemInfo. - * - * @return the registry. - */ - public string getRegistry() - { - return registry; - } - - /** - * Sets the registry of the CIDSystemInfo. - * - * @param newRegistry the registry. - */ - public void setRegistry(string newRegistry) - { - registry = newRegistry; - } - - /** - * Returns the ordering of the CIDSystemInfo. - * - * @return the ordering. - */ - public string getOrdering() - { - return ordering; - } - - /** - * Sets the ordering of the CIDSystemInfo. - * - * @param newOrdering the ordering. - */ - public void setOrdering(string newOrdering) - { - ordering = newOrdering; - } - - /** - * Returns the supplement of the CIDSystemInfo. - * - * @return the supplement. - */ - public int getSupplement() - { - return supplement; - } - - /** - * Sets the supplement of the CIDSystemInfo. - * - * @param newSupplement the supplement. - */ - public void setSupplement(int newSupplement) - { - supplement = newSupplement; - } - - /** - * Returns the mapping for the space character. - * - * @return the mapped code for the space character - */ - public int getSpaceMapping() - { - return spaceMapping; - } - - + + public override string ToString() { return cmapName; diff --git a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs index 650f76de..3ac77ee4 100644 --- a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs +++ b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs @@ -1,23 +1,26 @@ namespace UglyToad.Pdf.Fonts.Parser { using System; - using System.Collections.Generic; - using System.Globalization; using Cmap; - using Cos; using IO; + using Parts; using Tokenization.Scanner; using Tokenization.Tokens; - using Util.JetBrains.Annotations; public class CMapParser { + private static readonly BaseFontRangeParser BaseFontRangeParser = new BaseFontRangeParser(); + private static readonly BaseFontCharacterParser BaseFontCharacterParser = new BaseFontCharacterParser(); + private static readonly CidRangeParser CidRangeParser = new CidRangeParser(); + private static readonly CidFontNameParser CidFontNameParser = new CidFontNameParser(); + private static readonly CodespaceRangeParser CodespaceRangeParser = new CodespaceRangeParser(); + private static readonly CidCharacterParser CidCharacterParser = new CidCharacterParser(); + public CMap Parse(IInputBytes inputBytes, bool isLenientParsing) { var scanner = new CoreTokenScanner(inputBytes); var builder = new CharacterMapBuilder(); - var result = new CMap(); IToken previousToken = null; while (scanner.MoveNext()) @@ -34,20 +37,19 @@ { if (previousToken is NumericToken numeric) { - ParseCodespaceRange(numeric, scanner, builder); + CodespaceRangeParser.Parse(numeric, scanner, builder, isLenientParsing); } else { throw new InvalidOperationException("Unexpected token preceding start of codespace range: " + previousToken); } - } break; case "beginbfchar": { if (previousToken is NumericToken numeric) { - ParseBaseFontCharacters(numeric, scanner, builder); + BaseFontCharacterParser.Parse(numeric, scanner, builder, isLenientParsing); } else { @@ -59,8 +61,7 @@ { if (previousToken is NumericToken numeric) { - var parser = new BaseFontRangeParser(); - parser.Parse(numeric, scanner, builder); + BaseFontRangeParser.Parse(numeric, scanner, builder, isLenientParsing); } else { @@ -72,9 +73,7 @@ { if (previousToken is NumericToken numeric) { - var characters = ParseCidCharacters(numeric, scanner); - - builder.CidCharacterMappings = characters; + CidCharacterParser.Parse(numeric, scanner, builder, isLenientParsing); } else { @@ -83,229 +82,28 @@ break; } case "begincidrange": + { + if (previousToken is NumericToken numeric) + { + CidRangeParser.Parse(numeric, scanner, builder, isLenientParsing); + } + else + { + throw new InvalidOperationException("Unexpected token preceding start of Cid ranges: " + previousToken); + } + } break; } } else if (token is NameToken name) { - ParseName(name, scanner, builder, isLenientParsing); + CidFontNameParser.Parse(name, scanner, builder, isLenientParsing); } previousToken = token; } - return null; - } - - private static void ParseCodespaceRange(NumericToken count, ITokenScanner tokenScanner, CharacterMapBuilder builder) - { - /* - * For example: - 3 begincodespacerange - <00> <80> - <8140> <9ffc> - - endcodespacerange - */ - - var ranges = new List(count.Int); - - for (var i = 0; i < count.Int; i++) - { - if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken start)) - { - throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken); - } - - if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken end)) - { - throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken); - } - - ranges.Add(new CodespaceRange(start.Bytes, end.Bytes)); - } - - builder.CodespaceRanges = ranges; - } - - private static void ParseBaseFontCharacters(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder) - { - for (var i = 0; i < numeric.Int; i++) - { - if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken inputCode)) - { - throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}"); - } - - if (!tokenScanner.MoveNext()) - { - throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}"); - } - - if (tokenScanner.CurrentToken is NameToken characterName) - { - builder.AddBaseFontCharacter(inputCode.Bytes, characterName.Data.Name); - } - else if (tokenScanner.CurrentToken is HexToken characterCode) - { - builder.AddBaseFontCharacter(inputCode.Bytes, characterCode.Bytes); - } - else - { - throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}"); - } - } - } - - private static IReadOnlyList ParseCidCharacters(NumericToken numeric, ITokenScanner scanner) - { - var results = new List(); - - for (var i = 0; i < numeric.Int; i++) - { - if (!scanner.TryReadToken(out HexToken sourceCode)) - { - throw new InvalidOperationException("The first token in a line for Cid Characters should be a hex, instead it was: " + scanner.CurrentToken); - } - - if (!scanner.TryReadToken(out NumericToken destinationCode)) - { - throw new InvalidOperationException("The destination token in a line for Cid Character should be an integer, instead it was: " + scanner.CurrentToken); - } - - var sourceInteger = sourceCode.Bytes.ToInt(sourceCode.Bytes.Count); - var mapping = new CidCharacterMapping(sourceInteger, destinationCode.Int); - - results.Add(mapping); - } - - return results; - } - - private static void ParseName(NameToken nameToken, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing) - { - switch (nameToken.Data.Name) - { - case "WMode": - { - var next = TryMoveNext(scanner); - if (next is NumericToken numeric) - { - builder.WMode = numeric.Int; - } - break; - } - case "CMapName": - { - var next = TryMoveNext(scanner); - if (next is NameToken name) - { - builder.Name = name.Data.Name; - } - break; - } - case "CMapVersion": - { - var next = TryMoveNext(scanner); - if (next is NumericToken number) - { - builder.Version = number.Data.ToString(NumberFormatInfo.InvariantInfo); - } - else if (next is StringToken stringToken) - { - builder.Version = stringToken.Data; - } - break; - } - case "CMapType": - { - var next = TryMoveNext(scanner); - if (next is NumericToken numeric) - { - builder.Type = numeric.Int; - } - break; - } - case "Registry": - { - throw new NotImplementedException("Registry should be in a dictionary"); - } - case "Ordering": - { - throw new NotImplementedException("Ordering should be in a dictionary"); - } - case "Supplement": - { - throw new NotImplementedException("Supplement should be in a dictionary"); - } - case "CIDSystemInfo": - { - var next = TryMoveNext(scanner); - - if (next is DictionaryToken dictionary) - { - builder.CharacterIdentifierSystemInfo = GetCharacterIdentifier(dictionary, isLenientParsing); - } - break; - } - } - } - - private static CharacterIdentifierSystemInfo GetCharacterIdentifier(DictionaryToken dictionary, bool isLenientParsing) - { - string GetErrorMessage(string missingKey) - { - return $"No {missingKey} found in the CIDSystemInfo dictionary: " + dictionary; - } - - if (!dictionary.TryGetByName(CosName.REGISTRY, out var registry) || !(registry is StringToken registryString)) - { - if (isLenientParsing) - { - registryString = new StringToken("Adobe"); - } - else - { - throw new InvalidOperationException(GetErrorMessage("registry")); - } - } - - if (!dictionary.TryGetByName(CosName.ORDERING, out var ordering) || !(ordering is StringToken orderingString)) - { - if (isLenientParsing) - { - orderingString = new StringToken(""); - } - else - { - throw new InvalidOperationException(GetErrorMessage("ordering")); - } - } - - if (!dictionary.TryGetByName(CosName.SUPPLEMENT, out var supplement) || !(supplement is NumericToken supplementNumeric)) - { - if (isLenientParsing) - { - supplementNumeric = new NumericToken(0); - } - else - { - throw new InvalidOperationException(GetErrorMessage("supplement")); - } - } - - return new CharacterIdentifierSystemInfo(registryString.Data, orderingString.Data, supplementNumeric.Int); - } - - [CanBeNull] - private static IToken TryMoveNext(ITokenScanner scanner) - { - if (!scanner.MoveNext()) - { - return null; - } - - return scanner.CurrentToken; + return builder.Build(); } } } diff --git a/src/UglyToad.Pdf/Fonts/Parser/Parts/BaseFontCharacterParser.cs b/src/UglyToad.Pdf/Fonts/Parser/Parts/BaseFontCharacterParser.cs new file mode 100644 index 00000000..bbf2372e --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/BaseFontCharacterParser.cs @@ -0,0 +1,39 @@ +namespace UglyToad.Pdf.Fonts.Parser.Parts +{ + using System; + using Cmap; + using Tokenization.Scanner; + using Tokenization.Tokens; + + internal class BaseFontCharacterParser : ICidFontPartParser + { + public void Parse(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing) + { + for (var i = 0; i < numeric.Int; i++) + { + if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken inputCode)) + { + throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}"); + } + + if (!tokenScanner.MoveNext()) + { + throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}"); + } + + if (tokenScanner.CurrentToken is NameToken characterName) + { + builder.AddBaseFontCharacter(inputCode.Bytes, characterName.Data.Name); + } + else if (tokenScanner.CurrentToken is HexToken characterCode) + { + builder.AddBaseFontCharacter(inputCode.Bytes, characterCode.Bytes); + } + else + { + throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}"); + } + } + } + } +} diff --git a/src/UglyToad.Pdf/Fonts/Parser/BaseFontRangeParser.cs b/src/UglyToad.Pdf/Fonts/Parser/Parts/BaseFontRangeParser.cs similarity index 95% rename from src/UglyToad.Pdf/Fonts/Parser/BaseFontRangeParser.cs rename to src/UglyToad.Pdf/Fonts/Parser/Parts/BaseFontRangeParser.cs index 6507f998..cdf8b579 100644 --- a/src/UglyToad.Pdf/Fonts/Parser/BaseFontRangeParser.cs +++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/BaseFontRangeParser.cs @@ -1,4 +1,4 @@ -namespace UglyToad.Pdf.Fonts.Parser +namespace UglyToad.Pdf.Fonts.Parser.Parts { using System; using System.Collections.Generic; @@ -7,9 +7,9 @@ using Tokenization.Scanner; using Tokenization.Tokens; - internal class BaseFontRangeParser + internal class BaseFontRangeParser : ICidFontPartParser { - public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder) + public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing) { for (var i = 0; i < numeric.Int; i++) { diff --git a/src/UglyToad.Pdf/Fonts/Parser/Parts/CidCharacterParser.cs b/src/UglyToad.Pdf/Fonts/Parser/Parts/CidCharacterParser.cs new file mode 100644 index 00000000..590b3f4f --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/CidCharacterParser.cs @@ -0,0 +1,36 @@ +namespace UglyToad.Pdf.Fonts.Parser.Parts +{ + using System; + using System.Collections.Generic; + using Cmap; + using Tokenization.Scanner; + using Tokenization.Tokens; + + internal class CidCharacterParser : ICidFontPartParser + { + public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing) + { + var results = new List(); + + for (var i = 0; i < numeric.Int; i++) + { + if (!scanner.TryReadToken(out HexToken sourceCode)) + { + throw new InvalidOperationException("The first token in a line for Cid Characters should be a hex, instead it was: " + scanner.CurrentToken); + } + + if (!scanner.TryReadToken(out NumericToken destinationCode)) + { + throw new InvalidOperationException("The destination token in a line for Cid Character should be an integer, instead it was: " + scanner.CurrentToken); + } + + var sourceInteger = sourceCode.Bytes.ToInt(sourceCode.Bytes.Count); + var mapping = new CidCharacterMapping(sourceInteger, destinationCode.Int); + + results.Add(mapping); + } + + builder.CidCharacterMappings = results; + } + } +} diff --git a/src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontNameParser.cs b/src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontNameParser.cs new file mode 100644 index 00000000..b7a7a7c8 --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontNameParser.cs @@ -0,0 +1,128 @@ +namespace UglyToad.Pdf.Fonts.Parser.Parts +{ + using System; + using System.Globalization; + using Cmap; + using Cos; + using Tokenization.Scanner; + using Tokenization.Tokens; + + internal class CidFontNameParser : ICidFontPartParser + { + public void Parse(NameToken nameToken, ITokenScanner scanner, CharacterMapBuilder builder, + bool isLenientParsing) + { + switch (nameToken.Data.Name) + { + case "WMode": + { + if (scanner.TryReadToken(out NumericToken numeric)) + { + builder.WMode = numeric.Int; + } + break; + } + case "CMapName": + { + if (scanner.TryReadToken(out NameToken name)) + { + builder.Name = name.Data.Name; + } + break; + } + case "CMapVersion": + { + if (!scanner.MoveNext()) + { + break; + } + + var next = scanner.CurrentToken; + if (next is NumericToken number) + { + builder.Version = number.Data.ToString(NumberFormatInfo.InvariantInfo); + } + else if (next is StringToken stringToken) + { + builder.Version = stringToken.Data; + } + break; + } + case "CMapType": + { + if (scanner.TryReadToken(out NumericToken numeric)) + { + builder.Type = numeric.Int; + } + break; + } + case "Registry": + { + throw new NotImplementedException("Registry should be in a dictionary"); + } + case "Ordering": + { + throw new NotImplementedException("Ordering should be in a dictionary"); + } + case "Supplement": + { + throw new NotImplementedException("Supplement should be in a dictionary"); + } + case "CIDSystemInfo": + { + if (scanner.TryReadToken(out DictionaryToken dictionary)) + { + builder.CharacterIdentifierSystemInfo = GetCharacterIdentifier(dictionary, isLenientParsing); + } + break; + } + } + } + + private static CharacterIdentifierSystemInfo GetCharacterIdentifier(DictionaryToken dictionary, bool isLenientParsing) + { + string GetErrorMessage(string missingKey) + { + return $"No {missingKey} found in the CIDSystemInfo dictionary: " + dictionary; + } + + if (!dictionary.TryGetByName(CosName.REGISTRY, out var registry) || !(registry is StringToken registryString)) + { + if (isLenientParsing) + { + registryString = new StringToken("Adobe"); + } + else + { + throw new InvalidOperationException(GetErrorMessage("registry")); + } + } + + if (!dictionary.TryGetByName(CosName.ORDERING, out var ordering) || !(ordering is StringToken orderingString)) + { + if (isLenientParsing) + { + orderingString = new StringToken(""); + } + else + { + throw new InvalidOperationException(GetErrorMessage("ordering")); + } + } + + if (!dictionary.TryGetByName(CosName.SUPPLEMENT, out var supplement) || !(supplement is NumericToken supplementNumeric)) + { + if (isLenientParsing) + { + supplementNumeric = new NumericToken(0); + } + else + { + throw new InvalidOperationException(GetErrorMessage("supplement")); + } + } + + return new CharacterIdentifierSystemInfo(registryString.Data, orderingString.Data, supplementNumeric.Int); + } + } +} diff --git a/src/UglyToad.Pdf/Fonts/Parser/Parts/CidRangeParser.cs b/src/UglyToad.Pdf/Fonts/Parser/Parts/CidRangeParser.cs new file mode 100644 index 00000000..8c9cbb16 --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/CidRangeParser.cs @@ -0,0 +1,46 @@ +namespace UglyToad.Pdf.Fonts.Parser.Parts +{ + using System; + using System.Collections.Generic; + using Cmap; + using Tokenization.Scanner; + using Tokenization.Tokens; + + internal class CidRangeParser : ICidFontPartParser + { + public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing) + { + var ranges = new List(); + + for (var i = 0; i < numeric.Int; i++) + { + if (!scanner.TryReadToken(out HexToken startHexToken)) + { + // TODO: message + throw new InvalidOperationException(); + } + + if (!scanner.TryReadToken(out HexToken endHexToken)) + { + // TODO: message + throw new InvalidOperationException(); + } + + if (!scanner.TryReadToken(out NumericToken mappedCode)) + { + // TODO: message + throw new InvalidOperationException(); + } + + var start = HexToken.ConvertHexBytesToInt(startHexToken); + var end = HexToken.ConvertHexBytesToInt(endHexToken); + + var range = new CidRange((char)start, (char)end, mappedCode.Int); + + ranges.Add(range); + } + + builder.CidRanges = ranges; + } + } +} diff --git a/src/UglyToad.Pdf/Fonts/Parser/Parts/CodespaceRangeParser.cs b/src/UglyToad.Pdf/Fonts/Parser/Parts/CodespaceRangeParser.cs new file mode 100644 index 00000000..f1661a50 --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/CodespaceRangeParser.cs @@ -0,0 +1,42 @@ +namespace UglyToad.Pdf.Fonts.Parser.Parts +{ + using System; + using System.Collections.Generic; + using Cmap; + using Tokenization.Scanner; + using Tokenization.Tokens; + + internal class CodespaceRangeParser : ICidFontPartParser + { + public void Parse(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing) + { + /* + * For example: + 3 begincodespacerange + <00> <80> + <8140> <9ffc> + + endcodespacerange + */ + + var ranges = new List(numeric.Int); + + for (var i = 0; i < numeric.Int; i++) + { + if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken start)) + { + throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken); + } + + if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken end)) + { + throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken); + } + + ranges.Add(new CodespaceRange(start.Bytes, end.Bytes)); + } + + builder.CodespaceRanges = ranges; + } + } +} diff --git a/src/UglyToad.Pdf/Fonts/Parser/Parts/ICidFontPartParser.cs b/src/UglyToad.Pdf/Fonts/Parser/Parts/ICidFontPartParser.cs new file mode 100644 index 00000000..0adb191d --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/ICidFontPartParser.cs @@ -0,0 +1,17 @@ +namespace UglyToad.Pdf.Fonts.Parser.Parts +{ + using Cmap; + using Tokenization.Scanner; + + /// + /// Provides parsing for a certain operator type in a CID font definition. + /// + /// The type of the token preceding the operation we wish to parse. + internal interface ICidFontPartParser + { + /// + /// Parse the definition for this part of the CID font and write the results to the . + /// + void Parse(TToken previous, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing); + } +} \ No newline at end of file diff --git a/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs b/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs index 723c056d..9de379c5 100644 --- a/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs +++ b/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs @@ -70,5 +70,19 @@ namespace UglyToad.Pdf.Tokenization.Tokens Bytes = bytes; Data = builder.ToString(); } + + public static int ConvertHexBytesToInt(HexToken token) + { + var bytes = token.Bytes; + + var value = bytes[0] & 0xFF; + if (bytes.Count == 2) + { + value <<= 8; + value += bytes[1] & 0xFF; + } + + return value; + } } } \ No newline at end of file