diff --git a/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs b/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs index 0c4f6fec..4949cf42 100644 --- a/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs +++ b/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs @@ -1,5 +1,10 @@ namespace UglyToad.Pdf.Fonts.Cmap { + using System.Collections.Generic; + using System.Linq; + using System.Text; + using Util; + /// /// A mutable class used when parsing and generating a . /// @@ -44,5 +49,39 @@ /// Defined as required. /// public int Type { get; set; } = -1; + + public IReadOnlyList CodespaceRanges { get; set; } + + public Dictionary BaseFontCharacterMap { get; } = new Dictionary(); + + public void AddBaseFontCharacter(IReadOnlyList bytes, IReadOnlyList value) + { + AddBaseFontCharacter(bytes, CreateStringFromBytes(value.ToArray())); + } + + public void AddBaseFontCharacter(IReadOnlyList bytes, string value) + { + var code = GetCodeFromArray(bytes, bytes.Count); + + BaseFontCharacterMap[code] = value; + } + + private int GetCodeFromArray(IReadOnlyList data, int length) + { + int code = 0; + for (int i = 0; i < length; i++) + { + code <<= 8; + code |= (data[i] + 256) % 256; + } + return code; + } + + private string CreateStringFromBytes(byte[] bytes) + { + return bytes.Length == 1 + ? OtherEncodings.BytesAsLatin1String(bytes) + : Encoding.BigEndianUnicode.GetString(bytes); + } } } diff --git a/src/UglyToad.Pdf/Fonts/Cmap/CmapUtils.cs b/src/UglyToad.Pdf/Fonts/Cmap/CmapUtils.cs index 0c1297d1..2eb64713 100644 --- a/src/UglyToad.Pdf/Fonts/Cmap/CmapUtils.cs +++ b/src/UglyToad.Pdf/Fonts/Cmap/CmapUtils.cs @@ -4,7 +4,7 @@ internal static class CmapUtils { - public static int ToInt(this byte[] data, int length) + public static int ToInt(this IReadOnlyList data, int length) { int code = 0; for (int i = 0; i < length; ++i) diff --git a/src/UglyToad.Pdf/Fonts/Cmap/CodespaceRange.cs b/src/UglyToad.Pdf/Fonts/Cmap/CodespaceRange.cs index e335b09a..f5aa086e 100644 --- a/src/UglyToad.Pdf/Fonts/Cmap/CodespaceRange.cs +++ b/src/UglyToad.Pdf/Fonts/Cmap/CodespaceRange.cs @@ -1,68 +1,34 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace UglyToad.Pdf.Fonts.Cmap +namespace UglyToad.Pdf.Fonts.Cmap { + using System.Collections.Generic; + /// /// A codespace range is specified by a pair of codes of some particular length giving the lower and upper bounds of that range. /// public class CodespaceRange { - private byte[] start; - private byte[] end; - private int startInt; - private int endInt; + public IReadOnlyList Start { get; } - public int CodeLength { get; private set; } + public IReadOnlyList End { get; } - /** - * Creates a new instance of CodespaceRange. - */ - public CodespaceRange() + public int StartInt { get; } + + public int EndInt { get; } + + public int CodeLength { get; } + + /// + /// Creates a new instance of . + /// + public CodespaceRange(IReadOnlyList start, IReadOnlyList end) { + Start = start; + End = end; + StartInt = start.ToInt(start.Count); + EndInt = end.ToInt(end.Count); + CodeLength = start.Count; } - - /** Getter for property end. - * @return Value of property end. - * - */ - public byte[] getEnd() - { - return end; - } - - /** Setter for property end. - * @param endBytes New value of property end. - * - */ - void setEnd(byte[] endBytes) - { - end = endBytes; - endInt = endBytes.ToInt(endBytes.Length); - } - - /** Getter for property start. - * @return Value of property start. - * - */ - public byte[] getStart() - { - return start; - } - - /** Setter for property start. - * @param startBytes New value of property start. - * - */ - void setStart(byte[] startBytes) - { - start = startBytes; - CodeLength = start.Length; - startInt = startBytes.ToInt(startBytes.Length); - } - /** * Returns true if the given code bytes match this codespace range. */ @@ -80,7 +46,7 @@ namespace UglyToad.Pdf.Fonts.Cmap if (codeLen == CodeLength) { int value = code.ToInt(codeLen); - if (value >= startInt && value <= endInt) + if (value >= StartInt && value <= EndInt) { return true; } diff --git a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs index 608733d5..323434e2 100644 --- a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs +++ b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs @@ -1,6 +1,7 @@ namespace UglyToad.Pdf.Fonts.Parser { using System; + using System.Collections.Generic; using System.Globalization; using Cmap; using Cos; @@ -27,7 +28,38 @@ { switch (operatorToken.Data) { - default: + case "usecmap": + throw new NotImplementedException("External CMap files not yet supported, please submit a pull request!"); + case "begincodespacerange": + { + if (previousToken is NumericToken numeric) + { + ParseCodespaceRange(numeric, scanner, builder); + } + else + { + throw new InvalidOperationException("Unexpected token preceding start of codespace range: " + previousToken); + } + + } + break; + case "beginbfchar": + { + if (previousToken is NumericToken numeric) + { + ParseBaseFontCharacters(numeric, scanner, builder); + } + else + { + throw new InvalidOperationException("Unexpected token preceding start of base font characters: " + previousToken); + } + } + break; + case "beginbfrange": + break; + case "begincidchar": + break; + case "begingcidrange": break; } } @@ -42,6 +74,66 @@ return null; } + private static void ParseCodespaceRange(NumericToken count, ITokenScanner tokenScanner, CharacterMapBuilder builder) + { + /* + * For example: + 3 begincodespacerange + <00> <80> + <8140> <9ffc> + + endcodespacerange + */ + + var ranges = new List(count.Int); + + for (var i = 0; i < count.Int; i++) + { + if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken start)) + { + throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken); + } + + if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken end)) + { + throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken); + } + + ranges.Add(new CodespaceRange(start.Bytes, end.Bytes)); + } + + builder.CodespaceRanges = ranges; + } + + private static void ParseBaseFontCharacters(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder) + { + for (var i = 0; i < numeric.Int; i++) + { + if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken inputCode)) + { + throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}"); + } + + if (!tokenScanner.MoveNext()) + { + throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}"); + } + + if (tokenScanner.CurrentToken is NameToken characterName) + { + builder.AddBaseFontCharacter(inputCode.Bytes, characterName.Data.Name); + } + else if (tokenScanner.CurrentToken is HexToken characterCode) + { + builder.AddBaseFontCharacter(inputCode.Bytes, characterCode.Bytes); + } + else + { + throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}"); + } + } + } + private static void ParseName(NameToken nameToken, CoreTokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing) { switch (nameToken.Data.Name) diff --git a/src/UglyToad.Pdf/Tokenization/HexTokenizer.cs b/src/UglyToad.Pdf/Tokenization/HexTokenizer.cs index 74e12b8a..3976cf60 100644 --- a/src/UglyToad.Pdf/Tokenization/HexTokenizer.cs +++ b/src/UglyToad.Pdf/Tokenization/HexTokenizer.cs @@ -1,6 +1,6 @@ namespace UglyToad.Pdf.Tokenization { - using System.Text; + using System.Collections.Generic; using IO; using Parser.Parts; using Tokens; @@ -17,8 +17,8 @@ { return false; } - - var characters = new StringBuilder(); + + var characters = new List(); while (inputBytes.MoveNext()) { @@ -39,10 +39,10 @@ return false; } - characters.Append((char)current); + characters.Add((char)current); } - token = new HexToken(characters.ToString()); + token = new HexToken(characters); return true; } diff --git a/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs b/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs index 3d0c653f..723c056d 100644 --- a/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs +++ b/src/UglyToad.Pdf/Tokenization/Tokens/HexToken.cs @@ -1,30 +1,73 @@ namespace UglyToad.Pdf.Tokenization.Tokens { - using System; using System.Collections.Generic; using System.Text; public class HexToken : IDataToken { + private static readonly Dictionary HexMap = new Dictionary + { + {'0', 0x00 }, + {'1', 0x01 }, + {'2', 0x02 }, + {'3', 0x03 }, + {'4', 0x04 }, + {'5', 0x05 }, + {'6', 0x06 }, + {'7', 0x07 }, + {'8', 0x08 }, + {'9', 0x09 }, + + {'A', 0x0A }, + {'a', 0x0A }, + {'B', 0x0B }, + {'b', 0x0B }, + {'C', 0x0C }, + {'c', 0x0C }, + {'D', 0x0D }, + {'d', 0x0D }, + {'E', 0x0E }, + {'e', 0x0E }, + {'F', 0x0F }, + {'f', 0x0F } + }; + + private static byte Convert(char high, char low) + { + var highByte = HexMap[high]; + var lowByte = HexMap[low]; + + return (byte)(highByte << 4 | lowByte); + } + public string Data { get; } public IReadOnlyList Bytes { get; } - public HexToken(string characters) + public HexToken(IReadOnlyList characters) { - if (characters.Length % 2 != 0) - { - characters += "0"; - } - + var bytes = new List(); var builder = new StringBuilder(); - byte[] raw = new byte[characters.Length / 2]; - for (int i = 0; i < raw.Length; i++) + + for (int i = 0; i < characters.Count; i += 2) { - builder.Append((char)Convert.ToByte(characters.Substring(i * 2, 2), 16)); + char high = characters[i]; + char low; + if (i == characters.Count - 1) + { + low = '0'; + } + else + { + low = characters[i + 1]; + } + + var b = Convert(high, low); + bytes.Add(b); + builder.Append((char)b); } - Bytes = raw; + Bytes = bytes; Data = builder.ToString(); } }