diff --git a/src/UglyToad.Pdf.Tests/Fonts/Parser/CMapParserTests.cs b/src/UglyToad.Pdf.Tests/Fonts/Parser/CMapParserTests.cs new file mode 100644 index 00000000..c74aa145 --- /dev/null +++ b/src/UglyToad.Pdf.Tests/Fonts/Parser/CMapParserTests.cs @@ -0,0 +1,50 @@ +namespace UglyToad.Pdf.Tests.Fonts.Parser +{ + using Pdf.Fonts.Parser; + using Xunit; + + public class CMapParserTests + { + private const string GoogleDocToUnicodeCmap = @"/CIDInit /ProcSet findresource begin +12 dict begin +begincmap +/CIDSystemInfo +<< /Registry (Adobe) +/Ordering (UCS) +/Supplement 0 +>> def +/CMapName /Adobe-Identity-UCS def +/CMapType 2 def +1 begincodespacerange +<0000> +endcodespacerange +6 beginbfchar +<0003> <0020> +<0011> <002E> +<0024> <0041> +<0037> <0054> +<0044> <0061> +<005B> <0078> +endbfchar +4 beginbfrange +<0046> <0049> <0063> +<004B> <004C> <0068> +<004F> <0052> <006C> +<0055> <0058> <0072> +endbfrange +endcmap +CMapName currentdict /CMap defineresource pop +end +end"; + + private readonly CMapParser cMapParser = new CMapParser(); + + [Fact] + public void CanParseCMap() + { + var input = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false); + + var cmap = cMapParser.Parse(input.Bytes, false); + } + } +} diff --git a/src/UglyToad.Pdf.Tests/Tokenization/CommentTokenizerTests.cs b/src/UglyToad.Pdf.Tests/Tokenization/CommentTokenizerTests.cs new file mode 100644 index 00000000..e0dfcdca --- /dev/null +++ b/src/UglyToad.Pdf.Tests/Tokenization/CommentTokenizerTests.cs @@ -0,0 +1,47 @@ +namespace UglyToad.Pdf.Tests.Tokenization +{ + using Pdf.Tokenization; + using Pdf.Tokenization.Tokens; + using Xunit; + + public class CommentTokenizerTests + { + private readonly CommentTokenizer commentTokenizer = new CommentTokenizer(); + + [Theory] + [InlineData("(%not a comment)")] + [InlineData("\\%not a comment)")] + [InlineData("‰")] + public void InvalidFirstCharacter_ReturnsFalse(string s) + { + var input = StringBytesTestConverter.Convert(s); + + var result = 
commentTokenizer.TryTokenize(input.First, input.Bytes, out var token); + + Assert.False(result); + Assert.Null(token); + } + + [Theory] + [InlineData(@"%Resource-CMAP +%AnotherComment", "Resource-CMAP")] + [InlineData("%%PDF 1.5", "%PDF 1.5")] + [InlineData(@"% comment {/%) blah blah blah + 123", " comment {/%) blah blah blah")] + [InlineData("%comment\rNot comment", "comment")] + [InlineData("%comment\r\nNot comment", "comment")] + [InlineData("%comment\nNot comment", "comment")] + public void ParsesComment(string s, string expected) + { + var input = StringBytesTestConverter.Convert(s); + + var result = commentTokenizer.TryTokenize(input.First, input.Bytes, out var token); + + Assert.True(result); + + var comment = Assert.IsType(token); + + Assert.Equal(expected, comment.Data); + } + } +} diff --git a/src/UglyToad.Pdf.Tests/Tokenization/Tokens/DictionaryTokenTests.cs b/src/UglyToad.Pdf.Tests/Tokenization/Tokens/DictionaryTokenTests.cs new file mode 100644 index 00000000..fbf68f7f --- /dev/null +++ b/src/UglyToad.Pdf.Tests/Tokenization/Tokens/DictionaryTokenTests.cs @@ -0,0 +1,78 @@ +// ReSharper disable ObjectCreationAsStatement +namespace UglyToad.Pdf.Tests.Tokenization.Tokens +{ + using System; + using System.Collections.Generic; + using Pdf.Cos; + using Pdf.Tokenization.Tokens; + using Xunit; + + public class DictionaryTokenTests + { + [Fact] + public void NullDictionaryThrows() + { + Action action = () => new DictionaryToken(null); + + Assert.Throws(action); + } + + [Fact] + public void EmptyDictionaryValid() + { + var dictionary = new DictionaryToken(new Dictionary()); + + Assert.Empty(dictionary.Data); + } + + [Fact] + public void TryGetByName_EmptyDictionary() + { + var dictionary = new DictionaryToken(new Dictionary()); + + var result = dictionary.TryGetByName(CosName.ACTUAL_TEXT, out var token); + + Assert.False(result); + Assert.Null(token); + } + + [Fact] + public void TryGetByName_NullName_Throws() + { + var dictionary = new DictionaryToken(new 
Dictionary()); + + Action action = () => dictionary.TryGetByName(null, out var _); + + Assert.Throws(action); + } + + [Fact] + public void TryGetByName_NonEmptyDictionaryNotContainingKey() + { + var dictionary = new DictionaryToken(new Dictionary + { + { new NameToken("Registry"), new StringToken("None") } + }); + + var result = dictionary.TryGetByName(CosName.ACTUAL_TEXT, out var token); + + Assert.False(result); + Assert.Null(token); + } + + [Fact] + public void TryGetByName_ContainingKey() + { + var dictionary = new DictionaryToken(new Dictionary + { + { new NameToken("Fish"), new NumericToken(420) }, + { new NameToken("Registry"), new StringToken("None") } + }); + + var result = dictionary.TryGetByName(CosName.REGISTRY, out var token); + + Assert.True(result); + Assert.Equal("None", Assert.IsType(token).Data); + } + } +} diff --git a/src/UglyToad.Pdf/Content/Page.cs b/src/UglyToad.Pdf/Content/Page.cs index 6152c0d9..714ed503 100644 --- a/src/UglyToad.Pdf/Content/Page.cs +++ b/src/UglyToad.Pdf/Content/Page.cs @@ -41,6 +41,11 @@ var pee = new TextSectionParser(new NoOpLog()).ReadTextObjects(new ByteTextScanner(rw)); var font0 = parsingArguments.CachingProviders.ObjectPool.Get(new CosObjectKey(16, 0)); var cmpa = parsingArguments.CachingProviders.ObjectPool.Get(new CosObjectKey(9, 0)); + var toad = parsingArguments.Container.Get() + .Parse(parsingArguments, new CosObjectKey(9, 0), false); + var bigsby = (toad as RawCosStream).Decode(parsingArguments.Container.Get()); + + var ssss = OtherEncodings.BytesAsLatin1String(bigsby); } } diff --git a/src/UglyToad.Pdf/Fonts/Cmap/CharacterIdentifierSystemInfo.cs b/src/UglyToad.Pdf/Fonts/Cmap/CharacterIdentifierSystemInfo.cs new file mode 100644 index 00000000..8c3387e4 --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Cmap/CharacterIdentifierSystemInfo.cs @@ -0,0 +1,30 @@ +namespace UglyToad.Pdf.Fonts.Cmap +{ + /// + /// Specifies the character collection associated with the (CIDFont). 
+ /// + public struct CharacterIdentifierSystemInfo + { + /// + /// Identifies the issuer of the character collection. + /// + public string Registry { get; } + + /// + /// Uniquely identifies the character collection within the parent registry. + /// + public string Ordering { get; } + + /// + /// The supplement number of the character collection. + /// + public int Supplement { get; } + + public CharacterIdentifierSystemInfo(string registry, string ordering, int supplement) + { + Registry = registry; + Ordering = ordering; + Supplement = supplement; + } + } +} \ No newline at end of file diff --git a/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs b/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs new file mode 100644 index 00000000..0c4f6fec --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Cmap/CharacterMapBuilder.cs @@ -0,0 +1,48 @@ +namespace UglyToad.Pdf.Fonts.Cmap +{ + /// + /// A mutable class used when parsing and generating a . + /// + internal class CharacterMapBuilder + { + /// + /// Defines the character collection associated CIDFont/s for this CMap. + /// + public CharacterIdentifierSystemInfo CharacterIdentifierSystemInfo { get; set; } + + /// + /// An that determines the writing mode for any CIDFont combined with this CMap. + /// 0: Horizontal + /// 1: Vertical + /// + /// + /// Defined as optional. + /// + public int WMode { get; set; } = 0; + + /// + /// The PostScript name of the CMap. + /// + /// + /// Defined as required. + /// + public string Name { get; set; } + + /// + /// Defines the version of this CIDFont file. + /// + /// + /// Defined as optional. + /// + public string Version { get; set; } + + /// + /// Defines changes to the internal structure of Character Map files + /// or operator semantics. + /// + /// + /// Defined as required. 
+ /// + public int Type { get; set; } = -1; + } +} diff --git a/src/UglyToad.Pdf/Fonts/CompositeFont.cs b/src/UglyToad.Pdf/Fonts/CompositeFont.cs index ae9275f7..4d544bf5 100644 --- a/src/UglyToad.Pdf/Fonts/CompositeFont.cs +++ b/src/UglyToad.Pdf/Fonts/CompositeFont.cs @@ -4,6 +4,7 @@ using System.Text; namespace UglyToad.Pdf.Fonts { + using Cmap; using Cos; public class CompositeFont @@ -132,32 +133,4 @@ namespace UglyToad.Pdf.Fonts { } - - /// - /// Specifies the character collection associated with the (CIDFont). - /// - public struct CharacterIdentifierSystemInfo - { - /// - /// Identifies the issuer of the character collection. - /// - public string Registry { get; } - - /// - /// Uniquely identifies the character collection within the parent registry. - /// - public string Ordering { get; } - - /// - /// The supplement number of the character collection. - /// - public int Supplement { get; } - - public CharacterIdentifierSystemInfo(string registry, string ordering, int supplement) - { - Registry = registry; - Ordering = ordering; - Supplement = supplement; - } - } } diff --git a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs index 2b03b617..608733d5 100644 --- a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs +++ b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs @@ -1,710 +1,171 @@ -//using System; -//using System.Collections.Generic; -//using System.Text; -//using UglyToad.Pdf.Fonts.Cmap; +namespace UglyToad.Pdf.Fonts.Parser +{ + using System; + using System.Globalization; + using Cmap; + using Cos; + using IO; + using Tokenization.Scanner; + using Tokenization.Tokens; + using Util.JetBrains.Annotations; -//namespace UglyToad.Pdf.Fonts.Parser -//{ -// using Cmap; -// using Text; + public class CMapParser + { + public CMap Parse(IInputBytes inputBytes, bool isLenientParsing) + { + var scanner = new CoreTokenScanner(inputBytes); -// internal class CMapParser -// { -// private static readonly String MARK_END_OF_DICTIONARY = 
">>"; -// private static readonly String MARK_END_OF_ARRAY = "]"; + var builder = new CharacterMapBuilder(); + var result = new CMap(); -// private readonly byte[] tokenParserByteBuffer = new byte[512]; + IToken previousToken = null; + while (scanner.MoveNext()) + { + var token = scanner.CurrentToken; -// /** -// * Creates a new instance of CMapParser. -// */ -// public CMapParser() -// { -// } + if (token is OperatorToken operatorToken) + { + switch (operatorToken.Data) + { + default: + break; + } + } + else if (token is NameToken name) + { + ParseName(name, scanner, builder, isLenientParsing); + } -///** -// * Parses a predefined CMap. -// * -// * @param name CMap name. -// * @return The parsed predefined CMap as a java object, never null. -// * @throws IOException If the CMap could not be parsed. -// */ -//public CMap parsePredefined(String name) -//{ -// try (InputStream input = getExternalCMap(name)) -// { -// return parse(input); -// } -//} + previousToken = token; + } -///** -// * This will parse the stream and create a cmap object. -// * -// * @param input The CMAP stream to parse. -// * @return The parsed stream as a java object, never null. -// * @throws IOException If there is an error parsing the stream. 
-// */ -//public CMap parse(InputStream input) -//{ -// PushbackInputStream cmapStream = new PushbackInputStream(input); -//CMap result = new CMap(); -//Object previousToken = null; -//Object token; -// while ((token = parseNextToken(cmapStream)) != null) -// { -// if (token instanceof Operator) -// { -// Operator op = (Operator)token; -// if (op.op.equals("endcmap")) -// { -// // end of CMap reached, stop reading as there isn't any interesting info anymore -// break; -// } + return null; + } -// switch (op.op) -// { -// case "usecmap": -// parseUsecmap((LiteralName) previousToken, result); -// break; -// case "begincodespacerange": -// parseBegincodespacerange((Number) previousToken, cmapStream, result); -// break; -// case "beginbfchar": -// parseBeginbfchar((Number) previousToken, cmapStream, result); -// break; -// case "beginbfrange": -// parseBeginbfrange((Number) previousToken, cmapStream, result); -// break; -// case "begincidchar": -// parseBegincidchar((Number) previousToken, cmapStream, result); -// break; -// case "begincidrange": -// parseBegincidrange((Integer) previousToken, cmapStream, result); -// break; -// default: -// break; -// } -// } -// else if (token instanceof LiteralName) -// { -// parseLiteralName((LiteralName) token, cmapStream, result); -// } -// previousToken = token; -// } -// return result; -// } + private static void ParseName(NameToken nameToken, CoreTokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing) + { + switch (nameToken.Data.Name) + { + case "WMode": + { + var next = TryMoveNext(scanner); + if (next is NumericToken numeric) + { + builder.WMode = numeric.Int; + } + break; + } + case "CMapName": + { + var next = TryMoveNext(scanner); + if (next is NameToken name) + { + builder.Name = name.Data.Name; + } + break; + } + case "CMapVersion": + { + var next = TryMoveNext(scanner); + if (next is NumericToken number) + { + builder.Version = number.Data.ToString(NumberFormatInfo.InvariantInfo); + } + else if (next 
is StringToken stringToken) + { + builder.Version = stringToken.Data; + } + break; + } + case "CMapType": + { + var next = TryMoveNext(scanner); + if (next is NumericToken numeric) + { + builder.Type = numeric.Int; + } + break; + } + case "Registry": + { + throw new NotImplementedException("Registry should be in a dictionary"); + } + case "Ordering": + { + throw new NotImplementedException("Ordering should be in a dictionary"); + } + case "Supplement": + { + throw new NotImplementedException("Supplement should be in a dictionary"); + } + case "CIDSystemInfo": + { + var next = TryMoveNext(scanner); -// private void parseUsecmap(LiteralName useCmapName, CMap result) -//{ -// InputStream useStream = getExternalCMap(useCmapName.name); -// CMap useCMap = parse(useStream); -// result.useCmap(useCMap); -//} + if (next is DictionaryToken dictionary) + { + builder.CharacterIdentifierSystemInfo = GetCharacterIdentifier(dictionary, isLenientParsing); + } + break; + } + } + } -//private void parseLiteralName(LiteralName literal, PushbackInputStream cmapStream, CMap result) -//{ -// switch (literal.name) -// { -// case "WMode": -// { -// Object next = parseNextToken(cmapStream); -// if (next instanceof Integer) -// { -// result.setWMode((Integer)next); -// } -// break; -// } -// case "CMapName": -// { -// Object next = parseNextToken(cmapStream); -// if (next instanceof LiteralName) -// { -// result.setName(((LiteralName)next).name); -// } -// break; -// } -// case "CMapVersion": -// { -// Object next = parseNextToken(cmapStream); -// if (next instanceof Number) -// { -// result.setVersion(next.toString()); -// } -// else if (next instanceof String) -// { -// result.setVersion((String)next); -// } -// break; -// } -// case "CMapType": -// { -// Object next = parseNextToken(cmapStream); -// if (next instanceof Integer) -// { -// result.setType((Integer)next); -// } -// break; -// } -// case "Registry": -// { -// Object next = parseNextToken(cmapStream); -// if (next instanceof 
String) -// { -// result.setRegistry((String)next); -// } -// break; -// } -// case "Ordering": -// { -// Object next = parseNextToken(cmapStream); -// if (next instanceof String) -// { -// result.setOrdering((String)next); -// } -// break; -// } -// case "Supplement": -// { -// Object next = parseNextToken(cmapStream); -// if (next instanceof Integer) -// { -// result.setSupplement((Integer)next); -// } -// break; -// } -// default: -// break; -// } -//} + private static CharacterIdentifierSystemInfo GetCharacterIdentifier(DictionaryToken dictionary, bool isLenientParsing) + { + string GetErrorMessage(string missingKey) + { + return $"No {missingKey} found in the CIDSystemInfo dictionary: " + dictionary; + } -//private void parseBegincodespacerange(Number cosCount, PushbackInputStream cmapStream, CMap result) -//{ -// for (int j = 0; j < cosCount.intValue(); j++) -// { -// Object nextToken = parseNextToken(cmapStream); -// if (nextToken instanceof Operator) -// { -// if (!((Operator)nextToken).op.equals("endcodespacerange")) -// { -// throw new IOException("Error : ~codespacerange contains an unexpected operator : " -// + ((Operator)nextToken).op); -// } -// break; -// } -// byte[] startRange = (byte[])nextToken; -// byte[] endRange = (byte[])parseNextToken(cmapStream); -// CodespaceRange range = new CodespaceRange(); -// range.setStart(startRange); -// range.setEnd(endRange); -// result.addCodespaceRange(range); -// } -//} + if (!dictionary.TryGetByName(CosName.REGISTRY, out var registry) || !(registry is StringToken registryString)) + { + if (isLenientParsing) + { + registryString = new StringToken("Adobe"); + } + else + { + throw new InvalidOperationException(GetErrorMessage("registry")); + } + } -//private void parseBeginbfchar(Number cosCount, PushbackInputStream cmapStream, CMap result) -//{ -// for (int j = 0; j < cosCount.intValue(); j++) -// { -// Object nextToken = parseNextToken(cmapStream); -// if (nextToken instanceof Operator) -// { -// if 
(!((Operator)nextToken).op.equals("endbfchar")) -// { -// throw new IOException("Error : ~bfchar contains an unexpected operator : " -// + ((Operator)nextToken).op); -// } -// break; -// } -// byte[] inputCode = (byte[])nextToken; -// nextToken = parseNextToken(cmapStream); -// if (nextToken instanceof byte[]) -// { -// byte[] bytes = (byte[])nextToken; -// String value = createStringFromBytes(bytes); -// result.addCharMapping(inputCode, value); -// } -// else if (nextToken instanceof LiteralName) -// { -// result.addCharMapping(inputCode, ((LiteralName)nextToken).name); -// } -// else -// { -// throw new IOException("Error parsing CMap beginbfchar, expected{COSString " -// + "or COSName} and not " + nextToken); -// } -// } -//} + if (!dictionary.TryGetByName(CosName.ORDERING, out var ordering) || !(ordering is StringToken orderingString)) + { + if (isLenientParsing) + { + orderingString = new StringToken(""); + } + else + { + throw new InvalidOperationException(GetErrorMessage("ordering")); + } + } -//private void parseBegincidrange(int numberOfLines, PushbackInputStream cmapStream, CMap result) -//{ -// for (int n = 0; n < numberOfLines; n++) -// { -// Object nextToken = parseNextToken(cmapStream); -// if (nextToken instanceof Operator) -// { -// if (!((Operator)nextToken).op.equals("endcidrange")) -// { -// throw new IOException("Error : ~cidrange contains an unexpected operator : " -// + ((Operator)nextToken).op); -// } -// break; -// } -// byte[] startCode = (byte[])nextToken; -// int start = createIntFromBytes(startCode); -// byte[] endCode = (byte[])parseNextToken(cmapStream); -// int end = createIntFromBytes(endCode); -// int mappedCode = (Integer)parseNextToken(cmapStream); -// if (startCode.length <= 2 && endCode.length <= 2) -// { -// result.addCIDRange((char)start, (char)end, mappedCode); -// } -// else -// { -// // TODO Is this even possible? 
-// int endOfMappings = mappedCode + end - start; -// while (mappedCode <= endOfMappings) -// { -// int mappedCID = createIntFromBytes(startCode); -// result.addCIDMapping(mappedCode++, mappedCID); -// increment(startCode); -// } -// } -// } -//} + if (!dictionary.TryGetByName(CosName.SUPPLEMENT, out var supplement) || !(supplement is NumericToken supplementNumeric)) + { + if (isLenientParsing) + { + supplementNumeric = new NumericToken(0); + } + else + { + throw new InvalidOperationException(GetErrorMessage("supplement")); + } + } -//private void parseBegincidchar(Number cosCount, PushbackInputStream cmapStream, CMap result) -//{ -// for (int j = 0; j < cosCount.intValue(); j++) -// { -// Object nextToken = parseNextToken(cmapStream); -// if (nextToken instanceof Operator) -// { -// if (!((Operator)nextToken).op.equals("endcidchar")) -// { -// throw new IOException("Error : ~cidchar contains an unexpected operator : " -// + ((Operator)nextToken).op); -// } -// break; -// } -// byte[] inputCode = (byte[])nextToken; -// int mappedCode = (Integer)parseNextToken(cmapStream); -// int mappedCID = createIntFromBytes(inputCode); -// result.addCIDMapping(mappedCode, mappedCID); -// } -//} + return new CharacterIdentifierSystemInfo(registryString.Data, orderingString.Data, supplementNumeric.Int); + } -//private void parseBeginbfrange(Number cosCount, PushbackInputStream cmapStream, CMap result) -//{ -// for (int j = 0; j < cosCount.intValue(); j++) -// { -// Object nextToken = parseNextToken(cmapStream); -// if (nextToken instanceof Operator) -// { -// if (!((Operator)nextToken).op.equals("endbfrange")) -// { -// throw new IOException("Error : ~bfrange contains an unexpected operator : " -// + ((Operator)nextToken).op); -// } -// break; -// } -// byte[] startCode = (byte[])nextToken; -// byte[] endCode = (byte[])parseNextToken(cmapStream); -// nextToken = parseNextToken(cmapStream); -// List array = null; -// byte[] tokenBytes; -// if (nextToken instanceof List) -// { -// 
array = (List)nextToken; -// if (array.isEmpty()) -// { -// continue; -// } -// tokenBytes = array.get(0); -// } -// else -// { -// tokenBytes = (byte[])nextToken; -// } -// if (tokenBytes == null || tokenBytes.length == 0) -// { -// // PDFBOX-3450: ignore <> -// // PDFBOX-3807: ignore null -// continue; -// } -// boolean done = false; + [CanBeNull] + private static IToken TryMoveNext(ITokenScanner scanner) + { + if (!scanner.MoveNext()) + { + return null; + } -// int arrayIndex = 0; -// while (!done) -// { -// if (compare(startCode, endCode) >= 0) -// { -// done = true; -// } -// String value = createStringFromBytes(tokenBytes); -// result.addCharMapping(startCode, value); -// increment(startCode); - -// if (array == null) -// { -// increment(tokenBytes); -// } -// else -// { -// arrayIndex++; -// if (arrayIndex < array.size()) -// { -// tokenBytes = array.get(arrayIndex); -// } -// } -// } -// } -//} - -///** -// * Returns an input stream containing the given "use" CMap. -// * -// * @param name Name of the given "use" CMap resource. -// * @throws IOException if the CMap resource doesn't exist or if there is an error opening its -// * stream. 
-// */ -//protected InputStream getExternalCMap(String name) -//{ -// URL url = getClass().getResource(name); -// if (url == null) -// { -// throw new IOException("Error: Could not find referenced cmap stream " + name); -// } -// return url.openStream(); -//} - -//private Object parseNextToken(PushbackInputStream is) -//{ -// Object retval = null; -// int nextByte = is.read(); -// // skip whitespace -// while (nextByte == 0x09 || nextByte == 0x20 || nextByte == 0x0D || nextByte == 0x0A) -// { -// nextByte = is.read(); -// } -// switch (nextByte) -// { -// case '%': -// { -// // header operations, for now return the entire line -// // may need to smarter in the future -// StringBuilder buffer = new StringBuilder(); -// buffer.append((char)nextByte); -// readUntilEndOfLine(is, buffer); -// retval = buffer.toString(); -// break; -// } -// case '(': -// { -// StringBuilder buffer = new StringBuilder(); -// int stringByte = is.read(); - -// while (stringByte != -1 && stringByte != ')') -// { -// buffer.append((char)stringByte); -// stringByte = is.read(); -// } -// retval = buffer.toString(); -// break; -// } -// case '>': -// { -// int secondCloseBrace = is.read(); -// if (secondCloseBrace == '>') -// { -// retval = MARK_END_OF_DICTIONARY; -// } -// else -// { -// throw new IOException("Error: expected the end of a dictionary."); -// } -// break; -// } -// case ']': -// { -// retval = MARK_END_OF_ARRAY; -// break; -// } -// case '[': -// { -// List list = new ArrayList<>(); - -// Object nextToken = parseNextToken(is); -// while (nextToken != null && !MARK_END_OF_ARRAY.equals(nextToken)) -// { -// list.add(nextToken); -// nextToken = parseNextToken(is); -// } -// retval = list; -// break; -// } -// case '<': -// { -// int theNextByte = is.read(); -// if (theNextByte == '<') -// { -// Map result = new HashMap<>(); -// // we are reading a dictionary -// Object key = parseNextToken(is); -// while (key instanceof LiteralName && !MARK_END_OF_DICTIONARY.equals(key)) -// { -// 
Object value = parseNextToken(is); -// result.put(((LiteralName)key).name, value); -// key = parseNextToken(is); -// } -// retval = result; -// } -// else -// { -// // won't read more than 512 bytes - -// int multiplyer = 16; -// int bufferIndex = -1; -// while (theNextByte != -1 && theNextByte != '>') -// { -// int intValue = 0; -// if (theNextByte >= '0' && theNextByte <= '9') -// { -// intValue = theNextByte - '0'; -// } -// else if (theNextByte >= 'A' && theNextByte <= 'F') -// { -// intValue = 10 + theNextByte - 'A'; -// } -// else if (theNextByte >= 'a' && theNextByte <= 'f') -// { -// intValue = 10 + theNextByte - 'a'; -// } -// // all kind of whitespaces may occur in malformed CMap files -// // see PDFBOX-2035 -// else if (isWhitespaceOrEOF(theNextByte)) -// { -// // skipping whitespaces -// theNextByte = is.read(); -// continue; -// } -// else -// { -// throw new IOException("Error: expected hex character and not " + (char)theNextByte + ":" -// + theNextByte); -// } -// intValue *= multiplyer; -// if (multiplyer == 16) -// { -// bufferIndex++; -// tokenParserByteBuffer[bufferIndex] = 0; -// multiplyer = 1; -// } -// else -// { -// multiplyer = 16; -// } -// tokenParserByteBuffer[bufferIndex] += intValue; -// theNextByte = is.read(); -// } -// byte[] finalResult = new byte[bufferIndex + 1]; -// System.arraycopy(tokenParserByteBuffer, 0, finalResult, 0, bufferIndex + 1); -// retval = finalResult; -// } -// break; -// } -// case '/': -// { -// StringBuilder buffer = new StringBuilder(); -// int stringByte = is.read(); - -// while (!isWhitespaceOrEOF(stringByte) && !isDelimiter(stringByte)) -// { -// buffer.append((char)stringByte); -// stringByte = is.read(); -// } -// if (isDelimiter(stringByte)) -// { -// is.unread(stringByte); -// } -// retval = new LiteralName(buffer.toString()); -// break; -// } -// case -1: -// { -// // EOF returning null -// break; -// } -// case '0': -// case '1': -// case '2': -// case '3': -// case '4': -// case '5': -// case '6': 
-// case '7': -// case '8': -// case '9': -// { -// StringBuilder buffer = new StringBuilder(); -// buffer.append((char)nextByte); -// nextByte = is.read(); - -// while (!isWhitespaceOrEOF(nextByte) && (Character.isDigit((char)nextByte) || nextByte == '.')) -// { -// buffer.append((char)nextByte); -// nextByte = is.read(); -// } -// is.unread(nextByte); -// String value = buffer.toString(); -// if (value.indexOf('.') >= 0) -// { -// retval = Double.valueOf(value); -// } -// else -// { -// retval = Integer.valueOf(value); -// } -// break; -// } -// default: -// { -// StringBuilder buffer = new StringBuilder(); -// buffer.append((char)nextByte); -// nextByte = is.read(); - -// // newline separator may be missing in malformed CMap files -// // see PDFBOX-2035 -// while (!isWhitespaceOrEOF(nextByte) && !isDelimiter(nextByte) && !Character.isDigit(nextByte)) -// { -// buffer.append((char)nextByte); -// nextByte = is.read(); -// } -// if (isDelimiter(nextByte) || Character.isDigit(nextByte)) -// { -// is.unread(nextByte); -// } -// retval = new Operator(buffer.toString()); - -// break; -// } -// } -// return retval; -//} - -//private void readUntilEndOfLine(InputStream is, StringBuilder buf) -//{ -// int nextByte = is.read(); -// while (nextByte != -1 && nextByte != 0x0D && nextByte != 0x0A) -// { -// buf.append((char)nextByte); -// nextByte = is.read(); -// } -//} - -//private boolean isWhitespaceOrEOF(int aByte) -//{ -// return aByte == -1 || aByte == 0x20 || aByte == 0x0D || aByte == 0x0A; -//} - -///** Is this a standard PDF delimiter character? 
*/ -//private boolean isDelimiter(int aByte) -//{ -// switch (aByte) -// { -// case '(': -// case ')': -// case '<': -// case '>': -// case '[': -// case ']': -// case '{': -// case '}': -// case '/': -// case '%': -// return true; -// default: -// return false; -// } -//} - -//private void increment(byte[] data) -//{ -// increment(data, data.length - 1); -//} - -//private void increment(byte[] data, int position) -//{ -// if (position > 0 && (data[position] & 0xFF) == 255) -// { -// data[position] = 0; -// increment(data, position - 1); -// } -// else -// { -// data[position] = (byte)(data[position] + 1); -// } -//} - -//private int createIntFromBytes(byte[] bytes) -//{ -// int intValue = bytes[0] & 0xFF; -// if (bytes.length == 2) -// { -// intValue <<= 8; -// intValue += bytes[1] & 0xFF; -// } -// return intValue; -//} - -//private String createStringFromBytes(byte[] bytes) -//{ -// return new String(bytes, bytes.length == 1 ? Charsets.ISO_8859_1 : Charsets.UTF_16BE); -// } - -// private int compare(byte[] first, byte[] second) -//{ -// for (int i = 0; i < first.length; i++) -// { -// if (first[i] == second[i]) -// { -// continue; -// } - -// if ((first[i] & 0xFF) < (second[i] & 0xFF)) -// { -// return -1; -// } -// else -// { -// return 1; -// } -// } -// return 0; -//} - -///** -// * Internal class. -// */ -//private static final class LiteralName -//{ -// private String name; - -// private LiteralName(String theName) -// { -// name = theName; -// } -//} - -///** -// * Internal class. 
-// */ -//private static final class Operator -//{ -// private String op; - -// private Operator(String theOp) -// { -// op = theOp; -// } -//} -// } -//} + return scanner.CurrentToken; + } + } +} diff --git a/src/UglyToad.Pdf/Fonts/Parser/CMapParserJava.cs b/src/UglyToad.Pdf/Fonts/Parser/CMapParserJava.cs new file mode 100644 index 00000000..2b03b617 --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Parser/CMapParserJava.cs @@ -0,0 +1,710 @@ +//using System; +//using System.Collections.Generic; +//using System.Text; +//using UglyToad.Pdf.Fonts.Cmap; + +//namespace UglyToad.Pdf.Fonts.Parser +//{ +// using Cmap; +// using Text; + +// internal class CMapParser +// { +// private static readonly String MARK_END_OF_DICTIONARY = ">>"; +// private static readonly String MARK_END_OF_ARRAY = "]"; + +// private readonly byte[] tokenParserByteBuffer = new byte[512]; + +// /** +// * Creates a new instance of CMapParser. +// */ +// public CMapParser() +// { +// } + +///** +// * Parses a predefined CMap. +// * +// * @param name CMap name. +// * @return The parsed predefined CMap as a java object, never null. +// * @throws IOException If the CMap could not be parsed. +// */ +//public CMap parsePredefined(String name) +//{ +// try (InputStream input = getExternalCMap(name)) +// { +// return parse(input); +// } +//} + +///** +// * This will parse the stream and create a cmap object. +// * +// * @param input The CMAP stream to parse. +// * @return The parsed stream as a java object, never null. +// * @throws IOException If there is an error parsing the stream. 
+// */ +//public CMap parse(InputStream input) +//{ +// PushbackInputStream cmapStream = new PushbackInputStream(input); +//CMap result = new CMap(); +//Object previousToken = null; +//Object token; +// while ((token = parseNextToken(cmapStream)) != null) +// { +// if (token instanceof Operator) +// { +// Operator op = (Operator)token; +// if (op.op.equals("endcmap")) +// { +// // end of CMap reached, stop reading as there isn't any interesting info anymore +// break; +// } + +// switch (op.op) +// { +// case "usecmap": +// parseUsecmap((LiteralName) previousToken, result); +// break; +// case "begincodespacerange": +// parseBegincodespacerange((Number) previousToken, cmapStream, result); +// break; +// case "beginbfchar": +// parseBeginbfchar((Number) previousToken, cmapStream, result); +// break; +// case "beginbfrange": +// parseBeginbfrange((Number) previousToken, cmapStream, result); +// break; +// case "begincidchar": +// parseBegincidchar((Number) previousToken, cmapStream, result); +// break; +// case "begincidrange": +// parseBegincidrange((Integer) previousToken, cmapStream, result); +// break; +// default: +// break; +// } +// } +// else if (token instanceof LiteralName) +// { +// parseLiteralName((LiteralName) token, cmapStream, result); +// } +// previousToken = token; +// } +// return result; +// } + +// private void parseUsecmap(LiteralName useCmapName, CMap result) +//{ +// InputStream useStream = getExternalCMap(useCmapName.name); +// CMap useCMap = parse(useStream); +// result.useCmap(useCMap); +//} + +//private void parseLiteralName(LiteralName literal, PushbackInputStream cmapStream, CMap result) +//{ +// switch (literal.name) +// { +// case "WMode": +// { +// Object next = parseNextToken(cmapStream); +// if (next instanceof Integer) +// { +// result.setWMode((Integer)next); +// } +// break; +// } +// case "CMapName": +// { +// Object next = parseNextToken(cmapStream); +// if (next instanceof LiteralName) +// { +// 
result.setName(((LiteralName)next).name); +// } +// break; +// } +// case "CMapVersion": +// { +// Object next = parseNextToken(cmapStream); +// if (next instanceof Number) +// { +// result.setVersion(next.toString()); +// } +// else if (next instanceof String) +// { +// result.setVersion((String)next); +// } +// break; +// } +// case "CMapType": +// { +// Object next = parseNextToken(cmapStream); +// if (next instanceof Integer) +// { +// result.setType((Integer)next); +// } +// break; +// } +// case "Registry": +// { +// Object next = parseNextToken(cmapStream); +// if (next instanceof String) +// { +// result.setRegistry((String)next); +// } +// break; +// } +// case "Ordering": +// { +// Object next = parseNextToken(cmapStream); +// if (next instanceof String) +// { +// result.setOrdering((String)next); +// } +// break; +// } +// case "Supplement": +// { +// Object next = parseNextToken(cmapStream); +// if (next instanceof Integer) +// { +// result.setSupplement((Integer)next); +// } +// break; +// } +// default: +// break; +// } +//} + +//private void parseBegincodespacerange(Number cosCount, PushbackInputStream cmapStream, CMap result) +//{ +// for (int j = 0; j < cosCount.intValue(); j++) +// { +// Object nextToken = parseNextToken(cmapStream); +// if (nextToken instanceof Operator) +// { +// if (!((Operator)nextToken).op.equals("endcodespacerange")) +// { +// throw new IOException("Error : ~codespacerange contains an unexpected operator : " +// + ((Operator)nextToken).op); +// } +// break; +// } +// byte[] startRange = (byte[])nextToken; +// byte[] endRange = (byte[])parseNextToken(cmapStream); +// CodespaceRange range = new CodespaceRange(); +// range.setStart(startRange); +// range.setEnd(endRange); +// result.addCodespaceRange(range); +// } +//} + +//private void parseBeginbfchar(Number cosCount, PushbackInputStream cmapStream, CMap result) +//{ +// for (int j = 0; j < cosCount.intValue(); j++) +// { +// Object nextToken = parseNextToken(cmapStream); +// 
if (nextToken instanceof Operator) +// { +// if (!((Operator)nextToken).op.equals("endbfchar")) +// { +// throw new IOException("Error : ~bfchar contains an unexpected operator : " +// + ((Operator)nextToken).op); +// } +// break; +// } +// byte[] inputCode = (byte[])nextToken; +// nextToken = parseNextToken(cmapStream); +// if (nextToken instanceof byte[]) +// { +// byte[] bytes = (byte[])nextToken; +// String value = createStringFromBytes(bytes); +// result.addCharMapping(inputCode, value); +// } +// else if (nextToken instanceof LiteralName) +// { +// result.addCharMapping(inputCode, ((LiteralName)nextToken).name); +// } +// else +// { +// throw new IOException("Error parsing CMap beginbfchar, expected{COSString " +// + "or COSName} and not " + nextToken); +// } +// } +//} + +//private void parseBegincidrange(int numberOfLines, PushbackInputStream cmapStream, CMap result) +//{ +// for (int n = 0; n < numberOfLines; n++) +// { +// Object nextToken = parseNextToken(cmapStream); +// if (nextToken instanceof Operator) +// { +// if (!((Operator)nextToken).op.equals("endcidrange")) +// { +// throw new IOException("Error : ~cidrange contains an unexpected operator : " +// + ((Operator)nextToken).op); +// } +// break; +// } +// byte[] startCode = (byte[])nextToken; +// int start = createIntFromBytes(startCode); +// byte[] endCode = (byte[])parseNextToken(cmapStream); +// int end = createIntFromBytes(endCode); +// int mappedCode = (Integer)parseNextToken(cmapStream); +// if (startCode.length <= 2 && endCode.length <= 2) +// { +// result.addCIDRange((char)start, (char)end, mappedCode); +// } +// else +// { +// // TODO Is this even possible? 
+// int endOfMappings = mappedCode + end - start; +// while (mappedCode <= endOfMappings) +// { +// int mappedCID = createIntFromBytes(startCode); +// result.addCIDMapping(mappedCode++, mappedCID); +// increment(startCode); +// } +// } +// } +//} + +//private void parseBegincidchar(Number cosCount, PushbackInputStream cmapStream, CMap result) +//{ +// for (int j = 0; j < cosCount.intValue(); j++) +// { +// Object nextToken = parseNextToken(cmapStream); +// if (nextToken instanceof Operator) +// { +// if (!((Operator)nextToken).op.equals("endcidchar")) +// { +// throw new IOException("Error : ~cidchar contains an unexpected operator : " +// + ((Operator)nextToken).op); +// } +// break; +// } +// byte[] inputCode = (byte[])nextToken; +// int mappedCode = (Integer)parseNextToken(cmapStream); +// int mappedCID = createIntFromBytes(inputCode); +// result.addCIDMapping(mappedCode, mappedCID); +// } +//} + +//private void parseBeginbfrange(Number cosCount, PushbackInputStream cmapStream, CMap result) +//{ +// for (int j = 0; j < cosCount.intValue(); j++) +// { +// Object nextToken = parseNextToken(cmapStream); +// if (nextToken instanceof Operator) +// { +// if (!((Operator)nextToken).op.equals("endbfrange")) +// { +// throw new IOException("Error : ~bfrange contains an unexpected operator : " +// + ((Operator)nextToken).op); +// } +// break; +// } +// byte[] startCode = (byte[])nextToken; +// byte[] endCode = (byte[])parseNextToken(cmapStream); +// nextToken = parseNextToken(cmapStream); +// List array = null; +// byte[] tokenBytes; +// if (nextToken instanceof List) +// { +// array = (List)nextToken; +// if (array.isEmpty()) +// { +// continue; +// } +// tokenBytes = array.get(0); +// } +// else +// { +// tokenBytes = (byte[])nextToken; +// } +// if (tokenBytes == null || tokenBytes.length == 0) +// { +// // PDFBOX-3450: ignore <> +// // PDFBOX-3807: ignore null +// continue; +// } +// boolean done = false; + +// int arrayIndex = 0; +// while (!done) +// { +// if 
(compare(startCode, endCode) >= 0) +// { +// done = true; +// } +// String value = createStringFromBytes(tokenBytes); +// result.addCharMapping(startCode, value); +// increment(startCode); + +// if (array == null) +// { +// increment(tokenBytes); +// } +// else +// { +// arrayIndex++; +// if (arrayIndex < array.size()) +// { +// tokenBytes = array.get(arrayIndex); +// } +// } +// } +// } +//} + +///** +// * Returns an input stream containing the given "use" CMap. +// * +// * @param name Name of the given "use" CMap resource. +// * @throws IOException if the CMap resource doesn't exist or if there is an error opening its +// * stream. +// */ +//protected InputStream getExternalCMap(String name) +//{ +// URL url = getClass().getResource(name); +// if (url == null) +// { +// throw new IOException("Error: Could not find referenced cmap stream " + name); +// } +// return url.openStream(); +//} + +//private Object parseNextToken(PushbackInputStream is) +//{ +// Object retval = null; +// int nextByte = is.read(); +// // skip whitespace +// while (nextByte == 0x09 || nextByte == 0x20 || nextByte == 0x0D || nextByte == 0x0A) +// { +// nextByte = is.read(); +// } +// switch (nextByte) +// { +// case '%': +// { +// // header operations, for now return the entire line +// // may need to smarter in the future +// StringBuilder buffer = new StringBuilder(); +// buffer.append((char)nextByte); +// readUntilEndOfLine(is, buffer); +// retval = buffer.toString(); +// break; +// } +// case '(': +// { +// StringBuilder buffer = new StringBuilder(); +// int stringByte = is.read(); + +// while (stringByte != -1 && stringByte != ')') +// { +// buffer.append((char)stringByte); +// stringByte = is.read(); +// } +// retval = buffer.toString(); +// break; +// } +// case '>': +// { +// int secondCloseBrace = is.read(); +// if (secondCloseBrace == '>') +// { +// retval = MARK_END_OF_DICTIONARY; +// } +// else +// { +// throw new IOException("Error: expected the end of a dictionary."); +// } +// 
break; +// } +// case ']': +// { +// retval = MARK_END_OF_ARRAY; +// break; +// } +// case '[': +// { +// List list = new ArrayList<>(); + +// Object nextToken = parseNextToken(is); +// while (nextToken != null && !MARK_END_OF_ARRAY.equals(nextToken)) +// { +// list.add(nextToken); +// nextToken = parseNextToken(is); +// } +// retval = list; +// break; +// } +// case '<': +// { +// int theNextByte = is.read(); +// if (theNextByte == '<') +// { +// Map result = new HashMap<>(); +// // we are reading a dictionary +// Object key = parseNextToken(is); +// while (key instanceof LiteralName && !MARK_END_OF_DICTIONARY.equals(key)) +// { +// Object value = parseNextToken(is); +// result.put(((LiteralName)key).name, value); +// key = parseNextToken(is); +// } +// retval = result; +// } +// else +// { +// // won't read more than 512 bytes + +// int multiplyer = 16; +// int bufferIndex = -1; +// while (theNextByte != -1 && theNextByte != '>') +// { +// int intValue = 0; +// if (theNextByte >= '0' && theNextByte <= '9') +// { +// intValue = theNextByte - '0'; +// } +// else if (theNextByte >= 'A' && theNextByte <= 'F') +// { +// intValue = 10 + theNextByte - 'A'; +// } +// else if (theNextByte >= 'a' && theNextByte <= 'f') +// { +// intValue = 10 + theNextByte - 'a'; +// } +// // all kind of whitespaces may occur in malformed CMap files +// // see PDFBOX-2035 +// else if (isWhitespaceOrEOF(theNextByte)) +// { +// // skipping whitespaces +// theNextByte = is.read(); +// continue; +// } +// else +// { +// throw new IOException("Error: expected hex character and not " + (char)theNextByte + ":" +// + theNextByte); +// } +// intValue *= multiplyer; +// if (multiplyer == 16) +// { +// bufferIndex++; +// tokenParserByteBuffer[bufferIndex] = 0; +// multiplyer = 1; +// } +// else +// { +// multiplyer = 16; +// } +// tokenParserByteBuffer[bufferIndex] += intValue; +// theNextByte = is.read(); +// } +// byte[] finalResult = new byte[bufferIndex + 1]; +// 
System.arraycopy(tokenParserByteBuffer, 0, finalResult, 0, bufferIndex + 1); +// retval = finalResult; +// } +// break; +// } +// case '/': +// { +// StringBuilder buffer = new StringBuilder(); +// int stringByte = is.read(); + +// while (!isWhitespaceOrEOF(stringByte) && !isDelimiter(stringByte)) +// { +// buffer.append((char)stringByte); +// stringByte = is.read(); +// } +// if (isDelimiter(stringByte)) +// { +// is.unread(stringByte); +// } +// retval = new LiteralName(buffer.toString()); +// break; +// } +// case -1: +// { +// // EOF returning null +// break; +// } +// case '0': +// case '1': +// case '2': +// case '3': +// case '4': +// case '5': +// case '6': +// case '7': +// case '8': +// case '9': +// { +// StringBuilder buffer = new StringBuilder(); +// buffer.append((char)nextByte); +// nextByte = is.read(); + +// while (!isWhitespaceOrEOF(nextByte) && (Character.isDigit((char)nextByte) || nextByte == '.')) +// { +// buffer.append((char)nextByte); +// nextByte = is.read(); +// } +// is.unread(nextByte); +// String value = buffer.toString(); +// if (value.indexOf('.') >= 0) +// { +// retval = Double.valueOf(value); +// } +// else +// { +// retval = Integer.valueOf(value); +// } +// break; +// } +// default: +// { +// StringBuilder buffer = new StringBuilder(); +// buffer.append((char)nextByte); +// nextByte = is.read(); + +// // newline separator may be missing in malformed CMap files +// // see PDFBOX-2035 +// while (!isWhitespaceOrEOF(nextByte) && !isDelimiter(nextByte) && !Character.isDigit(nextByte)) +// { +// buffer.append((char)nextByte); +// nextByte = is.read(); +// } +// if (isDelimiter(nextByte) || Character.isDigit(nextByte)) +// { +// is.unread(nextByte); +// } +// retval = new Operator(buffer.toString()); + +// break; +// } +// } +// return retval; +//} + +//private void readUntilEndOfLine(InputStream is, StringBuilder buf) +//{ +// int nextByte = is.read(); +// while (nextByte != -1 && nextByte != 0x0D && nextByte != 0x0A) +// { +// 
buf.append((char)nextByte); +// nextByte = is.read(); +// } +//} + +//private boolean isWhitespaceOrEOF(int aByte) +//{ +// return aByte == -1 || aByte == 0x20 || aByte == 0x0D || aByte == 0x0A; +//} + +///** Is this a standard PDF delimiter character? */ +//private boolean isDelimiter(int aByte) +//{ +// switch (aByte) +// { +// case '(': +// case ')': +// case '<': +// case '>': +// case '[': +// case ']': +// case '{': +// case '}': +// case '/': +// case '%': +// return true; +// default: +// return false; +// } +//} + +//private void increment(byte[] data) +//{ +// increment(data, data.length - 1); +//} + +//private void increment(byte[] data, int position) +//{ +// if (position > 0 && (data[position] & 0xFF) == 255) +// { +// data[position] = 0; +// increment(data, position - 1); +// } +// else +// { +// data[position] = (byte)(data[position] + 1); +// } +//} + +//private int createIntFromBytes(byte[] bytes) +//{ +// int intValue = bytes[0] & 0xFF; +// if (bytes.length == 2) +// { +// intValue <<= 8; +// intValue += bytes[1] & 0xFF; +// } +// return intValue; +//} + +//private String createStringFromBytes(byte[] bytes) +//{ +// return new String(bytes, bytes.length == 1 ? Charsets.ISO_8859_1 : Charsets.UTF_16BE); +// } + +// private int compare(byte[] first, byte[] second) +//{ +// for (int i = 0; i < first.length; i++) +// { +// if (first[i] == second[i]) +// { +// continue; +// } + +// if ((first[i] & 0xFF) < (second[i] & 0xFF)) +// { +// return -1; +// } +// else +// { +// return 1; +// } +// } +// return 0; +//} + +///** +// * Internal class. +// */ +//private static final class LiteralName +//{ +// private String name; + +// private LiteralName(String theName) +// { +// name = theName; +// } +//} + +///** +// * Internal class. 
+// */ +//private static final class Operator +//{ +// private String op; + +// private Operator(String theOp) +// { +// op = theOp; +// } +//} +// } +//} diff --git a/src/UglyToad.Pdf/Fonts/Parser/CharacterIdentifierFontParser.cs b/src/UglyToad.Pdf/Fonts/Parser/CharacterIdentifierFontParser.cs index baefa533..a8ec5346 100644 --- a/src/UglyToad.Pdf/Fonts/Parser/CharacterIdentifierFontParser.cs +++ b/src/UglyToad.Pdf/Fonts/Parser/CharacterIdentifierFontParser.cs @@ -1,6 +1,7 @@ namespace UglyToad.Pdf.Fonts.Parser { using System; + using Cmap; using ContentStream; using ContentStream.TypedAccessors; using Cos;
diff --git a/src/UglyToad.Pdf/Tokenization/CommentTokenizer.cs b/src/UglyToad.Pdf/Tokenization/CommentTokenizer.cs
new file mode 100644
index 00000000..9b153c4d
--- /dev/null
+++ b/src/UglyToad.Pdf/Tokenization/CommentTokenizer.cs
@@ -0,0 +1,57 @@
+namespace UglyToad.Pdf.Tokenization
+{
+    using System.Text;
+    using IO;
+    using Parser.Parts;
+    using Tokens;
+
+    /// <summary>
+    /// Tokenizer for comments: the text which follows a '%' byte up to the end of the line.
+    /// </summary>
+    public class CommentTokenizer : ITokenizer
+    {
+        /// <summary>
+        /// Always <see langword="true"/> for this tokenizer.
+        /// </summary>
+        public bool ReadsNextByte => true;
+
+        /// <summary>
+        /// Reads a comment when <paramref name="currentByte"/> is '%'.
+        /// </summary>
+        /// <param name="currentByte">The byte the caller is positioned on; anything other than '%' is rejected.</param>
+        /// <param name="inputBytes">The source, advanced one byte at a time until it ends or an end-of-line byte is reached.</param>
+        /// <param name="token">
+        /// A <see cref="CommentToken"/> holding the bytes after the '%' (each cast to a <see cref="char"/>),
+        /// without the end-of-line byte; <see langword="null"/> when <paramref name="currentByte"/> is not '%'.
+        /// </param>
+        /// <returns><see langword="true"/> if a comment token was produced, otherwise <see langword="false"/>.</returns>
+        public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
+        {
+            if (currentByte != '%')
+            {
+                token = null;
+
+                return false;
+            }
+
+            var text = new StringBuilder();
+
+            while (inputBytes.MoveNext())
+            {
+                var current = inputBytes.CurrentByte;
+
+                // The line terminator ends the comment and is not part of its text.
+                if (ReadHelper.IsEndOfLine(current))
+                {
+                    break;
+                }
+
+                text.Append((char) current);
+            }
+
+            token = new CommentToken(text.ToString());
+
+            return true;
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf/Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.Pdf/Tokenization/Scanner/CoreTokenScanner.cs index 2782a47f..37e33a7c 100644 --- a/src/UglyToad.Pdf/Tokenization/Scanner/CoreTokenScanner.cs +++ b/src/UglyToad.Pdf/Tokenization/Scanner/CoreTokenScanner.cs @@ -24,6 +24,7 @@ private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer(); private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer(); private static readonly DictionaryTokenizer 
DictionaryTokenizer = new DictionaryTokenizer(); + private static readonly CommentTokenizer CommentTokenizer = new CommentTokenizer(); private readonly ScannerScope scope; private readonly IInputBytes inputBytes; @@ -98,6 +99,9 @@ case '/': tokenizer = NameTokenizer; break; + case '%': + tokenizer = CommentTokenizer; + break; case '0': case '1': case '2':
diff --git a/src/UglyToad.Pdf/Tokenization/Tokens/CommentToken.cs b/src/UglyToad.Pdf/Tokenization/Tokens/CommentToken.cs
new file mode 100644
index 00000000..24178900
--- /dev/null
+++ b/src/UglyToad.Pdf/Tokenization/Tokens/CommentToken.cs
@@ -0,0 +1,30 @@
+namespace UglyToad.Pdf.Tokenization.Tokens
+{
+    /// <summary>
+    /// A token holding the text of a comment.
+    /// </summary>
+    public class CommentToken : IDataToken<string>
+    {
+        /// <summary>
+        /// The comment text exactly as passed to the constructor; may be <see langword="null"/>.
+        /// </summary>
+        public string Data { get; }
+
+        /// <summary>
+        /// Creates a comment token wrapping <paramref name="data"/>.
+        /// </summary>
+        /// <param name="data">The comment text; stored as given, without validation.</param>
+        public CommentToken(string data)
+        {
+            Data = data;
+        }
+
+        /// <summary>
+        /// Returns <see cref="Data"/>, or the literal "NULL" when <see cref="Data"/> is <see langword="null"/>.
+        /// </summary>
+        public override string ToString()
+        {
+            return Data ?? "NULL";
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf/Tokenization/Tokens/DictionaryToken.cs b/src/UglyToad.Pdf/Tokenization/Tokens/DictionaryToken.cs index 4c038f26..f3512cb8 100644 --- a/src/UglyToad.Pdf/Tokenization/Tokens/DictionaryToken.cs +++ b/src/UglyToad.Pdf/Tokenization/Tokens/DictionaryToken.cs @@ -3,15 +3,40 @@ using System; using System.Collections.Generic; using System.Linq; + using Cos; + using Util.JetBrains.Annotations; public class DictionaryToken : IDataToken> { + [NotNull] public IReadOnlyDictionary Data { get; } - public DictionaryToken(IReadOnlyDictionary data) + public DictionaryToken([NotNull]IReadOnlyDictionary data) { Data = data ?? 
throw new ArgumentNullException(nameof(data));
+        }
+
+        /// <summary>
+        /// Finds the value whose key is a <see cref="NameToken"/> with data equal to <paramref name="name"/>.
+        /// </summary>
+        /// <param name="name">The name to look for; <see cref="ArgumentNullException"/> is thrown when null.</param>
+        /// <param name="token">The value of the first matching entry, or null when no entry matches.</param>
+        /// <returns>True when a matching entry exists, otherwise false.</returns>
+        public bool TryGetByName(CosName name, out IToken token)
+        {
+            if (name == null)
+            {
+                throw new ArgumentNullException(nameof(name));
+            }
+
+            // A match always has a non-null NameToken key, so the default (empty) pair
+            // returned when nothing matches is recognizable by its null key.
+            var match = Data.FirstOrDefault(pair => pair.Key is NameToken key && key.Data.Equals(name));
+
+            token = match.Value;
+
+            return match.Key != null;
+        }
 public override string ToString() { diff --git a/src/UglyToad.Pdf/Tokenization/Tokens/OperatorToken.cs b/src/UglyToad.Pdf/Tokenization/Tokens/OperatorToken.cs index 41256861..e7dda4ce 100644 --- a/src/UglyToad.Pdf/Tokenization/Tokens/OperatorToken.cs +++ b/src/UglyToad.Pdf/Tokenization/Tokens/OperatorToken.cs @@ -43,5 +43,10 @@ return new OperatorToken(data); } } + + public override string ToString() + { + return Data; + } } } diff --git a/src/UglyToad.Pdf/Tokenization/Tokens/StringToken.cs b/src/UglyToad.Pdf/Tokenization/Tokens/StringToken.cs index 59474cec..8a1cbf48 100644 --- a/src/UglyToad.Pdf/Tokenization/Tokens/StringToken.cs +++ b/src/UglyToad.Pdf/Tokenization/Tokens/StringToken.cs @@ -11,7 +11,7 @@ namespace UglyToad.Pdf.Tokenization.Tokens public override string ToString() { - return Data; + return $"\"{Data}\""; } } } \ No newline at end of file