Split out classes for parsing the CMap format and add assertions to the tests. Add a bytes-to-int conversion method for the hex token, with tests.

This commit is contained in:
Eliot Jones
2017-11-20 16:42:18 +00:00
parent 0fd433240b
commit 4b91300466
14 changed files with 500 additions and 537 deletions

View File

@@ -40,11 +40,49 @@ end";
private readonly CMapParser cMapParser = new CMapParser();
[Fact]
public void CanParseCMap()
public void CanParseCidSystemInfoAndOtherInformation()
{
var input = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false);
var cmap = cMapParser.Parse(input.Bytes, false);
Assert.Equal("Adobe", cmap.Info.Registry);
Assert.Equal("UCS", cmap.Info.Ordering);
Assert.Equal(0, cmap.Info.Supplement);
Assert.Equal("Adobe-Identity-UCS", cmap.Name);
Assert.Equal(2, cmap.Type);
}
/// <summary>
/// The Google Docs ToUnicode CMap should yield exactly one codespace range,
/// running from 0 to 65535 with a code length of 2.
/// </summary>
[Fact]
public void CanParseCodespaceRange()
{
    var bytes = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false).Bytes;

    var characterMap = cMapParser.Parse(bytes, false);

    var ranges = characterMap.CodespaceRanges;
    Assert.Equal(1, ranges.Count);

    // Only index into the list once the count has been verified.
    var onlyRange = ranges[0];
    Assert.Equal(0, onlyRange.StartInt);
    Assert.Equal(65535, onlyRange.EndInt);
    Assert.Equal(2, onlyRange.CodeLength);
}
/// <summary>
/// The Google Docs ToUnicode CMap should contain at least six base font characters,
/// including the expected text for the codes 3, 17, 36, 55, 68 and 91.
/// </summary>
[Fact]
public void CanParseBaseFontCharacters()
{
    var bytes = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false).Bytes;

    var characters = cMapParser.Parse(bytes, false).BaseFontCharacterMap;

    Assert.True(characters.Count >= 6);

    Assert.Equal(" ", characters[3]);
    Assert.Equal(".", characters[17]);
    Assert.Equal("A", characters[36]);
    Assert.Equal("T", characters[55]);
    Assert.Equal("a", characters[68]);
    Assert.Equal("x", characters[91]);
}
}
}

View File

@@ -0,0 +1,36 @@
namespace UglyToad.Pdf.Tests.Tokenization.Tokens
{
using Pdf.Tokenization.Tokens;
using Xunit;
/// <summary>
/// Checks how a <see cref="HexToken"/> built from hex characters exposes its content
/// as text and as an integer.
/// </summary>
public class HexTokenTests
{
    /// <summary>
    /// The text of the token should be the characters spelled out by the hex pairs.
    /// </summary>
    [Theory]
    [InlineData("AE", "®")]
    [InlineData("61", "a")]
    [InlineData("0061", "\0a")]
    [InlineData("7465787420736f", "text so")]
    public void MapsCorrectlyToString(string input, string expected)
    {
        var characters = input.ToCharArray();

        var actual = new HexToken(characters).Data;

        Assert.Equal(expected, actual);
    }

    /// <summary>
    /// Two byte hex tokens should convert to the integer value they spell out.
    /// </summary>
    [Theory]
    [InlineData("0003", 3)]
    [InlineData("0011", 17)]
    [InlineData("0024", 36)]
    [InlineData("0037", 55)]
    [InlineData("0044", 68)]
    [InlineData("005B", 91)]
    public void MapsCorrectlyToInt(string input, int expected)
    {
        var hexToken = new HexToken(input.ToCharArray());

        var actual = HexToken.ConvertHexBytesToInt(hexToken);

        Assert.Equal(expected, actual);
    }
}
}

View File

@@ -26,5 +26,10 @@
Ordering = ordering;
Supplement = supplement;
}
/// <summary>
/// Formats the registry, ordering and supplement as a single string separated by " | ".
/// </summary>
public override string ToString() => $"{Registry} | {Ordering} | {Supplement}";
}
}

View File

@@ -54,6 +54,8 @@
public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; set; }
public IReadOnlyList<CidRange> CidRanges { get; set; }
public Dictionary<int, string> BaseFontCharacterMap { get; } = new Dictionary<int, string>();
public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, IReadOnlyList<byte> value)
@@ -68,6 +70,15 @@
BaseFontCharacterMap[code] = value;
}
/// <summary>
/// Creates a <see cref="CMap"/> from the values currently held by this builder.
/// Any collection which is still null is replaced by an empty one.
/// </summary>
public CMap Build()
{
    var baseFontCharacters = BaseFontCharacterMap ?? new Dictionary<int, string>();
    var codespaces = CodespaceRanges ?? new CodespaceRange[0];
    var ranges = CidRanges ?? new CidRange[0];
    var characterMappings = CidCharacterMappings ?? new CidCharacterMapping[0];

    return new CMap(
        CharacterIdentifierSystemInfo,
        Type,
        WMode,
        Name,
        Version,
        baseFontCharacters,
        codespaces,
        ranges,
        characterMappings);
}
private int GetCodeFromArray(IReadOnlyList<byte> data, int length)
{
int code = 0;
@@ -79,7 +90,7 @@
return code;
}
private string CreateStringFromBytes(byte[] bytes)
private static string CreateStringFromBytes(byte[] bytes)
{
return bytes.Length == 1
? OtherEncodings.BytesAsLatin1String(bytes)

View File

@@ -1,11 +1,50 @@
using System;
using System.Collections.Generic;
using System.Text;
namespace UglyToad.Pdf.Fonts.Cmap
namespace UglyToad.Pdf.Fonts.Cmap
{
using System;
using System.Collections.Generic;
using Util.JetBrains.Annotations;
public class CMap
{
public CharacterIdentifierSystemInfo Info { get; }
public int Type { get; }
public int WMode { get; }
public string Name { get; }
public string Version { get; }
[NotNull]
public IReadOnlyDictionary<int, string> BaseFontCharacterMap { get; }
[NotNull]
public IReadOnlyList<CodespaceRange> CodespaceRanges { get; }
[NotNull]
public IReadOnlyList<CidRange> CidRanges { get; }
[NotNull]
public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; }
public bool HasCidMappings => CidCharacterMappings.Count > 0 || CidRanges.Count > 0;
public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0;
/// <summary>
/// Creates a new <see cref="CMap"/> from the parsed values.
/// </summary>
/// <exception cref="ArgumentNullException">
/// <paramref name="baseFontCharacterMap"/>, <paramref name="codespaceRanges"/>,
/// <paramref name="cidRanges"/> or <paramref name="cidCharacterMappings"/> is null.
/// </exception>
public CMap(
    CharacterIdentifierSystemInfo info,
    int type,
    int wMode,
    string name,
    string version,
    IReadOnlyDictionary<int, string> baseFontCharacterMap,
    IReadOnlyList<CodespaceRange> codespaceRanges,
    IReadOnlyList<CidRange> cidRanges,
    IReadOnlyList<CidCharacterMapping> cidCharacterMappings)
{
    // The collections are checked in declaration order so the first missing one is reported.
    if (baseFontCharacterMap == null)
    {
        throw new ArgumentNullException(nameof(baseFontCharacterMap));
    }

    if (codespaceRanges == null)
    {
        throw new ArgumentNullException(nameof(codespaceRanges));
    }

    if (cidRanges == null)
    {
        throw new ArgumentNullException(nameof(cidRanges));
    }

    if (cidCharacterMappings == null)
    {
        throw new ArgumentNullException(nameof(cidCharacterMappings));
    }

    Info = info;
    Type = type;
    WMode = wMode;
    Name = name;
    Version = version;
    BaseFontCharacterMap = baseFontCharacterMap;
    CodespaceRanges = codespaceRanges;
    CidRanges = cidRanges;
    CidCharacterMappings = cidCharacterMappings;
}
private int wmode = 0;
private string cmapName = null;
private string cmapVersion = null;
@@ -17,13 +56,7 @@ namespace UglyToad.Pdf.Fonts.Cmap
private int minCodeLength = 4;
private int maxCodeLength;
// code lengths
private readonly List<CodespaceRange> codespaceRanges = new List<CodespaceRange>();
// Unicode mappings
private readonly Dictionary<int, string> charToUnicode = new Dictionary<int, string>();
// CID mappings
private readonly Dictionary<int, int> codeToCid = new Dictionary<int, int>();
private readonly List<CidRange> codeToCidRanges = new List<CidRange>();
@@ -31,44 +64,17 @@ namespace UglyToad.Pdf.Fonts.Cmap
private static readonly string SPACE = " ";
private int spaceMapping = -1;
/**
* Creates a new instance of CMap.
*/
public CMap()
/// <summary>
/// Returns the sequence of Unicode characters for the given character code.
/// </summary>
/// <param name="code">Character code</param>
/// <param name="result">Unicode characters (may be more than one, e.g. an "fi" ligature).</param>
/// <returns><see langword="true"/> if this character map contains an entry for this code, <see langword="false"/> otherwise.</returns>
public bool TryConvertToUnicode(int code, out string result)
{
}
var found = BaseFontCharacterMap.TryGetValue(code, out result);
/**
* This will tell if this cmap has any CID mappings.
*
* @return true If there are any CID mappings, false otherwise.
*/
public bool hasCIDMappings()
{
return codeToCid.Count > 0 || codeToCidRanges.Count > 0;
}
/**
* This will tell if this cmap has any Unicode mappings.
*
* @return true If there are any Unicode mappings, false otherwise.
*/
public bool hasUnicodeMappings()
{
return charToUnicode.Count > 0;
}
/**
* Returns the sequence of Unicode characters for the given character code.
*
* @param code character code
* @return Unicode characters (may be more than one, e.g "fi" ligature)
*/
public string toUnicode(int code)
{
charToUnicode.TryGetValue(code, out var result);
return result;
return found;
}
/**
@@ -102,27 +108,14 @@ namespace UglyToad.Pdf.Fonts.Cmap
// throw new InvalidOperationException("CMap is invalid");
//}
/**
* Returns an int for the given byte array
*/
static int toInt(byte[] data, int dataLen)
{
int code = 0;
for (int i = 0; i < dataLen; ++i)
{
code <<= 8;
code |= (data[i] & 0xFF);
}
return code;
}
/**
* Returns the CID for the given character code.
*
* @param code character code
* @return CID
*/
public int toCID(int code)
public int ConvertToCid(int code)
{
if (codeToCid.TryGetValue(code, out var cid))
{
@@ -137,251 +130,11 @@ namespace UglyToad.Pdf.Fonts.Cmap
return ch;
}
}
return 0;
}
/**
* Convert the given part of a byte array to an int.
* @param data the byte array
* @param offset The offset into the byte array.
* @param length The length of the data we are getting.
* @return the resulting int
*/
private int getCodeFromArray(byte[] data, int offset, int length)
{
int code = 0;
for (int i = 0; i < length; i++)
{
code <<= 8;
code |= (data[offset + i] + 256) % 256;
}
return code;
}
/**
* This will add a character code to Unicode character sequence mapping.
*
* @param codes The character codes to map from.
* @param unicode The Unicode characters to map to.
*/
void addCharMapping(byte[] codes, string unicode)
{
int code = getCodeFromArray(codes, 0, codes.Length);
charToUnicode[code] = unicode;
// fixme: ugly little hack
if (SPACE.Equals(unicode))
{
spaceMapping = code;
}
}
/**
* This will add a CID mapping.
*
* @param code character code
* @param cid CID
*/
void addCIDMapping(int code, int cid)
{
codeToCid[cid] = code;
}
/**
* This will add a CID Range.
*
* @param from starting character of the CID range.
* @param to ending character of the CID range.
* @param cid the cid to be started with.
*
*/
void addCIDRange(char from, char to, int cid)
{
codeToCidRanges.Add(new CidRange(from, to, cid));
}
/**
* This will add a codespace range.
*
* @param range A single codespace range.
*/
void addCodespaceRange(CodespaceRange range)
{
codespaceRanges.Add(range);
maxCodeLength = Math.Max(maxCodeLength, range.CodeLength);
minCodeLength = Math.Min(minCodeLength, range.CodeLength);
}
/**
* Implementation of the usecmap operator. This will
* copy all of the mappings from one cmap to another.
*
* @param cmap The cmap to load mappings from.
*/
private void useCmap(CMap cmap)
{
foreach (CodespaceRange codespaceRange in cmap.codespaceRanges)
{
addCodespaceRange(codespaceRange);
}
charToUnicode.PutAll(cmap.charToUnicode);
codeToCid.PutAll(cmap.codeToCid);
codeToCidRanges.AddRange(cmap.codeToCidRanges);
}
/**
* Returns the WMode of a CMap.
*
* 0 represents a horizontal and 1 represents a vertical orientation.
*
* @return the wmode
*/
public int getWMode()
{
return wmode;
}
/**
* Sets the WMode of a CMap.
*
* @param newWMode the new WMode.
*/
public void setWMode(int newWMode)
{
wmode = newWMode;
}
/**
* Returns the name of the CMap.
*
* @return the CMap name.
*/
public string getName()
{
return cmapName;
}
/**
* Sets the name of the CMap.
*
* @param name the CMap name.
*/
public void setName(string name)
{
cmapName = name;
}
/**
* Returns the version of the CMap.
*
* @return the CMap version.
*/
public string getVersion()
{
return cmapVersion;
}
/**
* Sets the version of the CMap.
*
* @param version the CMap version.
*/
public void setVersion(string version)
{
cmapVersion = version;
}
/**
* Returns the type of the CMap.
*
* @return the CMap type.
*/
public int getType()
{
return cmapType;
}
/**
* Sets the type of the CMap.
*
* @param type the CMap type.
*/
public void setType(int type)
{
cmapType = type;
}
/**
* Returns the registry of the CIDSystemInfo.
*
* @return the registry.
*/
public string getRegistry()
{
return registry;
}
/**
* Sets the registry of the CIDSystemInfo.
*
* @param newRegistry the registry.
*/
public void setRegistry(string newRegistry)
{
registry = newRegistry;
}
/**
* Returns the ordering of the CIDSystemInfo.
*
* @return the ordering.
*/
public string getOrdering()
{
return ordering;
}
/**
* Sets the ordering of the CIDSystemInfo.
*
* @param newOrdering the ordering.
*/
public void setOrdering(string newOrdering)
{
ordering = newOrdering;
}
/**
* Returns the supplement of the CIDSystemInfo.
*
* @return the supplement.
*/
public int getSupplement()
{
return supplement;
}
/**
* Sets the supplement of the CIDSystemInfo.
*
* @param newSupplement the supplement.
*/
public void setSupplement(int newSupplement)
{
supplement = newSupplement;
}
/**
* Returns the mapping for the space character.
*
* @return the mapped code for the space character
*/
public int getSpaceMapping()
{
return spaceMapping;
}
public override string ToString()
{
return cmapName;

View File

@@ -1,23 +1,26 @@
namespace UglyToad.Pdf.Fonts.Parser
{
using System;
using System.Collections.Generic;
using System.Globalization;
using Cmap;
using Cos;
using IO;
using Parts;
using Tokenization.Scanner;
using Tokenization.Tokens;
using Util.JetBrains.Annotations;
public class CMapParser
{
private static readonly BaseFontRangeParser BaseFontRangeParser = new BaseFontRangeParser();
private static readonly BaseFontCharacterParser BaseFontCharacterParser = new BaseFontCharacterParser();
private static readonly CidRangeParser CidRangeParser = new CidRangeParser();
private static readonly CidFontNameParser CidFontNameParser = new CidFontNameParser();
private static readonly CodespaceRangeParser CodespaceRangeParser = new CodespaceRangeParser();
private static readonly CidCharacterParser CidCharacterParser = new CidCharacterParser();
public CMap Parse(IInputBytes inputBytes, bool isLenientParsing)
{
var scanner = new CoreTokenScanner(inputBytes);
var builder = new CharacterMapBuilder();
var result = new CMap();
IToken previousToken = null;
while (scanner.MoveNext())
@@ -34,20 +37,19 @@
{
if (previousToken is NumericToken numeric)
{
ParseCodespaceRange(numeric, scanner, builder);
CodespaceRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
}
else
{
throw new InvalidOperationException("Unexpected token preceding start of codespace range: " + previousToken);
}
}
break;
case "beginbfchar":
{
if (previousToken is NumericToken numeric)
{
ParseBaseFontCharacters(numeric, scanner, builder);
BaseFontCharacterParser.Parse(numeric, scanner, builder, isLenientParsing);
}
else
{
@@ -59,8 +61,7 @@
{
if (previousToken is NumericToken numeric)
{
var parser = new BaseFontRangeParser();
parser.Parse(numeric, scanner, builder);
BaseFontRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
}
else
{
@@ -72,9 +73,7 @@
{
if (previousToken is NumericToken numeric)
{
var characters = ParseCidCharacters(numeric, scanner);
builder.CidCharacterMappings = characters;
CidCharacterParser.Parse(numeric, scanner, builder, isLenientParsing);
}
else
{
@@ -83,229 +82,28 @@
break;
}
case "begincidrange":
{
if (previousToken is NumericToken numeric)
{
CidRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
}
else
{
throw new InvalidOperationException("Unexpected token preceding start of Cid ranges: " + previousToken);
}
}
break;
}
}
else if (token is NameToken name)
{
ParseName(name, scanner, builder, isLenientParsing);
CidFontNameParser.Parse(name, scanner, builder, isLenientParsing);
}
previousToken = token;
}
return null;
}
private static void ParseCodespaceRange(NumericToken count, ITokenScanner tokenScanner, CharacterMapBuilder builder)
{
/*
* For example:
3 begincodespacerange
<00> <80>
<8140> <9ffc>
<a0> <de>
endcodespacerange
*/
var ranges = new List<CodespaceRange>(count.Int);
for (var i = 0; i < count.Int; i++)
{
if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken start))
{
throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
}
if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken end))
{
throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
}
ranges.Add(new CodespaceRange(start.Bytes, end.Bytes));
}
builder.CodespaceRanges = ranges;
}
private static void ParseBaseFontCharacters(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder)
{
for (var i = 0; i < numeric.Int; i++)
{
if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken inputCode))
{
throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
}
if (!tokenScanner.MoveNext())
{
throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
}
if (tokenScanner.CurrentToken is NameToken characterName)
{
builder.AddBaseFontCharacter(inputCode.Bytes, characterName.Data.Name);
}
else if (tokenScanner.CurrentToken is HexToken characterCode)
{
builder.AddBaseFontCharacter(inputCode.Bytes, characterCode.Bytes);
}
else
{
throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
}
}
}
private static IReadOnlyList<CidCharacterMapping> ParseCidCharacters(NumericToken numeric, ITokenScanner scanner)
{
var results = new List<CidCharacterMapping>();
for (var i = 0; i < numeric.Int; i++)
{
if (!scanner.TryReadToken(out HexToken sourceCode))
{
throw new InvalidOperationException("The first token in a line for Cid Characters should be a hex, instead it was: " + scanner.CurrentToken);
}
if (!scanner.TryReadToken(out NumericToken destinationCode))
{
throw new InvalidOperationException("The destination token in a line for Cid Character should be an integer, instead it was: " + scanner.CurrentToken);
}
var sourceInteger = sourceCode.Bytes.ToInt(sourceCode.Bytes.Count);
var mapping = new CidCharacterMapping(sourceInteger, destinationCode.Int);
results.Add(mapping);
}
return results;
}
private static void ParseName(NameToken nameToken, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
{
switch (nameToken.Data.Name)
{
case "WMode":
{
var next = TryMoveNext(scanner);
if (next is NumericToken numeric)
{
builder.WMode = numeric.Int;
}
break;
}
case "CMapName":
{
var next = TryMoveNext(scanner);
if (next is NameToken name)
{
builder.Name = name.Data.Name;
}
break;
}
case "CMapVersion":
{
var next = TryMoveNext(scanner);
if (next is NumericToken number)
{
builder.Version = number.Data.ToString(NumberFormatInfo.InvariantInfo);
}
else if (next is StringToken stringToken)
{
builder.Version = stringToken.Data;
}
break;
}
case "CMapType":
{
var next = TryMoveNext(scanner);
if (next is NumericToken numeric)
{
builder.Type = numeric.Int;
}
break;
}
case "Registry":
{
throw new NotImplementedException("Registry should be in a dictionary");
}
case "Ordering":
{
throw new NotImplementedException("Ordering should be in a dictionary");
}
case "Supplement":
{
throw new NotImplementedException("Supplement should be in a dictionary");
}
case "CIDSystemInfo":
{
var next = TryMoveNext(scanner);
if (next is DictionaryToken dictionary)
{
builder.CharacterIdentifierSystemInfo = GetCharacterIdentifier(dictionary, isLenientParsing);
}
break;
}
}
}
private static CharacterIdentifierSystemInfo GetCharacterIdentifier(DictionaryToken dictionary, bool isLenientParsing)
{
string GetErrorMessage(string missingKey)
{
return $"No {missingKey} found in the CIDSystemInfo dictionary: " + dictionary;
}
if (!dictionary.TryGetByName(CosName.REGISTRY, out var registry) || !(registry is StringToken registryString))
{
if (isLenientParsing)
{
registryString = new StringToken("Adobe");
}
else
{
throw new InvalidOperationException(GetErrorMessage("registry"));
}
}
if (!dictionary.TryGetByName(CosName.ORDERING, out var ordering) || !(ordering is StringToken orderingString))
{
if (isLenientParsing)
{
orderingString = new StringToken("");
}
else
{
throw new InvalidOperationException(GetErrorMessage("ordering"));
}
}
if (!dictionary.TryGetByName(CosName.SUPPLEMENT, out var supplement) || !(supplement is NumericToken supplementNumeric))
{
if (isLenientParsing)
{
supplementNumeric = new NumericToken(0);
}
else
{
throw new InvalidOperationException(GetErrorMessage("supplement"));
}
}
return new CharacterIdentifierSystemInfo(registryString.Data, orderingString.Data, supplementNumeric.Int);
}
[CanBeNull]
private static IToken TryMoveNext(ITokenScanner scanner)
{
if (!scanner.MoveNext())
{
return null;
}
return scanner.CurrentToken;
return builder.Build();
}
}
}

View File

@@ -0,0 +1,39 @@
namespace UglyToad.Pdf.Fonts.Parser.Parts
{
using System;
using Cmap;
using Tokenization.Scanner;
using Tokenization.Tokens;
/// <summary>
/// Reads the entries following a "beginbfchar" operator and records each of them
/// on the <see cref="CharacterMapBuilder"/>.
/// </summary>
internal class BaseFontCharacterParser : ICidFontPartParser<NumericToken>
{
    /// <summary>
    /// Reads <paramref name="numeric"/> entries from the scanner. Each entry is a hex input code
    /// followed by either a name or a hex value.
    /// </summary>
    /// <param name="numeric">The number of entries to read.</param>
    /// <param name="tokenScanner">The scanner positioned before the first entry.</param>
    /// <param name="builder">The builder which receives every entry.</param>
    /// <param name="isLenientParsing">Not used by this parser.</param>
    /// <exception cref="InvalidOperationException">
    /// The tokens ran out or an entry did not have the expected token types.
    /// </exception>
    public void Parse(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing)
    {
        // All three failure paths report the same message, built from the scanner's current token.
        InvalidOperationException InvalidItem(int itemIndex)
        {
            return new InvalidOperationException($"Base font characters definition contains invalid item at index {itemIndex}: {tokenScanner.CurrentToken}");
        }

        for (var index = 0; index < numeric.Int; index++)
        {
            if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken sourceCode))
            {
                throw InvalidItem(index);
            }

            if (!tokenScanner.MoveNext())
            {
                throw InvalidItem(index);
            }

            switch (tokenScanner.CurrentToken)
            {
                case NameToken characterName:
                    builder.AddBaseFontCharacter(sourceCode.Bytes, characterName.Data.Name);
                    break;
                case HexToken characterCode:
                    builder.AddBaseFontCharacter(sourceCode.Bytes, characterCode.Bytes);
                    break;
                default:
                    throw InvalidItem(index);
            }
        }
    }
}
}

View File

@@ -1,4 +1,4 @@
namespace UglyToad.Pdf.Fonts.Parser
namespace UglyToad.Pdf.Fonts.Parser.Parts
{
using System;
using System.Collections.Generic;
@@ -7,9 +7,9 @@
using Tokenization.Scanner;
using Tokenization.Tokens;
internal class BaseFontRangeParser
internal class BaseFontRangeParser : ICidFontPartParser<NumericToken>
{
public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder)
public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
{
for (var i = 0; i < numeric.Int; i++)
{

View File

@@ -0,0 +1,36 @@
namespace UglyToad.Pdf.Fonts.Parser.Parts
{
using System;
using System.Collections.Generic;
using Cmap;
using Tokenization.Scanner;
using Tokenization.Tokens;
/// <summary>
/// Reads a section of CID character mappings (a hex source code followed by a numeric
/// destination code per entry) and stores them on the <see cref="CharacterMapBuilder"/>.
/// </summary>
internal class CidCharacterParser : ICidFontPartParser<NumericToken>
{
    /// <summary>
    /// Reads <paramref name="numeric"/> mappings from the scanner and appends them to the
    /// mappings already held by <paramref name="builder"/>.
    /// </summary>
    /// <param name="numeric">The number of mappings in this section.</param>
    /// <param name="scanner">The scanner positioned before the first mapping.</param>
    /// <param name="builder">The builder whose <c>CidCharacterMappings</c> are extended.</param>
    /// <param name="isLenientParsing">Not used by this parser.</param>
    /// <exception cref="InvalidOperationException">
    /// An entry did not consist of a hex token followed by a numeric token.
    /// </exception>
    public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
    {
        var results = new List<CidCharacterMapping>();

        // A character map may contain more than one section of this kind, so keep the
        // mappings from earlier sections rather than replacing them with this one.
        if (builder.CidCharacterMappings != null)
        {
            results.AddRange(builder.CidCharacterMappings);
        }

        for (var i = 0; i < numeric.Int; i++)
        {
            if (!scanner.TryReadToken(out HexToken sourceCode))
            {
                throw new InvalidOperationException("The first token in a line for Cid Characters should be a hex, instead it was: " + scanner.CurrentToken);
            }

            if (!scanner.TryReadToken(out NumericToken destinationCode))
            {
                throw new InvalidOperationException("The destination token in a line for Cid Character should be an integer, instead it was: " + scanner.CurrentToken);
            }

            var sourceInteger = sourceCode.Bytes.ToInt(sourceCode.Bytes.Count);

            results.Add(new CidCharacterMapping(sourceInteger, destinationCode.Int));
        }

        // Only assigned once the whole section has been read, so a failure part way
        // through leaves the builder untouched.
        builder.CidCharacterMappings = results;
    }
}
}

View File

@@ -0,0 +1,128 @@
namespace UglyToad.Pdf.Fonts.Parser.Parts
{
using System;
using System.Globalization;
using Cmap;
using Cos;
using Tokenization.Scanner;
using Tokenization.Tokens;
/// <summary>
/// Handles the name tokens of a character map which introduce a single value:
/// WMode, CMapName, CMapVersion, CMapType and CIDSystemInfo.
/// </summary>
internal class CidFontNameParser : ICidFontPartParser<NameToken>
{
    /// <summary>
    /// Reads the value following <paramref name="nameToken"/> and stores it on the builder.
    /// Names which are not recognised are ignored, as are values of an unexpected token type.
    /// </summary>
    /// <param name="nameToken">The name which was just read.</param>
    /// <param name="scanner">The scanner used to read the value following the name.</param>
    /// <param name="builder">The builder which receives the value.</param>
    /// <param name="isLenientParsing">
    /// When true, entries missing from the CIDSystemInfo dictionary are replaced by defaults instead of throwing.
    /// </param>
    /// <exception cref="NotImplementedException">
    /// The name is Registry, Ordering or Supplement, which are only expected inside the CIDSystemInfo dictionary.
    /// </exception>
    public void Parse(NameToken nameToken, ITokenScanner scanner, CharacterMapBuilder builder,
        bool isLenientParsing)
    {
        switch (nameToken.Data.Name)
        {
            case "WMode":
                if (scanner.TryReadToken(out NumericToken writingMode))
                {
                    builder.WMode = writingMode.Int;
                }

                break;
            case "CMapName":
                if (scanner.TryReadToken(out NameToken mapName))
                {
                    builder.Name = mapName.Data.Name;
                }

                break;
            case "CMapVersion":
                ReadVersion(scanner, builder);
                break;
            case "CMapType":
                if (scanner.TryReadToken(out NumericToken mapType))
                {
                    builder.Type = mapType.Int;
                }

                break;
            case "Registry":
                throw new NotImplementedException("Registry should be in a dictionary");
            case "Ordering":
                throw new NotImplementedException("Ordering should be in a dictionary");
            case "Supplement":
                throw new NotImplementedException("Supplement should be in a dictionary");
            case "CIDSystemInfo":
                if (scanner.TryReadToken(out DictionaryToken systemInfo))
                {
                    builder.CharacterIdentifierSystemInfo = GetCharacterIdentifier(systemInfo, isLenientParsing);
                }

                break;
        }
    }

    /// <summary>
    /// Reads the version, which may be written as either a number or a string.
    /// </summary>
    private static void ReadVersion(ITokenScanner scanner, CharacterMapBuilder builder)
    {
        if (!scanner.MoveNext())
        {
            return;
        }

        switch (scanner.CurrentToken)
        {
            case NumericToken number:
                builder.Version = number.Data.ToString(NumberFormatInfo.InvariantInfo);
                break;
            case StringToken text:
                builder.Version = text.Data;
                break;
        }
    }

    /// <summary>
    /// Builds the system info from the registry, ordering and supplement entries of the dictionary.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// An entry is missing or of the wrong type and <paramref name="isLenientParsing"/> is false.
    /// </exception>
    private static CharacterIdentifierSystemInfo GetCharacterIdentifier(DictionaryToken dictionary, bool isLenientParsing)
    {
        InvalidOperationException Missing(string missingKey)
        {
            return new InvalidOperationException($"No {missingKey} found in the CIDSystemInfo dictionary: " + dictionary);
        }

        StringToken registryToken;
        if (dictionary.TryGetByName(CosName.REGISTRY, out var registryValue) && registryValue is StringToken foundRegistry)
        {
            registryToken = foundRegistry;
        }
        else if (isLenientParsing)
        {
            registryToken = new StringToken("Adobe");
        }
        else
        {
            throw Missing("registry");
        }

        StringToken orderingToken;
        if (dictionary.TryGetByName(CosName.ORDERING, out var orderingValue) && orderingValue is StringToken foundOrdering)
        {
            orderingToken = foundOrdering;
        }
        else if (isLenientParsing)
        {
            orderingToken = new StringToken("");
        }
        else
        {
            throw Missing("ordering");
        }

        NumericToken supplementToken;
        if (dictionary.TryGetByName(CosName.SUPPLEMENT, out var supplementValue) && supplementValue is NumericToken foundSupplement)
        {
            supplementToken = foundSupplement;
        }
        else if (isLenientParsing)
        {
            supplementToken = new NumericToken(0);
        }
        else
        {
            throw Missing("supplement");
        }

        return new CharacterIdentifierSystemInfo(registryToken.Data, orderingToken.Data, supplementToken.Int);
    }
}
}

View File

@@ -0,0 +1,46 @@
namespace UglyToad.Pdf.Fonts.Parser.Parts
{
using System;
using System.Collections.Generic;
using Cmap;
using Tokenization.Scanner;
using Tokenization.Tokens;
/// <summary>
/// Reads the entries following a "begincidrange" operator and stores the resulting
/// ranges on the <see cref="CharacterMapBuilder"/>.
/// </summary>
internal class CidRangeParser : ICidFontPartParser<NumericToken>
{
    /// <summary>
    /// Reads <paramref name="numeric"/> ranges from the scanner and appends them to the ranges
    /// already held by <paramref name="builder"/>. Each range is a hex start code, a hex end code
    /// and a numeric mapped code.
    /// </summary>
    /// <param name="numeric">The number of ranges in this section.</param>
    /// <param name="scanner">The scanner positioned before the first range.</param>
    /// <param name="builder">The builder whose <c>CidRanges</c> are extended.</param>
    /// <param name="isLenientParsing">Not used by this parser.</param>
    /// <exception cref="InvalidOperationException">
    /// A range did not consist of two hex tokens followed by a numeric token.
    /// </exception>
    public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
    {
        var ranges = new List<CidRange>();

        // A character map may contain more than one range section, so keep the ranges
        // from earlier sections rather than replacing them with this one.
        if (builder.CidRanges != null)
        {
            ranges.AddRange(builder.CidRanges);
        }

        for (var i = 0; i < numeric.Int; i++)
        {
            if (!scanner.TryReadToken(out HexToken startHexToken))
            {
                throw new InvalidOperationException("The first token in a line for Cid Ranges should be a hex, instead it was: " + scanner.CurrentToken);
            }

            if (!scanner.TryReadToken(out HexToken endHexToken))
            {
                throw new InvalidOperationException("The second token in a line for Cid Ranges should be a hex, instead it was: " + scanner.CurrentToken);
            }

            if (!scanner.TryReadToken(out NumericToken mappedCode))
            {
                throw new InvalidOperationException("The destination token in a line for Cid Ranges should be an integer, instead it was: " + scanner.CurrentToken);
            }

            var start = HexToken.ConvertHexBytesToInt(startHexToken);
            var end = HexToken.ConvertHexBytesToInt(endHexToken);

            // NOTE(review): the casts to char keep only the low 16 bits of each code, so codes
            // wider than two bytes cannot be represented by this call - confirm this is intended.
            ranges.Add(new CidRange((char)start, (char)end, mappedCode.Int));
        }

        // Only assigned once the whole section has been read, so a failure part way
        // through leaves the builder untouched.
        builder.CidRanges = ranges;
    }
}
}

View File

@@ -0,0 +1,42 @@
namespace UglyToad.Pdf.Fonts.Parser.Parts
{
using System;
using System.Collections.Generic;
using Cmap;
using Tokenization.Scanner;
using Tokenization.Tokens;
/// <summary>
/// Reads the entries following a "begincodespacerange" operator and stores the resulting
/// ranges on the <see cref="CharacterMapBuilder"/>.
/// </summary>
internal class CodespaceRangeParser : ICidFontPartParser<NumericToken>
{
    /// <summary>
    /// Reads <paramref name="numeric"/> pairs of hex tokens from the scanner and appends a
    /// codespace range for each pair to the ranges already held by <paramref name="builder"/>.
    /// </summary>
    /// <param name="numeric">The number of ranges in this section.</param>
    /// <param name="tokenScanner">The scanner positioned before the first range.</param>
    /// <param name="builder">The builder whose <c>CodespaceRanges</c> are extended.</param>
    /// <param name="isLenientParsing">Not used by this parser.</param>
    /// <exception cref="InvalidOperationException">
    /// The tokens ran out or a range did not consist of two hex tokens.
    /// </exception>
    public void Parse(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing)
    {
        /*
         * For example:
         3 begincodespacerange
            <00>    <80>
            <8140>  <9ffc>
            <a0>    <de>
         endcodespacerange
         */

        // The list is not pre-sized from the count because the count comes from the file:
        // a negative value would make the List constructor throw before anything was read.
        var ranges = new List<CodespaceRange>();

        // A character map may contain more than one codespace section, so keep the ranges
        // from earlier sections rather than replacing them with this one.
        if (builder.CodespaceRanges != null)
        {
            ranges.AddRange(builder.CodespaceRanges);
        }

        for (var i = 0; i < numeric.Int; i++)
        {
            if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken start))
            {
                throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
            }

            if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken end))
            {
                throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
            }

            ranges.Add(new CodespaceRange(start.Bytes, end.Bytes));
        }

        // Only assigned once the whole section has been read, so a failure part way
        // through leaves the builder untouched.
        builder.CodespaceRanges = ranges;
    }
}
}

View File

@@ -0,0 +1,17 @@
namespace UglyToad.Pdf.Fonts.Parser.Parts
{
using Cmap;
using Tokenization.Scanner;
/// <summary>
/// Provides parsing for a certain operator type in a CID font definition.
/// Implementations read the tokens belonging to their operator from the scanner and
/// record what they find on a <see cref="CharacterMapBuilder"/>.
/// </summary>
/// <typeparam name="TToken">The type of the token preceding the operation we wish to parse.</typeparam>
internal interface ICidFontPartParser<in TToken>
{
/// <summary>
/// Parse the definition for this part of the CID font and write the results to the <see cref="CharacterMapBuilder"/>.
/// </summary>
/// <param name="previous">The token which preceded the operation being parsed.</param>
/// <param name="tokenScanner">The scanner from which the tokens of the definition are read.</param>
/// <param name="builder">The builder which receives the parsed results.</param>
/// <param name="isLenientParsing">
/// Whether parsing should be lenient. NOTE(review): the exact meaning is left to each implementation;
/// presumably it allows malformed input to be tolerated rather than rejected - confirm.
/// </param>
void Parse(TToken previous, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing);
}
}

View File

@@ -70,5 +70,19 @@ namespace UglyToad.Pdf.Tokenization.Tokens
Bytes = bytes;
Data = builder.ToString();
}
/// <summary>
/// Converts the bytes of a hex token to the integer they spell out, treating the first
/// byte as the most significant.
/// </summary>
/// <param name="token">The token whose bytes are converted.</param>
/// <returns>
/// The combined value of all bytes, or 0 for a token without bytes. If the token has more
/// than four bytes only the last four contribute, because earlier bytes are shifted out of the int.
/// </returns>
public static int ConvertHexBytesToInt(HexToken token)
{
    var bytes = token.Bytes;

    // Fold every byte in rather than only the first two, so tokens of three or more
    // bytes are not silently reduced to their first byte.
    var value = 0;
    for (var i = 0; i < bytes.Count; i++)
    {
        value = (value << 8) | (bytes[i] & 0xFF);
    }

    return value;
}
}
}