mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
Split out classes for parsing the CMap format and add assertions to the tests. Add a bytes-to-int method for hex tokens, with a test.
This commit is contained in:
@@ -40,11 +40,49 @@ end";
|
||||
private readonly CMapParser cMapParser = new CMapParser();
|
||||
|
||||
[Fact]
|
||||
public void CanParseCMap()
|
||||
public void CanParseCidSystemInfoAndOtherInformation()
|
||||
{
|
||||
var input = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false);
|
||||
|
||||
var cmap = cMapParser.Parse(input.Bytes, false);
|
||||
|
||||
Assert.Equal("Adobe", cmap.Info.Registry);
|
||||
Assert.Equal("UCS", cmap.Info.Ordering);
|
||||
Assert.Equal(0, cmap.Info.Supplement);
|
||||
|
||||
Assert.Equal("Adobe-Identity-UCS", cmap.Name);
|
||||
Assert.Equal(2, cmap.Type);
|
||||
}
|
||||
|
||||
/// <summary>
/// Parses the <c>GoogleDocToUnicodeCmap</c> fixture and checks that exactly one
/// codespace range is produced, with the expected start, end and code length.
/// </summary>
[Fact]
public void CanParseCodespaceRange()
{
    var cmapBytes = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false).Bytes;

    var parsed = cMapParser.Parse(cmapBytes, false);
    var ranges = parsed.CodespaceRanges;

    Assert.Equal(1, ranges.Count);

    var onlyRange = ranges[0];
    Assert.Equal(0, onlyRange.StartInt);
    Assert.Equal(65535, onlyRange.EndInt);
    Assert.Equal(2, onlyRange.CodeLength);
}
|
||||
|
||||
/// <summary>
/// Parses the <c>GoogleDocToUnicodeCmap</c> fixture and spot-checks six of the
/// character-code-to-string entries in the resulting base font character map.
/// </summary>
[Fact]
public void CanParseBaseFontCharacters()
{
    var cmapBytes = StringBytesTestConverter.Convert(GoogleDocToUnicodeCmap, false).Bytes;

    var characterMap = cMapParser.Parse(cmapBytes, false).BaseFontCharacterMap;

    // The map may hold more entries; only these six are asserted.
    Assert.True(characterMap.Count >= 6);

    Assert.Equal(" ", characterMap[3]);
    Assert.Equal(".", characterMap[17]);
    Assert.Equal("A", characterMap[36]);
    Assert.Equal("T", characterMap[55]);
    Assert.Equal("a", characterMap[68]);
    Assert.Equal("x", characterMap[91]);
}
|
||||
}
|
||||
}
|
||||
|
||||
36
src/UglyToad.Pdf.Tests/Tokenization/Tokens/HexTokenTests.cs
Normal file
36
src/UglyToad.Pdf.Tests/Tokenization/Tokens/HexTokenTests.cs
Normal file
@@ -0,0 +1,36 @@
|
||||
namespace UglyToad.Pdf.Tests.Tokenization.Tokens
{
    using Pdf.Tokenization.Tokens;
    using Xunit;

    /// <summary>
    /// Tests for <see cref="HexToken"/>: the string exposed through <c>Data</c> and the
    /// integer produced by <c>HexToken.ConvertHexBytesToInt</c>.
    /// </summary>
    public class HexTokenTests
    {
        /// <summary>
        /// A token built from hex characters exposes the expected string as its data.
        /// </summary>
        [Theory]
        [InlineData("AE", "®")]
        [InlineData("61", "a")]
        [InlineData("0061", "\0a")]
        [InlineData("7465787420736f", "text so")]
        public void MapsCorrectlyToString(string input, string expected)
        {
            var hexCharacters = input.ToCharArray();

            var actual = new HexToken(hexCharacters).Data;

            Assert.Equal(expected, actual);
        }

        /// <summary>
        /// A token built from four hex characters converts to the expected integer.
        /// </summary>
        [Theory]
        [InlineData("0003", 3)]
        [InlineData("0011", 17)]
        [InlineData("0024", 36)]
        [InlineData("0037", 55)]
        [InlineData("0044", 68)]
        [InlineData("005B", 91)]
        public void MapsCorrectlyToInt(string input, int expected)
        {
            var hexToken = new HexToken(input.ToCharArray());

            var actual = HexToken.ConvertHexBytesToInt(hexToken);

            Assert.Equal(expected, actual);
        }
    }
}
|
||||
@@ -26,5 +26,10 @@
|
||||
Ordering = ordering;
|
||||
Supplement = supplement;
|
||||
}
|
||||
|
||||
/// <summary>
/// Returns the registry, ordering and supplement joined by " | ".
/// </summary>
public override string ToString() => $"{Registry} | {Ordering} | {Supplement}";
|
||||
}
|
||||
}
|
||||
@@ -54,6 +54,8 @@
|
||||
|
||||
public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; set; }
|
||||
|
||||
public IReadOnlyList<CidRange> CidRanges { get; set; }
|
||||
|
||||
public Dictionary<int, string> BaseFontCharacterMap { get; } = new Dictionary<int, string>();
|
||||
|
||||
public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, IReadOnlyList<byte> value)
|
||||
@@ -68,6 +70,15 @@
|
||||
BaseFontCharacterMap[code] = value;
|
||||
}
|
||||
|
||||
/// <summary>
/// Creates a <see cref="CMap"/> from the values currently held by this builder.
/// Any collection that is still <see langword="null"/> is replaced by an empty one,
/// so the <see cref="CMap"/> never receives a null collection.
/// </summary>
/// <returns>The newly constructed <see cref="CMap"/>.</returns>
public CMap Build()
{
    var unicodeMap = BaseFontCharacterMap ?? new Dictionary<int, string>();
    var codespaces = CodespaceRanges ?? new CodespaceRange[0];
    var ranges = CidRanges ?? new CidRange[0];
    var characterMappings = CidCharacterMappings ?? new CidCharacterMapping[0];

    return new CMap(
        CharacterIdentifierSystemInfo,
        Type,
        WMode,
        Name,
        Version,
        unicodeMap,
        codespaces,
        ranges,
        characterMappings);
}
|
||||
|
||||
private int GetCodeFromArray(IReadOnlyList<byte> data, int length)
|
||||
{
|
||||
int code = 0;
|
||||
@@ -79,7 +90,7 @@
|
||||
return code;
|
||||
}
|
||||
|
||||
private string CreateStringFromBytes(byte[] bytes)
|
||||
private static string CreateStringFromBytes(byte[] bytes)
|
||||
{
|
||||
return bytes.Length == 1
|
||||
? OtherEncodings.BytesAsLatin1String(bytes)
|
||||
|
||||
@@ -1,11 +1,50 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
|
||||
namespace UglyToad.Pdf.Fonts.Cmap
|
||||
namespace UglyToad.Pdf.Fonts.Cmap
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
public class CMap
|
||||
{
|
||||
public CharacterIdentifierSystemInfo Info { get; }
|
||||
|
||||
public int Type { get; }
|
||||
|
||||
public int WMode { get; }
|
||||
|
||||
public string Name { get; }
|
||||
|
||||
public string Version { get; }
|
||||
|
||||
[NotNull]
|
||||
public IReadOnlyDictionary<int, string> BaseFontCharacterMap { get; }
|
||||
|
||||
[NotNull]
|
||||
public IReadOnlyList<CodespaceRange> CodespaceRanges { get; }
|
||||
|
||||
[NotNull]
|
||||
public IReadOnlyList<CidRange> CidRanges { get; }
|
||||
|
||||
[NotNull]
|
||||
public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; }
|
||||
|
||||
public bool HasCidMappings => CidCharacterMappings.Count > 0 || CidRanges.Count > 0;
|
||||
|
||||
public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0;
|
||||
|
||||
/// <summary>
/// Creates a new <see cref="CMap"/> from already-parsed values.
/// </summary>
/// <param name="info">Stored in <see cref="Info"/>; not null-checked here.</param>
/// <param name="type">Stored in <see cref="Type"/>.</param>
/// <param name="wMode">Stored in <see cref="WMode"/>.</param>
/// <param name="name">Stored in <see cref="Name"/>; not null-checked here.</param>
/// <param name="version">Stored in <see cref="Version"/>; not null-checked here.</param>
/// <param name="baseFontCharacterMap">Stored in <see cref="BaseFontCharacterMap"/>.</param>
/// <param name="codespaceRanges">Stored in <see cref="CodespaceRanges"/>.</param>
/// <param name="cidRanges">Stored in <see cref="CidRanges"/>.</param>
/// <param name="cidCharacterMappings">Stored in <see cref="CidCharacterMappings"/>.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when any of the four collection arguments is <see langword="null"/>.
/// </exception>
public CMap(
    CharacterIdentifierSystemInfo info,
    int type,
    int wMode,
    string name,
    string version,
    IReadOnlyDictionary<int, string> baseFontCharacterMap,
    IReadOnlyList<CodespaceRange> codespaceRanges,
    IReadOnlyList<CidRange> cidRanges,
    IReadOnlyList<CidCharacterMapping> cidCharacterMappings)
{
    // Collections are validated in declaration order so the first null one is reported.
    if (baseFontCharacterMap == null)
    {
        throw new ArgumentNullException(nameof(baseFontCharacterMap));
    }

    if (codespaceRanges == null)
    {
        throw new ArgumentNullException(nameof(codespaceRanges));
    }

    if (cidRanges == null)
    {
        throw new ArgumentNullException(nameof(cidRanges));
    }

    if (cidCharacterMappings == null)
    {
        throw new ArgumentNullException(nameof(cidCharacterMappings));
    }

    Info = info;
    Type = type;
    WMode = wMode;
    Name = name;
    Version = version;
    BaseFontCharacterMap = baseFontCharacterMap;
    CodespaceRanges = codespaceRanges;
    CidRanges = cidRanges;
    CidCharacterMappings = cidCharacterMappings;
}
|
||||
|
||||
private int wmode = 0;
|
||||
private string cmapName = null;
|
||||
private string cmapVersion = null;
|
||||
@@ -17,13 +56,7 @@ namespace UglyToad.Pdf.Fonts.Cmap
|
||||
|
||||
private int minCodeLength = 4;
|
||||
private int maxCodeLength;
|
||||
|
||||
// code lengths
|
||||
private readonly List<CodespaceRange> codespaceRanges = new List<CodespaceRange>();
|
||||
|
||||
// Unicode mappings
|
||||
private readonly Dictionary<int, string> charToUnicode = new Dictionary<int, string>();
|
||||
|
||||
|
||||
// CID mappings
|
||||
private readonly Dictionary<int, int> codeToCid = new Dictionary<int, int>();
|
||||
private readonly List<CidRange> codeToCidRanges = new List<CidRange>();
|
||||
@@ -31,44 +64,17 @@ namespace UglyToad.Pdf.Fonts.Cmap
|
||||
private static readonly string SPACE = " ";
|
||||
private int spaceMapping = -1;
|
||||
|
||||
/**
|
||||
* Creates a new instance of CMap.
|
||||
*/
|
||||
public CMap()
|
||||
/// <summary>
|
||||
/// Returns the sequence of Unicode characters for the given character code.
|
||||
/// </summary>
|
||||
/// <param name="code">Character code</param>
|
||||
/// <param name="result">Unicode characters(may be more than one, e.g "fi" ligature)</param>
|
||||
/// <returns><see langword="true"/> if this character map contains an entry for this code, <see langword="false"/> otherwise.</returns>
|
||||
public bool TryConvertToUnicode(int code, out string result)
|
||||
{
|
||||
}
|
||||
var found = BaseFontCharacterMap.TryGetValue(code, out result);
|
||||
|
||||
/**
|
||||
* This will tell if this cmap has any CID mappings.
|
||||
*
|
||||
* @return true If there are any CID mappings, false otherwise.
|
||||
*/
|
||||
public bool hasCIDMappings()
|
||||
{
|
||||
return codeToCid.Count > 0 || codeToCidRanges.Count > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* This will tell if this cmap has any Unicode mappings.
|
||||
*
|
||||
* @return true If there are any Unicode mappings, false otherwise.
|
||||
*/
|
||||
public bool hasUnicodeMappings()
|
||||
{
|
||||
return charToUnicode.Count > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the sequence of Unicode characters for the given character code.
|
||||
*
|
||||
* @param code character code
|
||||
* @return Unicode characters (may be more than one, e.g "fi" ligature)
|
||||
*/
|
||||
public string toUnicode(int code)
|
||||
{
|
||||
charToUnicode.TryGetValue(code, out var result);
|
||||
|
||||
return result;
|
||||
return found;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -102,27 +108,14 @@ namespace UglyToad.Pdf.Fonts.Cmap
|
||||
// throw new InvalidOperationException("CMap is invalid");
|
||||
//}
|
||||
|
||||
/**
|
||||
* Returns an int for the given byte array
|
||||
*/
|
||||
static int toInt(byte[] data, int dataLen)
|
||||
{
|
||||
int code = 0;
|
||||
for (int i = 0; i < dataLen; ++i)
|
||||
{
|
||||
code <<= 8;
|
||||
code |= (data[i] & 0xFF);
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the CID for the given character code.
|
||||
*
|
||||
* @param code character code
|
||||
* @return CID
|
||||
*/
|
||||
public int toCID(int code)
|
||||
public int ConvertToCid(int code)
|
||||
{
|
||||
if (codeToCid.TryGetValue(code, out var cid))
|
||||
{
|
||||
@@ -137,251 +130,11 @@ namespace UglyToad.Pdf.Fonts.Cmap
|
||||
return ch;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert the given part of a byte array to an int.
|
||||
* @param data the byte array
|
||||
* @param offset The offset into the byte array.
|
||||
* @param length The length of the data we are getting.
|
||||
* @return the resulting int
|
||||
*/
|
||||
private int getCodeFromArray(byte[] data, int offset, int length)
|
||||
{
|
||||
int code = 0;
|
||||
for (int i = 0; i < length; i++)
|
||||
{
|
||||
code <<= 8;
|
||||
code |= (data[offset + i] + 256) % 256;
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
/**
|
||||
* This will add a character code to Unicode character sequence mapping.
|
||||
*
|
||||
* @param codes The character codes to map from.
|
||||
* @param unicode The Unicode characters to map to.
|
||||
*/
|
||||
void addCharMapping(byte[] codes, string unicode)
|
||||
{
|
||||
int code = getCodeFromArray(codes, 0, codes.Length);
|
||||
charToUnicode[code] = unicode;
|
||||
|
||||
// fixme: ugly little hack
|
||||
if (SPACE.Equals(unicode))
|
||||
{
|
||||
spaceMapping = code;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This will add a CID mapping.
|
||||
*
|
||||
* @param code character code
|
||||
* @param cid CID
|
||||
*/
|
||||
void addCIDMapping(int code, int cid)
|
||||
{
|
||||
codeToCid[cid] = code;
|
||||
}
|
||||
|
||||
/**
|
||||
* This will add a CID Range.
|
||||
*
|
||||
* @param from starting charactor of the CID range.
|
||||
* @param to ending character of the CID range.
|
||||
* @param cid the cid to be started with.
|
||||
*
|
||||
*/
|
||||
void addCIDRange(char from, char to, int cid)
|
||||
{
|
||||
codeToCidRanges.Add(new CidRange(from, to, cid));
|
||||
}
|
||||
|
||||
/**
|
||||
* This will add a codespace range.
|
||||
*
|
||||
* @param range A single codespace range.
|
||||
*/
|
||||
void addCodespaceRange(CodespaceRange range)
|
||||
{
|
||||
codespaceRanges.Add(range);
|
||||
maxCodeLength = Math.Max(maxCodeLength, range.CodeLength);
|
||||
minCodeLength = Math.Min(minCodeLength, range.CodeLength);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of the usecmap operator. This will
|
||||
* copy all of the mappings from one cmap to another.
|
||||
*
|
||||
* @param cmap The cmap to load mappings from.
|
||||
*/
|
||||
private void useCmap(CMap cmap)
|
||||
{
|
||||
foreach (CodespaceRange codespaceRange in cmap.codespaceRanges)
|
||||
{
|
||||
addCodespaceRange(codespaceRange);
|
||||
}
|
||||
charToUnicode.PutAll(cmap.charToUnicode);
|
||||
codeToCid.PutAll(cmap.codeToCid);
|
||||
codeToCidRanges.AddRange(cmap.codeToCidRanges);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the WMode of a CMap.
|
||||
*
|
||||
* 0 represents a horizontal and 1 represents a vertical orientation.
|
||||
*
|
||||
* @return the wmode
|
||||
*/
|
||||
public int getWMode()
|
||||
{
|
||||
return wmode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the WMode of a CMap.
|
||||
*
|
||||
* @param newWMode the new WMode.
|
||||
*/
|
||||
public void setWMode(int newWMode)
|
||||
{
|
||||
wmode = newWMode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the name of the CMap.
|
||||
*
|
||||
* @return the CMap name.
|
||||
*/
|
||||
public string getName()
|
||||
{
|
||||
return cmapName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the name of the CMap.
|
||||
*
|
||||
* @param name the CMap name.
|
||||
*/
|
||||
public void setName(string name)
|
||||
{
|
||||
cmapName = name;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the version of the CMap.
|
||||
*
|
||||
* @return the CMap version.
|
||||
*/
|
||||
public string getVersion()
|
||||
{
|
||||
return cmapVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the version of the CMap.
|
||||
*
|
||||
* @param version the CMap version.
|
||||
*/
|
||||
public void setVersion(string version)
|
||||
{
|
||||
cmapVersion = version;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the type of the CMap.
|
||||
*
|
||||
* @return the CMap type.
|
||||
*/
|
||||
public int getType()
|
||||
{
|
||||
return cmapType;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the type of the CMap.
|
||||
*
|
||||
* @param type the CMap type.
|
||||
*/
|
||||
public void setType(int type)
|
||||
{
|
||||
cmapType = type;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the registry of the CIDSystemInfo.
|
||||
*
|
||||
* @return the registry.
|
||||
*/
|
||||
public string getRegistry()
|
||||
{
|
||||
return registry;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the registry of the CIDSystemInfo.
|
||||
*
|
||||
* @param newRegistry the registry.
|
||||
*/
|
||||
public void setRegistry(string newRegistry)
|
||||
{
|
||||
registry = newRegistry;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the ordering of the CIDSystemInfo.
|
||||
*
|
||||
* @return the ordering.
|
||||
*/
|
||||
public string getOrdering()
|
||||
{
|
||||
return ordering;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the ordering of the CIDSystemInfo.
|
||||
*
|
||||
* @param newOrdering the ordering.
|
||||
*/
|
||||
public void setOrdering(string newOrdering)
|
||||
{
|
||||
ordering = newOrdering;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the supplement of the CIDSystemInfo.
|
||||
*
|
||||
* @return the supplement.
|
||||
*/
|
||||
public int getSupplement()
|
||||
{
|
||||
return supplement;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the supplement of the CIDSystemInfo.
|
||||
*
|
||||
* @param newSupplement the supplement.
|
||||
*/
|
||||
public void setSupplement(int newSupplement)
|
||||
{
|
||||
supplement = newSupplement;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the mapping for the space character.
|
||||
*
|
||||
* @return the mapped code for the space character
|
||||
*/
|
||||
public int getSpaceMapping()
|
||||
{
|
||||
return spaceMapping;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return cmapName;
|
||||
|
||||
@@ -1,23 +1,26 @@
|
||||
namespace UglyToad.Pdf.Fonts.Parser
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Globalization;
|
||||
using Cmap;
|
||||
using Cos;
|
||||
using IO;
|
||||
using Parts;
|
||||
using Tokenization.Scanner;
|
||||
using Tokenization.Tokens;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
public class CMapParser
|
||||
{
|
||||
private static readonly BaseFontRangeParser BaseFontRangeParser = new BaseFontRangeParser();
|
||||
private static readonly BaseFontCharacterParser BaseFontCharacterParser = new BaseFontCharacterParser();
|
||||
private static readonly CidRangeParser CidRangeParser = new CidRangeParser();
|
||||
private static readonly CidFontNameParser CidFontNameParser = new CidFontNameParser();
|
||||
private static readonly CodespaceRangeParser CodespaceRangeParser = new CodespaceRangeParser();
|
||||
private static readonly CidCharacterParser CidCharacterParser = new CidCharacterParser();
|
||||
|
||||
public CMap Parse(IInputBytes inputBytes, bool isLenientParsing)
|
||||
{
|
||||
var scanner = new CoreTokenScanner(inputBytes);
|
||||
|
||||
var builder = new CharacterMapBuilder();
|
||||
var result = new CMap();
|
||||
|
||||
IToken previousToken = null;
|
||||
while (scanner.MoveNext())
|
||||
@@ -34,20 +37,19 @@
|
||||
{
|
||||
if (previousToken is NumericToken numeric)
|
||||
{
|
||||
ParseCodespaceRange(numeric, scanner, builder);
|
||||
CodespaceRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException("Unexpected token preceding start of codespace range: " + previousToken);
|
||||
}
|
||||
|
||||
}
|
||||
break;
|
||||
case "beginbfchar":
|
||||
{
|
||||
if (previousToken is NumericToken numeric)
|
||||
{
|
||||
ParseBaseFontCharacters(numeric, scanner, builder);
|
||||
BaseFontCharacterParser.Parse(numeric, scanner, builder, isLenientParsing);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -59,8 +61,7 @@
|
||||
{
|
||||
if (previousToken is NumericToken numeric)
|
||||
{
|
||||
var parser = new BaseFontRangeParser();
|
||||
parser.Parse(numeric, scanner, builder);
|
||||
BaseFontRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -72,9 +73,7 @@
|
||||
{
|
||||
if (previousToken is NumericToken numeric)
|
||||
{
|
||||
var characters = ParseCidCharacters(numeric, scanner);
|
||||
|
||||
builder.CidCharacterMappings = characters;
|
||||
CidCharacterParser.Parse(numeric, scanner, builder, isLenientParsing);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -83,229 +82,28 @@
|
||||
break;
|
||||
}
|
||||
case "begincidrange":
|
||||
{
|
||||
if (previousToken is NumericToken numeric)
|
||||
{
|
||||
CidRangeParser.Parse(numeric, scanner, builder, isLenientParsing);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException("Unexpected token preceding start of Cid ranges: " + previousToken);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (token is NameToken name)
|
||||
{
|
||||
ParseName(name, scanner, builder, isLenientParsing);
|
||||
CidFontNameParser.Parse(name, scanner, builder, isLenientParsing);
|
||||
}
|
||||
|
||||
previousToken = token;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private static void ParseCodespaceRange(NumericToken count, ITokenScanner tokenScanner, CharacterMapBuilder builder)
|
||||
{
|
||||
/*
|
||||
* For example:
|
||||
3 begincodespacerange
|
||||
<00> <80>
|
||||
<8140> <9ffc>
|
||||
<a0> <de>
|
||||
endcodespacerange
|
||||
*/
|
||||
|
||||
var ranges = new List<CodespaceRange>(count.Int);
|
||||
|
||||
for (var i = 0; i < count.Int; i++)
|
||||
{
|
||||
if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken start))
|
||||
{
|
||||
throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
|
||||
}
|
||||
|
||||
if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken end))
|
||||
{
|
||||
throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
|
||||
}
|
||||
|
||||
ranges.Add(new CodespaceRange(start.Bytes, end.Bytes));
|
||||
}
|
||||
|
||||
builder.CodespaceRanges = ranges;
|
||||
}
|
||||
|
||||
private static void ParseBaseFontCharacters(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder)
|
||||
{
|
||||
for (var i = 0; i < numeric.Int; i++)
|
||||
{
|
||||
if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken inputCode))
|
||||
{
|
||||
throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
|
||||
}
|
||||
|
||||
if (!tokenScanner.MoveNext())
|
||||
{
|
||||
throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
|
||||
}
|
||||
|
||||
if (tokenScanner.CurrentToken is NameToken characterName)
|
||||
{
|
||||
builder.AddBaseFontCharacter(inputCode.Bytes, characterName.Data.Name);
|
||||
}
|
||||
else if (tokenScanner.CurrentToken is HexToken characterCode)
|
||||
{
|
||||
builder.AddBaseFontCharacter(inputCode.Bytes, characterCode.Bytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static IReadOnlyList<CidCharacterMapping> ParseCidCharacters(NumericToken numeric, ITokenScanner scanner)
|
||||
{
|
||||
var results = new List<CidCharacterMapping>();
|
||||
|
||||
for (var i = 0; i < numeric.Int; i++)
|
||||
{
|
||||
if (!scanner.TryReadToken(out HexToken sourceCode))
|
||||
{
|
||||
throw new InvalidOperationException("The first token in a line for Cid Characters should be a hex, instead it was: " + scanner.CurrentToken);
|
||||
}
|
||||
|
||||
if (!scanner.TryReadToken(out NumericToken destinationCode))
|
||||
{
|
||||
throw new InvalidOperationException("The destination token in a line for Cid Character should be an integer, instead it was: " + scanner.CurrentToken);
|
||||
}
|
||||
|
||||
var sourceInteger = sourceCode.Bytes.ToInt(sourceCode.Bytes.Count);
|
||||
var mapping = new CidCharacterMapping(sourceInteger, destinationCode.Int);
|
||||
|
||||
results.Add(mapping);
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private static void ParseName(NameToken nameToken, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
|
||||
{
|
||||
switch (nameToken.Data.Name)
|
||||
{
|
||||
case "WMode":
|
||||
{
|
||||
var next = TryMoveNext(scanner);
|
||||
if (next is NumericToken numeric)
|
||||
{
|
||||
builder.WMode = numeric.Int;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case "CMapName":
|
||||
{
|
||||
var next = TryMoveNext(scanner);
|
||||
if (next is NameToken name)
|
||||
{
|
||||
builder.Name = name.Data.Name;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case "CMapVersion":
|
||||
{
|
||||
var next = TryMoveNext(scanner);
|
||||
if (next is NumericToken number)
|
||||
{
|
||||
builder.Version = number.Data.ToString(NumberFormatInfo.InvariantInfo);
|
||||
}
|
||||
else if (next is StringToken stringToken)
|
||||
{
|
||||
builder.Version = stringToken.Data;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case "CMapType":
|
||||
{
|
||||
var next = TryMoveNext(scanner);
|
||||
if (next is NumericToken numeric)
|
||||
{
|
||||
builder.Type = numeric.Int;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case "Registry":
|
||||
{
|
||||
throw new NotImplementedException("Registry should be in a dictionary");
|
||||
}
|
||||
case "Ordering":
|
||||
{
|
||||
throw new NotImplementedException("Ordering should be in a dictionary");
|
||||
}
|
||||
case "Supplement":
|
||||
{
|
||||
throw new NotImplementedException("Supplement should be in a dictionary");
|
||||
}
|
||||
case "CIDSystemInfo":
|
||||
{
|
||||
var next = TryMoveNext(scanner);
|
||||
|
||||
if (next is DictionaryToken dictionary)
|
||||
{
|
||||
builder.CharacterIdentifierSystemInfo = GetCharacterIdentifier(dictionary, isLenientParsing);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static CharacterIdentifierSystemInfo GetCharacterIdentifier(DictionaryToken dictionary, bool isLenientParsing)
|
||||
{
|
||||
string GetErrorMessage(string missingKey)
|
||||
{
|
||||
return $"No {missingKey} found in the CIDSystemInfo dictionary: " + dictionary;
|
||||
}
|
||||
|
||||
if (!dictionary.TryGetByName(CosName.REGISTRY, out var registry) || !(registry is StringToken registryString))
|
||||
{
|
||||
if (isLenientParsing)
|
||||
{
|
||||
registryString = new StringToken("Adobe");
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException(GetErrorMessage("registry"));
|
||||
}
|
||||
}
|
||||
|
||||
if (!dictionary.TryGetByName(CosName.ORDERING, out var ordering) || !(ordering is StringToken orderingString))
|
||||
{
|
||||
if (isLenientParsing)
|
||||
{
|
||||
orderingString = new StringToken("");
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException(GetErrorMessage("ordering"));
|
||||
}
|
||||
}
|
||||
|
||||
if (!dictionary.TryGetByName(CosName.SUPPLEMENT, out var supplement) || !(supplement is NumericToken supplementNumeric))
|
||||
{
|
||||
if (isLenientParsing)
|
||||
{
|
||||
supplementNumeric = new NumericToken(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException(GetErrorMessage("supplement"));
|
||||
}
|
||||
}
|
||||
|
||||
return new CharacterIdentifierSystemInfo(registryString.Data, orderingString.Data, supplementNumeric.Int);
|
||||
}
|
||||
|
||||
[CanBeNull]
|
||||
private static IToken TryMoveNext(ITokenScanner scanner)
|
||||
{
|
||||
if (!scanner.MoveNext())
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
return scanner.CurrentToken;
|
||||
return builder.Build();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
namespace UglyToad.Pdf.Fonts.Parser.Parts
{
    using System;
    using Cmap;
    using Tokenization.Scanner;
    using Tokenization.Tokens;

    /// <summary>
    /// Reads a run of base font character entries from the scanner and adds each one to the builder.
    /// Every entry is a hex input code followed by either a name token or a hex token.
    /// </summary>
    internal class BaseFontCharacterParser : ICidFontPartParser<NumericToken>
    {
        /// <summary>
        /// Reads <paramref name="numeric"/> entries from <paramref name="tokenScanner"/> and registers
        /// them on <paramref name="builder"/>.
        /// </summary>
        /// <param name="numeric">The number of entries to read.</param>
        /// <param name="tokenScanner">The scanner positioned just before the first entry.</param>
        /// <param name="builder">The builder which receives the mappings.</param>
        /// <param name="isLenientParsing">Not used by this parser.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the scanner runs out of tokens or an entry has an unexpected token type.
        /// </exception>
        public void Parse(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing)
        {
            // Builds the error using the scanner's current token at the moment of failure.
            InvalidOperationException InvalidItem(int index)
            {
                return new InvalidOperationException($"Base font characters definition contains invalid item at index {index}: {tokenScanner.CurrentToken}");
            }

            for (var entry = 0; entry < numeric.Int; entry++)
            {
                // First the input code (must be hex), then advance to the value token.
                if (!tokenScanner.MoveNext()
                    || !(tokenScanner.CurrentToken is HexToken inputCode)
                    || !tokenScanner.MoveNext())
                {
                    throw InvalidItem(entry);
                }

                switch (tokenScanner.CurrentToken)
                {
                    case NameToken characterName:
                        builder.AddBaseFontCharacter(inputCode.Bytes, characterName.Data.Name);
                        break;
                    case HexToken characterCode:
                        builder.AddBaseFontCharacter(inputCode.Bytes, characterCode.Bytes);
                        break;
                    default:
                        throw InvalidItem(entry);
                }
            }
        }
    }
}
|
||||
@@ -1,4 +1,4 @@
|
||||
namespace UglyToad.Pdf.Fonts.Parser
|
||||
namespace UglyToad.Pdf.Fonts.Parser.Parts
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
@@ -7,9 +7,9 @@
|
||||
using Tokenization.Scanner;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal class BaseFontRangeParser
|
||||
internal class BaseFontRangeParser : ICidFontPartParser<NumericToken>
|
||||
{
|
||||
public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder)
|
||||
public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
|
||||
{
|
||||
for (var i = 0; i < numeric.Int; i++)
|
||||
{
|
||||
36
src/UglyToad.Pdf/Fonts/Parser/Parts/CidCharacterParser.cs
Normal file
36
src/UglyToad.Pdf/Fonts/Parser/Parts/CidCharacterParser.cs
Normal file
@@ -0,0 +1,36 @@
|
||||
namespace UglyToad.Pdf.Fonts.Parser.Parts
{
    using System;
    using System.Collections.Generic;
    using Cmap;
    using Tokenization.Scanner;
    using Tokenization.Tokens;

    /// <summary>
    /// Reads a run of CID character entries, each a hex source code followed by a numeric
    /// destination, and stores the resulting mappings on the builder.
    /// </summary>
    internal class CidCharacterParser : ICidFontPartParser<NumericToken>
    {
        /// <summary>
        /// Reads <paramref name="numeric"/> entries from <paramref name="scanner"/> and assigns the
        /// resulting list to <see cref="CharacterMapBuilder.CidCharacterMappings"/>.
        /// </summary>
        /// <param name="numeric">The number of entries to read.</param>
        /// <param name="scanner">The scanner positioned just before the first entry.</param>
        /// <param name="builder">The builder which receives the mappings.</param>
        /// <param name="isLenientParsing">Not used by this parser.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when an entry does not consist of a hex token followed by a numeric token.
        /// </exception>
        public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
        {
            var mappings = new List<CidCharacterMapping>();

            for (var entry = 0; entry < numeric.Int; entry++)
            {
                if (!scanner.TryReadToken(out HexToken source))
                {
                    throw new InvalidOperationException("The first token in a line for Cid Characters should be a hex, instead it was: " + scanner.CurrentToken);
                }

                if (!scanner.TryReadToken(out NumericToken destination))
                {
                    throw new InvalidOperationException("The destination token in a line for Cid Character should be an integer, instead it was: " + scanner.CurrentToken);
                }

                // All bytes of the hex token are folded into the source code.
                var sourceBytes = source.Bytes;
                var sourceInteger = sourceBytes.ToInt(sourceBytes.Count);

                mappings.Add(new CidCharacterMapping(sourceInteger, destination.Int));
            }

            builder.CidCharacterMappings = mappings;
        }
    }
}
|
||||
128
src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontNameParser.cs
Normal file
128
src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontNameParser.cs
Normal file
@@ -0,0 +1,128 @@
|
||||
namespace UglyToad.Pdf.Fonts.Parser.Parts
{
    using System;
    using System.Globalization;
    using Cmap;
    using Cos;
    using Tokenization.Scanner;
    using Tokenization.Tokens;

    /// <summary>
    /// Handles name tokens encountered while parsing a CMap. For the recognised names
    /// (WMode, CMapName, CMapVersion, CMapType, CIDSystemInfo) the following token is read
    /// and, if it has the expected type, stored on the builder. Other names are ignored.
    /// </summary>
    internal class CidFontNameParser : ICidFontPartParser<NameToken>
    {
        /// <summary>
        /// Inspects <paramref name="nameToken"/> and, for recognised names, consumes the next
        /// token from <paramref name="scanner"/> to populate <paramref name="builder"/>.
        /// </summary>
        /// <param name="nameToken">The name token that was just read.</param>
        /// <param name="scanner">The scanner used to read the value following the name.</param>
        /// <param name="builder">The builder which receives the parsed value.</param>
        /// <param name="isLenientParsing">
        /// When <see langword="true"/>, missing CIDSystemInfo entries are replaced by defaults
        /// instead of causing an exception.
        /// </param>
        /// <exception cref="NotImplementedException">
        /// Thrown when Registry, Ordering or Supplement appear as stand-alone names.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// Thrown in non-lenient mode when the CIDSystemInfo dictionary lacks a required entry.
        /// </exception>
        public void Parse(NameToken nameToken, ITokenScanner scanner, CharacterMapBuilder builder,
            bool isLenientParsing)
        {
            switch (nameToken.Data.Name)
            {
                case "WMode":
                    if (scanner.TryReadToken(out NumericToken writingMode))
                    {
                        builder.WMode = writingMode.Int;
                    }

                    break;
                case "CMapName":
                    if (scanner.TryReadToken(out NameToken cmapName))
                    {
                        builder.Name = cmapName.Data.Name;
                    }

                    break;
                case "CMapVersion":
                    ReadVersion(scanner, builder);
                    break;
                case "CMapType":
                    if (scanner.TryReadToken(out NumericToken cmapType))
                    {
                        builder.Type = cmapType.Int;
                    }

                    break;
                case "Registry":
                    throw new NotImplementedException("Registry should be in a dictionary");
                case "Ordering":
                    throw new NotImplementedException("Ordering should be in a dictionary");
                case "Supplement":
                    throw new NotImplementedException("Supplement should be in a dictionary");
                case "CIDSystemInfo":
                    if (scanner.TryReadToken(out DictionaryToken systemInfo))
                    {
                        builder.CharacterIdentifierSystemInfo = GetCharacterIdentifier(systemInfo, isLenientParsing);
                    }

                    break;
            }
        }

        // Reads the token after CMapVersion; the version may be given as a number or a string.
        private static void ReadVersion(ITokenScanner scanner, CharacterMapBuilder builder)
        {
            if (!scanner.MoveNext())
            {
                return;
            }

            var versionToken = scanner.CurrentToken;

            if (versionToken is NumericToken number)
            {
                builder.Version = number.Data.ToString(NumberFormatInfo.InvariantInfo);
            }
            else if (versionToken is StringToken text)
            {
                builder.Version = text.Data;
            }
        }

        // Builds the system info from the dictionary's registry, ordering and supplement entries.
        // In lenient mode a missing or wrongly-typed entry falls back to "Adobe", "" and 0 respectively.
        private static CharacterIdentifierSystemInfo GetCharacterIdentifier(DictionaryToken dictionary, bool isLenientParsing)
        {
            string GetErrorMessage(string missingKey) =>
                $"No {missingKey} found in the CIDSystemInfo dictionary: " + dictionary;

            if (!(dictionary.TryGetByName(CosName.REGISTRY, out var registryToken) && registryToken is StringToken registry))
            {
                if (!isLenientParsing)
                {
                    throw new InvalidOperationException(GetErrorMessage("registry"));
                }

                registry = new StringToken("Adobe");
            }

            if (!(dictionary.TryGetByName(CosName.ORDERING, out var orderingToken) && orderingToken is StringToken ordering))
            {
                if (!isLenientParsing)
                {
                    throw new InvalidOperationException(GetErrorMessage("ordering"));
                }

                ordering = new StringToken("");
            }

            if (!(dictionary.TryGetByName(CosName.SUPPLEMENT, out var supplementToken) && supplementToken is NumericToken supplement))
            {
                if (!isLenientParsing)
                {
                    throw new InvalidOperationException(GetErrorMessage("supplement"));
                }

                supplement = new NumericToken(0);
            }

            return new CharacterIdentifierSystemInfo(registry.Data, ordering.Data, supplement.Int);
        }
    }
}
|
||||
46
src/UglyToad.Pdf/Fonts/Parser/Parts/CidRangeParser.cs
Normal file
46
src/UglyToad.Pdf/Fonts/Parser/Parts/CidRangeParser.cs
Normal file
@@ -0,0 +1,46 @@
|
||||
namespace UglyToad.Pdf.Fonts.Parser.Parts
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Cmap;
|
||||
using Tokenization.Scanner;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
/// <summary>
/// Reads a run of CID range entries, each written as a start hex code, an end hex code and a number,
/// and stores the resulting ranges on the <see cref="CharacterMapBuilder"/>.
/// </summary>
internal class CidRangeParser : ICidFontPartParser<NumericToken>
{
    /// <summary>
    /// Read the number of entries given by <paramref name="numeric"/> from the scanner and set them
    /// as the CID ranges of the builder.
    /// </summary>
    /// <param name="numeric">The number of entries expected to follow.</param>
    /// <param name="scanner">The scanner from which the three tokens of each entry are read.</param>
    /// <param name="builder">The builder whose CID ranges are set.</param>
    /// <param name="isLenientParsing">Not used by this parser.</param>
    /// <exception cref="InvalidOperationException">
    /// One of the three tokens of an entry could not be read.
    /// </exception>
    public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
    {
        // NOTE(review): the builder's ranges are replaced below rather than appended to, so if this
        // parser runs more than once for the same builder only the last run is kept - confirm intent.
        var ranges = new List<CidRange>();

        for (var i = 0; i < numeric.Int; i++)
        {
            if (!scanner.TryReadToken(out HexToken startHexToken))
            {
                throw new InvalidOperationException(GetMissingTokenMessage("starting hex token", i, numeric.Int));
            }

            if (!scanner.TryReadToken(out HexToken endHexToken))
            {
                throw new InvalidOperationException(GetMissingTokenMessage("end hex token", i, numeric.Int));
            }

            if (!scanner.TryReadToken(out NumericToken mappedCode))
            {
                throw new InvalidOperationException(GetMissingTokenMessage("numeric mapped code", i, numeric.Int));
            }

            var start = HexToken.ConvertHexBytesToInt(startHexToken);
            var end = HexToken.ConvertHexBytesToInt(endHexToken);

            var range = new CidRange((char)start, (char)end, mappedCode.Int);

            ranges.Add(range);
        }

        builder.CidRanges = ranges;
    }

    /// <summary>
    /// Compose the message for an entry which is missing one of its tokens.
    /// </summary>
    private static string GetMissingTokenMessage(string description, int index, int count)
    {
        return $"Could not read the {description} for CID range entry {index + 1} of {count}.";
    }
}
|
||||
}
|
||||
42
src/UglyToad.Pdf/Fonts/Parser/Parts/CodespaceRangeParser.cs
Normal file
42
src/UglyToad.Pdf/Fonts/Parser/Parts/CodespaceRangeParser.cs
Normal file
@@ -0,0 +1,42 @@
|
||||
namespace UglyToad.Pdf.Fonts.Parser.Parts
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Cmap;
|
||||
using Tokenization.Scanner;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
/// <summary>
/// Reads the entries of a codespace range section, each a pair of hex tokens giving the start and
/// end of a range, and stores them on the <see cref="CharacterMapBuilder"/>.
/// </summary>
internal class CodespaceRangeParser : ICidFontPartParser<NumericToken>
{
    /// <summary>
    /// Read the number of start/end pairs given by <paramref name="numeric"/> and set them as the
    /// codespace ranges of the builder. A count of zero or less produces an empty list.
    /// </summary>
    /// <param name="numeric">The number of ranges expected to follow.</param>
    /// <param name="tokenScanner">The scanner from which the hex tokens are read.</param>
    /// <param name="builder">The builder whose codespace ranges are set.</param>
    /// <param name="isLenientParsing">Not used by this parser.</param>
    /// <exception cref="InvalidOperationException">
    /// The scanner ran out of tokens or produced a token which is not a hex token.
    /// </exception>
    public void Parse(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing)
    {
        /*
         * For example:
         3 begincodespacerange
            <00>    <80>
            <8140>  <9ffc>
            <a0>    <de>
         endcodespacerange
         */

        var count = numeric.Int;

        // The count comes straight from the file so it is not used as the initial capacity: a negative
        // value would make the list constructor throw and a very large one would allocate memory
        // before a single range has been validated.
        var ranges = new List<CodespaceRange>();

        for (var i = 0; i < count; i++)
        {
            if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken start))
            {
                throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
            }

            if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken end))
            {
                throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
            }

            ranges.Add(new CodespaceRange(start.Bytes, end.Bytes));
        }

        builder.CodespaceRanges = ranges;
    }
}
|
||||
}
|
||||
17
src/UglyToad.Pdf/Fonts/Parser/Parts/ICidFontPartParser.cs
Normal file
17
src/UglyToad.Pdf/Fonts/Parser/Parts/ICidFontPartParser.cs
Normal file
@@ -0,0 +1,17 @@
|
||||
namespace UglyToad.Pdf.Fonts.Parser.Parts
|
||||
{
|
||||
using Cmap;
|
||||
using Tokenization.Scanner;
|
||||
|
||||
/// <summary>
|
||||
/// Provides parsing for a certain operator type in a CID font definition.
|
||||
/// </summary>
|
||||
/// <typeparam name="TToken">The type of the token preceding the operation we wish to parse.</typeparam>
|
||||
internal interface ICidFontPartParser<in TToken>
|
||||
{
|
||||
/// <summary>
|
||||
/// Parse the definition for this part of the CID font and write the results to the <see cref="CharacterMapBuilder"/>.
|
||||
/// </summary>
|
||||
/// <param name="previous">The token which preceded the operation being parsed.</param>
/// <param name="tokenScanner">The scanner from which the tokens belonging to this part are read.</param>
/// <param name="builder">The builder which receives the parsed values.</param>
/// <param name="isLenientParsing">
/// Whether malformed input should be tolerated; how (and whether) this is honoured is up to each implementation.
/// </param>
void Parse(TToken previous, ITokenScanner tokenScanner, CharacterMapBuilder builder, bool isLenientParsing);
|
||||
}
|
||||
}
|
||||
@@ -70,5 +70,19 @@ namespace UglyToad.Pdf.Tokenization.Tokens
|
||||
Bytes = bytes;
|
||||
Data = builder.ToString();
|
||||
}
|
||||
|
||||
/// <summary>
/// Interprets the bytes of a hex token as a single integer, most significant byte first.
/// </summary>
/// <param name="token">The hex token whose <see cref="Bytes"/> are combined.</param>
/// <returns>
/// The combined value, or 0 when the token holds no bytes. A one byte token gives a value from
/// 0 to 255 and a two byte token a value from 0 to 65535.
/// </returns>
/// <remarks>
/// An <see cref="int"/> holds four bytes, so for longer tokens only the last four bytes
/// contribute to the result.
/// </remarks>
public static int ConvertHexBytesToInt(HexToken token)
{
    var bytes = token.Bytes;

    var value = 0;

    // Fold every byte in rather than special-casing a length of two, so that three and four
    // byte codes are not silently reduced to their first byte.
    for (var i = 0; i < bytes.Count; i++)
    {
        value = (value << 8) | bytes[i];
    }

    return value;
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user