Change hex token construction to use a lookup for mapping characters to bytes. Add parsing for codespace ranges and base font characters in the CMap.

This commit is contained in:
Eliot Jones 2017-11-14 22:58:06 +00:00
parent 511385a253
commit 2e5aa37c85
6 changed files with 213 additions and 73 deletions

View File

@ -1,5 +1,10 @@
namespace UglyToad.Pdf.Fonts.Cmap
{
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Util;
/// <summary>
/// A mutable class used when parsing and generating a <see cref="CMap"/>.
/// </summary>
@ -44,5 +49,39 @@
/// Defined as required.
/// </remarks>
public int Type { get; set; } = -1;
/// <summary>
/// The codespace ranges recorded for the CMap being built.
/// </summary>
/// <remarks>
/// There is no initializer, so this is <c>null</c> until a value is assigned.
/// </remarks>
public IReadOnlyList<CodespaceRange> CodespaceRanges { get; set; }
/// <summary>
/// Maps an integer character code to its string value.
/// Entries are written by <see cref="AddBaseFontCharacter(IReadOnlyList{byte}, string)"/>.
/// </summary>
public Dictionary<int, string> BaseFontCharacterMap { get; } = new Dictionary<int, string>();
/// <summary>
/// Registers a base font character whose value is supplied as raw bytes.
/// </summary>
/// <param name="bytes">The bytes of the input character code.</param>
/// <param name="value">The bytes of the value; they are converted to a string before being stored.</param>
public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, IReadOnlyList<byte> value)
{
    var rawValue = value.ToArray();
    var decoded = CreateStringFromBytes(rawValue);

    // Hand over to the string overload, which computes the key and stores the entry.
    AddBaseFontCharacter(bytes, decoded);
}
/// <summary>
/// Registers a base font character with an already decoded string value.
/// </summary>
/// <param name="bytes">The bytes of the input character code; all of them are folded into the integer key.</param>
/// <param name="value">The string stored for that code. An existing entry for the same code is replaced.</param>
public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, string value)
{
    BaseFontCharacterMap[GetCodeFromArray(bytes, bytes.Count)] = value;
}
/// <summary>
/// Combines the first <paramref name="length"/> bytes of <paramref name="data"/> into a single
/// integer, treating the first byte as the most significant.
/// </summary>
/// <param name="data">The bytes of the character code.</param>
/// <param name="length">How many bytes to read from the start of <paramref name="data"/>.</param>
/// <returns>The combined code, for example <c>{ 0x81, 0x40 }</c> gives <c>0x8140</c>.</returns>
/// <remarks>
/// The result is a 32-bit <see cref="int"/>, so when more than four bytes are read the earliest
/// bytes are shifted out of the value.
/// </remarks>
private static int GetCodeFromArray(IReadOnlyList<byte> data, int length)
{
    var code = 0;

    for (var i = 0; i < length; i++)
    {
        // A C# byte is already unsigned (0-255), so it can be OR-ed in directly.
        code = (code << 8) | data[i];
    }

    return code;
}
/// <summary>
/// Converts the bytes of a character value into a string.
/// </summary>
/// <param name="bytes">The raw bytes of the value.</param>
/// <returns>
/// For exactly one byte, the result of <c>OtherEncodings.BytesAsLatin1String</c>;
/// otherwise the bytes decoded as big-endian UTF-16.
/// </returns>
private string CreateStringFromBytes(byte[] bytes)
{
    if (bytes.Length == 1)
    {
        return OtherEncodings.BytesAsLatin1String(bytes);
    }

    return Encoding.BigEndianUnicode.GetString(bytes);
}
}
}

View File

@ -4,7 +4,7 @@
internal static class CmapUtils
{
public static int ToInt(this byte[] data, int length)
public static int ToInt(this IReadOnlyList<byte> data, int length)
{
int code = 0;
for (int i = 0; i < length; ++i)

View File

@ -1,66 +1,32 @@
using System;
using System.Collections.Generic;
using System.Text;
namespace UglyToad.Pdf.Fonts.Cmap
namespace UglyToad.Pdf.Fonts.Cmap
{
using System.Collections.Generic;
/// <summary>
/// A codespace range is specified by a pair of codes of some particular length giving the lower and upper bounds of that range.
/// </summary>
public class CodespaceRange
{
private byte[] start;
private byte[] end;
private int startInt;
private int endInt;
public IReadOnlyList<byte> Start { get; }
public int CodeLength { get; private set; }
public IReadOnlyList<byte> End { get; }
/**
* Creates a new instance of CodespaceRange.
*/
public CodespaceRange()
public int StartInt { get; }
public int EndInt { get; }
public int CodeLength { get; }
/// <summary>
/// Creates a new instance of <see cref="CodespaceRange"/>.
/// </summary>
public CodespaceRange(IReadOnlyList<byte> start, IReadOnlyList<byte> end)
{
}
/** Getter for property end.
* @return Value of property end.
*
*/
public byte[] getEnd()
{
return end;
}
/** Setter for property end.
* @param endBytes New value of property end.
*
*/
void setEnd(byte[] endBytes)
{
end = endBytes;
endInt = endBytes.ToInt(endBytes.Length);
}
/** Getter for property start.
* @return Value of property start.
*
*/
public byte[] getStart()
{
return start;
}
/** Setter for property start.
* @param startBytes New value of property start.
*
*/
void setStart(byte[] startBytes)
{
start = startBytes;
CodeLength = start.Length;
startInt = startBytes.ToInt(startBytes.Length);
Start = start;
End = end;
StartInt = start.ToInt(start.Count);
EndInt = end.ToInt(end.Count);
CodeLength = start.Count;
}
/**
@ -80,7 +46,7 @@ namespace UglyToad.Pdf.Fonts.Cmap
if (codeLen == CodeLength)
{
int value = code.ToInt(codeLen);
if (value >= startInt && value <= endInt)
if (value >= StartInt && value <= EndInt)
{
return true;
}

View File

@ -1,6 +1,7 @@
namespace UglyToad.Pdf.Fonts.Parser
{
using System;
using System.Collections.Generic;
using System.Globalization;
using Cmap;
using Cos;
@ -27,7 +28,38 @@
{
switch (operatorToken.Data)
{
default:
case "usecmap":
throw new NotImplementedException("External CMap files not yet supported, please submit a pull request!");
case "begincodespacerange":
{
if (previousToken is NumericToken numeric)
{
ParseCodespaceRange(numeric, scanner, builder);
}
else
{
throw new InvalidOperationException("Unexpected token preceding start of codespace range: " + previousToken);
}
}
break;
case "beginbfchar":
{
if (previousToken is NumericToken numeric)
{
ParseBaseFontCharacters(numeric, scanner, builder);
}
else
{
throw new InvalidOperationException("Unexpected token preceding start of base font characters: " + previousToken);
}
}
break;
case "beginbfrange":
break;
case "begincidchar":
break;
case "begingcidrange":
break;
}
}
@ -42,6 +74,66 @@
return null;
}
/// <summary>
/// Reads the start/end hex token pairs that follow a <c>begincodespacerange</c> operator and
/// records them on the builder.
/// </summary>
/// <param name="count">The numeric token preceding the operator; its value is the number of pairs to read.</param>
/// <param name="tokenScanner">The scanner positioned on the operator; it is advanced two tokens per pair.</param>
/// <param name="builder">The builder whose <c>CodespaceRanges</c> is updated.</param>
/// <exception cref="InvalidOperationException">
/// The scanner runs out of tokens, or a token inside the range is not a hex token.
/// </exception>
private static void ParseCodespaceRange(NumericToken count, ITokenScanner tokenScanner, CharacterMapBuilder builder)
{
    /*
     * For example:
     3 begincodespacerange
        <00>    <80>
        <8140>  <9ffc>
        <a0>    <de>
     endcodespacerange
     */

    // Start from any ranges an earlier section already stored, so a CMap with several
    // begincodespacerange sections keeps all of them rather than only the last.
    // The list is deliberately not presized from the parsed count: the count comes from the
    // document, and a negative or very large value should not throw or allocate up front.
    var ranges = new List<CodespaceRange>();
    if (builder.CodespaceRanges != null)
    {
        ranges.AddRange(builder.CodespaceRanges);
    }

    for (var i = 0; i < count.Int; i++)
    {
        if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken start))
        {
            throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
        }

        if (!tokenScanner.MoveNext() || !(tokenScanner.CurrentToken is HexToken end))
        {
            throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
        }

        ranges.Add(new CodespaceRange(start.Bytes, end.Bytes));
    }

    // Assigned only after every pair parsed, so a failure leaves the builder untouched.
    builder.CodespaceRanges = ranges;
}
/// <summary>
/// Reads the entries that follow a <c>beginbfchar</c> operator and adds each one to the builder.
/// </summary>
/// <remarks>
/// Each entry is an input code written as a hex token, followed by either a name token or a
/// hex token giving the value.
/// </remarks>
/// <param name="numeric">The numeric token preceding the operator; its value is the number of entries to read.</param>
/// <param name="tokenScanner">The scanner positioned on the operator; it is advanced two tokens per entry.</param>
/// <param name="builder">The builder that receives each entry.</param>
/// <exception cref="InvalidOperationException">
/// The scanner runs out of tokens, or an entry does not have the expected token types.
/// </exception>
private static void ParseBaseFontCharacters(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder)
{
    // Builds the failure for the entry at the given position, reporting the scanner's current token.
    InvalidOperationException InvalidItem(int position) =>
        new InvalidOperationException($"Base font characters definition contains invalid item at index {position}: {tokenScanner.CurrentToken}");

    for (var index = 0; index < numeric.Int; index++)
    {
        // First token of the entry: the input code, which must be a hex token.
        var inputCode = tokenScanner.MoveNext() ? tokenScanner.CurrentToken as HexToken : null;
        if (inputCode == null)
        {
            throw InvalidItem(index);
        }

        // Second token of the entry: the value, given either as a name or as hex bytes.
        if (!tokenScanner.MoveNext())
        {
            throw InvalidItem(index);
        }

        switch (tokenScanner.CurrentToken)
        {
            case NameToken characterName:
                builder.AddBaseFontCharacter(inputCode.Bytes, characterName.Data.Name);
                break;
            case HexToken characterCode:
                builder.AddBaseFontCharacter(inputCode.Bytes, characterCode.Bytes);
                break;
            default:
                throw InvalidItem(index);
        }
    }
}
private static void ParseName(NameToken nameToken, CoreTokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
{
switch (nameToken.Data.Name)

View File

@ -1,6 +1,6 @@
namespace UglyToad.Pdf.Tokenization
{
using System.Text;
using System.Collections.Generic;
using IO;
using Parser.Parts;
using Tokens;
@ -18,7 +18,7 @@
return false;
}
var characters = new StringBuilder();
var characters = new List<char>();
while (inputBytes.MoveNext())
{
@ -39,10 +39,10 @@
return false;
}
characters.Append((char)current);
characters.Add((char)current);
}
token = new HexToken(characters.ToString());
token = new HexToken(characters);
return true;
}

View File

@ -1,30 +1,73 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
using System;
using System.Collections.Generic;
using System.Text;
public class HexToken : IDataToken<string>
{
/// <summary>
/// Lookup from a hexadecimal digit character to its 4-bit value.
/// Both upper-case and lower-case letters are present.
/// </summary>
private static readonly Dictionary<char, byte> HexMap = new Dictionary<char, byte>
{
    // Decimal digits.
    ['0'] = 0x00,
    ['1'] = 0x01,
    ['2'] = 0x02,
    ['3'] = 0x03,
    ['4'] = 0x04,
    ['5'] = 0x05,
    ['6'] = 0x06,
    ['7'] = 0x07,
    ['8'] = 0x08,
    ['9'] = 0x09,
    // Letters, each in upper and lower case.
    ['A'] = 0x0A,
    ['a'] = 0x0A,
    ['B'] = 0x0B,
    ['b'] = 0x0B,
    ['C'] = 0x0C,
    ['c'] = 0x0C,
    ['D'] = 0x0D,
    ['d'] = 0x0D,
    ['E'] = 0x0E,
    ['e'] = 0x0E,
    ['F'] = 0x0F,
    ['f'] = 0x0F
};
/// <summary>
/// Builds one byte from two hexadecimal digit characters.
/// </summary>
/// <param name="high">The digit supplying the upper four bits.</param>
/// <param name="low">The digit supplying the lower four bits.</param>
/// <returns>The combined byte, for example <c>'8'</c> and <c>'1'</c> give <c>0x81</c>.</returns>
/// <exception cref="KeyNotFoundException">Either character has no entry in the hex lookup.</exception>
private static byte Convert(char high, char low)
{
    var upperNibble = HexMap[high] << 4;
    var combined = upperNibble | HexMap[low];

    return (byte)combined;
}
public string Data { get; }
public IReadOnlyList<byte> Bytes { get; }
public HexToken(string characters)
public HexToken(IReadOnlyList<char> characters)
{
if (characters.Length % 2 != 0)
{
characters += "0";
}
var bytes = new List<byte>();
var builder = new StringBuilder();
byte[] raw = new byte[characters.Length / 2];
for (int i = 0; i < raw.Length; i++)
for (int i = 0; i < characters.Count; i += 2)
{
builder.Append((char)Convert.ToByte(characters.Substring(i * 2, 2), 16));
char high = characters[i];
char low;
if (i == characters.Count - 1)
{
low = '0';
}
else
{
low = characters[i + 1];
}
var b = Convert(high, low);
bytes.Add(b);
builder.Append((char)b);
}
Bytes = raw;
Bytes = bytes;
Data = builder.ToString();
}
}