mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-07-17 05:11:51 +08:00
change hex token construction to use a lookup for mapping to bytes. add parsing for codespace range and base font characters in the cmap
This commit is contained in:
parent
511385a253
commit
2e5aa37c85
@ -1,5 +1,10 @@
|
||||
namespace UglyToad.Pdf.Fonts.Cmap
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using Util;
|
||||
|
||||
/// <summary>
|
||||
/// A mutable class used when parsing and generating a <see cref="CMap"/>.
|
||||
/// </summary>
|
||||
@ -44,5 +49,39 @@
|
||||
/// Defined as required.
|
||||
/// </remarks>
|
||||
public int Type { get; set; } = -1;
|
||||
|
||||
public IReadOnlyList<CodespaceRange> CodespaceRanges { get; set; }
|
||||
|
||||
public Dictionary<int, string> BaseFontCharacterMap { get; } = new Dictionary<int, string>();
|
||||
|
||||
/// <summary>
/// Adds a base font character mapping whose target is supplied as raw bytes.
/// The target bytes are converted to a string before the mapping is stored.
/// </summary>
/// <param name="bytes">The bytes of the input character code.</param>
/// <param name="value">The bytes of the value the code maps to.</param>
public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, IReadOnlyList<byte> value)
{
    // The string conversion works on an array, so copy the value out first.
    var rawValue = value.ToArray();
    var text = CreateStringFromBytes(rawValue);

    AddBaseFontCharacter(bytes, text);
}
|
||||
|
||||
/// <summary>
/// Stores <paramref name="value"/> in <see cref="BaseFontCharacterMap"/> under the integer code
/// built from all of <paramref name="bytes"/>. An existing entry for the same code is replaced.
/// </summary>
/// <param name="bytes">The bytes of the input character code.</param>
/// <param name="value">The string the code maps to.</param>
public void AddBaseFontCharacter(IReadOnlyList<byte> bytes, string value)
{
    BaseFontCharacterMap[GetCodeFromArray(bytes, bytes.Count)] = value;
}
|
||||
|
||||
/// <summary>
/// Packs the first <paramref name="length"/> bytes of <paramref name="data"/> into a single integer,
/// with the first byte ending up in the most significant position.
/// </summary>
/// <param name="data">The bytes to combine.</param>
/// <param name="length">How many bytes from the start of <paramref name="data"/> to use.</param>
/// <returns>The combined integer code; 0 when <paramref name="length"/> is 0.</returns>
private int GetCodeFromArray(IReadOnlyList<byte> data, int length)
{
    var result = 0;
    var index = 0;

    while (index < length)
    {
        // A byte is already in the range 0-255, so it can be merged in directly after shifting.
        result = (result << 8) | data[index];
        index++;
    }

    return result;
}
|
||||
|
||||
/// <summary>
/// Converts bytes to a string. Exactly one byte is handed to
/// <c>OtherEncodings.BytesAsLatin1String</c>; any other length is decoded as big-endian UTF-16.
/// </summary>
/// <param name="bytes">The bytes to convert.</param>
/// <returns>The decoded string.</returns>
private string CreateStringFromBytes(byte[] bytes)
{
    if (bytes.Length != 1)
    {
        return Encoding.BigEndianUnicode.GetString(bytes);
    }

    return OtherEncodings.BytesAsLatin1String(bytes);
}
|
||||
}
|
||||
}
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
internal static class CmapUtils
|
||||
{
|
||||
public static int ToInt(this byte[] data, int length)
|
||||
public static int ToInt(this IReadOnlyList<byte> data, int length)
|
||||
{
|
||||
int code = 0;
|
||||
for (int i = 0; i < length; ++i)
|
||||
|
@ -1,66 +1,32 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
|
||||
namespace UglyToad.Pdf.Fonts.Cmap
|
||||
namespace UglyToad.Pdf.Fonts.Cmap
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
|
||||
/// A codespace range is specified by a pair of codes of some particular length giving the lower and upper bounds of that range.
|
||||
/// </summary>
|
||||
public class CodespaceRange
|
||||
{
|
||||
private byte[] start;
|
||||
private byte[] end;
|
||||
private int startInt;
|
||||
private int endInt;
|
||||
public IReadOnlyList<byte> Start { get; }
|
||||
|
||||
public int CodeLength { get; private set; }
|
||||
public IReadOnlyList<byte> End { get; }
|
||||
|
||||
/**
|
||||
* Creates a new instance of CodespaceRange.
|
||||
*/
|
||||
public CodespaceRange()
|
||||
public int StartInt { get; }
|
||||
|
||||
public int EndInt { get; }
|
||||
|
||||
public int CodeLength { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new instance of <see cref="CodespaceRange"/>.
|
||||
/// </summary>
|
||||
public CodespaceRange(IReadOnlyList<byte> start, IReadOnlyList<byte> end)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
/** Getter for property end.
|
||||
* @return Value of property end.
|
||||
*
|
||||
*/
|
||||
public byte[] getEnd()
|
||||
{
|
||||
return end;
|
||||
}
|
||||
|
||||
/** Setter for property end.
|
||||
* @param endBytes New value of property end.
|
||||
*
|
||||
*/
|
||||
void setEnd(byte[] endBytes)
|
||||
{
|
||||
end = endBytes;
|
||||
endInt = endBytes.ToInt(endBytes.Length);
|
||||
}
|
||||
|
||||
/** Getter for property start.
|
||||
* @return Value of property start.
|
||||
*
|
||||
*/
|
||||
public byte[] getStart()
|
||||
{
|
||||
return start;
|
||||
}
|
||||
|
||||
/** Setter for property start.
|
||||
* @param startBytes New value of property start.
|
||||
*
|
||||
*/
|
||||
void setStart(byte[] startBytes)
|
||||
{
|
||||
start = startBytes;
|
||||
CodeLength = start.Length;
|
||||
startInt = startBytes.ToInt(startBytes.Length);
|
||||
Start = start;
|
||||
End = end;
|
||||
StartInt = start.ToInt(start.Count);
|
||||
EndInt = end.ToInt(end.Count);
|
||||
CodeLength = start.Count;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -80,7 +46,7 @@ namespace UglyToad.Pdf.Fonts.Cmap
|
||||
if (codeLen == CodeLength)
|
||||
{
|
||||
int value = code.ToInt(codeLen);
|
||||
if (value >= startInt && value <= endInt)
|
||||
if (value >= StartInt && value <= EndInt)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
namespace UglyToad.Pdf.Fonts.Parser
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Globalization;
|
||||
using Cmap;
|
||||
using Cos;
|
||||
@ -27,7 +28,38 @@
|
||||
{
|
||||
switch (operatorToken.Data)
|
||||
{
|
||||
default:
|
||||
case "usecmap":
|
||||
throw new NotImplementedException("External CMap files not yet supported, please submit a pull request!");
|
||||
case "begincodespacerange":
|
||||
{
|
||||
if (previousToken is NumericToken numeric)
|
||||
{
|
||||
ParseCodespaceRange(numeric, scanner, builder);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException("Unexpected token preceding start of codespace range: " + previousToken);
|
||||
}
|
||||
|
||||
}
|
||||
break;
|
||||
case "beginbfchar":
|
||||
{
|
||||
if (previousToken is NumericToken numeric)
|
||||
{
|
||||
ParseBaseFontCharacters(numeric, scanner, builder);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException("Unexpected token preceding start of base font characters: " + previousToken);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case "beginbfrange":
|
||||
break;
|
||||
case "begincidchar":
|
||||
break;
|
||||
case "begingcidrange":
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -42,6 +74,66 @@
|
||||
return null;
|
||||
}
|
||||
|
||||
/// <summary>
/// Reads <c>count</c> pairs of hex tokens from the scanner, turns each pair into a
/// <see cref="CodespaceRange"/> and assigns the resulting list to the builder.
/// </summary>
/// <param name="count">The number of start/end pairs to read.</param>
/// <param name="tokenScanner">The scanner supplying the tokens.</param>
/// <param name="builder">The builder which receives the parsed ranges.</param>
/// <exception cref="InvalidOperationException">The scanner ran out of tokens or produced a token which is not a hex token.</exception>
private static void ParseCodespaceRange(NumericToken count, ITokenScanner tokenScanner, CharacterMapBuilder builder)
{
    // Example of the section being read:
    //
    //   3 begincodespacerange
    //   <00> <80>
    //   <8140> <9ffc>
    //   <a0> <de>
    //   endcodespacerange

    // Advances the scanner by one token and returns it, insisting that it is a hex token.
    HexToken ReadBoundary()
    {
        if (tokenScanner.MoveNext() && tokenScanner.CurrentToken is HexToken hex)
        {
            return hex;
        }

        throw new InvalidOperationException("Codespace range contains an unexpected token: " + tokenScanner.CurrentToken);
    }

    var parsed = new List<CodespaceRange>(count.Int);

    // One range is added per pass, so the list size doubles as the loop counter.
    while (parsed.Count < count.Int)
    {
        var lower = ReadBoundary();
        var upper = ReadBoundary();

        parsed.Add(new CodespaceRange(lower.Bytes, upper.Bytes));
    }

    builder.CodespaceRanges = parsed;
}
|
||||
|
||||
/// <summary>
/// Reads <c>numeric</c> base font character entries from the scanner and adds each one to the builder.
/// Every entry is a hex token (the input code) followed by either a name token or a hex token (the mapped value).
/// </summary>
/// <param name="numeric">The number of entries to read.</param>
/// <param name="tokenScanner">The scanner supplying the tokens.</param>
/// <param name="builder">The builder which receives each mapping.</param>
/// <exception cref="InvalidOperationException">An entry is missing a token or contains a token of an unexpected type.</exception>
private static void ParseBaseFontCharacters(NumericToken numeric, ITokenScanner tokenScanner, CharacterMapBuilder builder)
{
    for (var i = 0; i < numeric.Int; i++)
    {
        // The input code must be a hex token and must be followed by at least one more token.
        // Short-circuiting ensures the scanner is not advanced a second time once the first check fails.
        if (!tokenScanner.MoveNext()
            || !(tokenScanner.CurrentToken is HexToken inputCode)
            || !tokenScanner.MoveNext())
        {
            throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
        }

        // The mapped value is given either by name or as hex bytes.
        switch (tokenScanner.CurrentToken)
        {
            case NameToken characterName:
                builder.AddBaseFontCharacter(inputCode.Bytes, characterName.Data.Name);
                break;
            case HexToken characterCode:
                builder.AddBaseFontCharacter(inputCode.Bytes, characterCode.Bytes);
                break;
            default:
                throw new InvalidOperationException($"Base font characters definition contains invalid item at index {i}: {tokenScanner.CurrentToken}");
        }
    }
}
|
||||
|
||||
private static void ParseName(NameToken nameToken, CoreTokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
|
||||
{
|
||||
switch (nameToken.Data.Name)
|
||||
|
@ -1,6 +1,6 @@
|
||||
namespace UglyToad.Pdf.Tokenization
|
||||
{
|
||||
using System.Text;
|
||||
using System.Collections.Generic;
|
||||
using IO;
|
||||
using Parser.Parts;
|
||||
using Tokens;
|
||||
@ -18,7 +18,7 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
var characters = new StringBuilder();
|
||||
var characters = new List<char>();
|
||||
|
||||
while (inputBytes.MoveNext())
|
||||
{
|
||||
@ -39,10 +39,10 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
characters.Append((char)current);
|
||||
characters.Add((char)current);
|
||||
}
|
||||
|
||||
token = new HexToken(characters.ToString());
|
||||
token = new HexToken(characters);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -1,30 +1,73 @@
|
||||
namespace UglyToad.Pdf.Tokenization.Tokens
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
|
||||
/// <summary>
/// A token created from a sequence of hexadecimal digits. Each pair of digits is converted to one byte;
/// the bytes are exposed both as a list and as a string with one character per byte.
/// </summary>
public class HexToken : IDataToken<string>
{
    /// <summary>
    /// Maps each hexadecimal digit, in upper or lower case, to its numeric value (0-15).
    /// </summary>
    private static readonly Dictionary<char, byte> HexMap = new Dictionary<char, byte>
    {
        {'0', 0x00 },
        {'1', 0x01 },
        {'2', 0x02 },
        {'3', 0x03 },
        {'4', 0x04 },
        {'5', 0x05 },
        {'6', 0x06 },
        {'7', 0x07 },
        {'8', 0x08 },
        {'9', 0x09 },

        {'A', 0x0A },
        {'a', 0x0A },
        {'B', 0x0B },
        {'b', 0x0B },
        {'C', 0x0C },
        {'c', 0x0C },
        {'D', 0x0D },
        {'d', 0x0D },
        {'E', 0x0E },
        {'e', 0x0E },
        {'F', 0x0F },
        {'f', 0x0F }
    };

    /// <summary>
    /// Combines two hexadecimal digits into a single byte.
    /// </summary>
    /// <param name="high">The digit providing the upper 4 bits.</param>
    /// <param name="low">The digit providing the lower 4 bits.</param>
    /// <returns>The byte represented by the digit pair.</returns>
    /// <exception cref="KeyNotFoundException">Either character is not a hexadecimal digit.</exception>
    private static byte Convert(char high, char low)
    {
        var highByte = HexMap[high];
        var lowByte = HexMap[low];

        return (byte)(highByte << 4 | lowByte);
    }

    /// <summary>
    /// The decoded bytes as a string, where each byte has been cast directly to a <see cref="char"/>.
    /// </summary>
    public string Data { get; }

    /// <summary>
    /// The bytes decoded from the hexadecimal digits, in the order they were supplied.
    /// </summary>
    public IReadOnlyList<byte> Bytes { get; }

    /// <summary>
    /// Creates a new <see cref="HexToken"/> from hexadecimal digits.
    /// </summary>
    /// <param name="characters">
    /// The hexadecimal digits. If the number of digits is odd the final digit is treated as though it were followed by '0'.
    /// </param>
    public HexToken(IReadOnlyList<char> characters)
    {
        // Two digits make one byte; round up so an unpaired final digit still has room.
        var byteCount = (characters.Count + 1) / 2;

        var bytes = new List<byte>(byteCount);
        var builder = new StringBuilder(byteCount);

        for (var i = 0; i < characters.Count; i += 2)
        {
            var high = characters[i];

            // Only the last digit of an odd-length sequence has no partner; pad it with a zero.
            var low = i == characters.Count - 1 ? '0' : characters[i + 1];

            var b = Convert(high, low);
            bytes.Add(b);
            builder.Append((char)b);
        }

        Bytes = bytes;
        Data = builder.ToString();
    }
}
|
||||
|
Loading…
Reference in New Issue
Block a user