adds per character byte mapping to truetype #98

this starts to add logic for per-character mapping of unicode characters to byte values for truetype fonts in the pdf document builder. in order to support unicode characters outside the 0-255 range when creating new pdf documents without using composite fonts, we need to map values outside this range into it. to do this we start at 1 and map each character we encounter to the next code, up to a maximum of 255. we provide a custom tounicode cmap in the font dictionary which maps these byte values, 0-255, back to unicode code points (short).

we also provide a custom firstchar, lastchar and widths array for the font mapping just the values we use.

since fonts no longer contain just the latin character set, the font descriptor enum is set to have the symbolic flag set. this means values will be looked up in either the mac-roman (1, 0) or windows-symbol (3, 0) cmap tables (these cmap tables are distinct from cmap tables in the pdf file) inside the actual truetype font bytes. this means the currently generated font file is invalid, because while the widths array and tounicode cmap return the correct values, the actual font itself returns whatever values were in those positions before the remapping occurred.

in order to fix this we will need to override the windows-symbol cmap contained in the underlying truetype font to match our mapping. this will be a lot of work and involve significant rewriting of the font file itself, in order to preserve checksum integrity.
This commit is contained in:
Eliot Jones 2019-12-28 17:47:50 +00:00
parent f67cce31b5
commit f319e7f4b5
11 changed files with 247 additions and 117 deletions

View File

@ -0,0 +1,50 @@
namespace UglyToad.PdfPig.Tests.Writer.Fonts
{
    using System.Collections.Generic;
    using PdfPig.Fonts.Parser;
    using PdfPig.IO;
    using PdfPig.Util;
    using PdfPig.Writer.Fonts;
    using Xunit;

    /// <summary>
    /// Round-trip tests for <see cref="ToUnicodeCMapBuilder"/>: the generated CMap bytes are fed
    /// back through <see cref="CMapParser"/> and the parsed result is compared with the input.
    /// </summary>
    public class ToUnicodeCMapBuilderTests
    {
        /// <summary>
        /// Builds a CMap from a handful of character-to-code mappings and checks that the parser
        /// reports a single one-byte codespace range covering 0 to 255 and one base font
        /// character entry per input mapping.
        /// </summary>
        [Fact]
        public void WritesValidCMap()
        {
            var unicodeToCode = new Dictionary<char, byte>
            {
                ['1'] = 1,
                ['='] = 2,
                ['H'] = 7,
                ['a'] = 12,
                ['2'] = 25
            };

            var cmapBytes = ToUnicodeCMapBuilder.ConvertToCMapStream(unicodeToCode);

            // The output must at least be representable as Latin-1 text.
            var text = OtherEncodings.BytesAsLatin1String(cmapBytes);
            Assert.NotNull(text);

            var parsed = new CMapParser().Parse(new ByteArrayInputBytes(cmapBytes), false);

            // Exactly one codespace range, one byte wide, spanning every byte value.
            Assert.Equal(1, parsed.CodespaceRanges.Count);

            var codespace = parsed.CodespaceRanges[0];

            Assert.Equal(1, codespace.CodeLength);
            Assert.Equal(0, codespace.StartInt);
            Assert.Equal(byte.MaxValue, codespace.EndInt);

            Assert.Equal(unicodeToCode.Count, parsed.BaseFontCharacterMap.Count);

            // Each parsed entry maps a code to a string; its first character must map back
            // to that same code in the input dictionary.
            foreach (var entry in parsed.BaseFontCharacterMap)
            {
                var expectedCode = unicodeToCode[entry.Value[0]];

                Assert.Equal(expectedCode, entry.Key);
            }
        }
    }
}

View File

@ -272,7 +272,7 @@
}
}
[Fact]
public void CanWriteSinglePageWithCzechCharacters()
{
var builder = new PdfDocumentBuilder();

View File

@ -32,7 +32,7 @@
/// Font uses a (sub)set of the Adobe standard Latin set.
/// </summary>
/// <remarks>Cannot be set at the same time as <see cref="Symbolic"/>.</remarks>
Nonsymbolic = 1 << 5,
NonSymbolic = 1 << 5,
/// <summary>
/// Font is italic.
/// </summary>

View File

@ -9,10 +9,14 @@
private static readonly byte WhiteSpace = OtherEncodings.StringAsLatin1Bytes(" ")[0];
private static readonly byte NewLine = OtherEncodings.StringAsLatin1Bytes("\n")[0];
public static void WriteText(this Stream stream, string text)
public static void WriteText(this Stream stream, string text, bool appendWhitespace = false)
{
var bytes = OtherEncodings.StringAsLatin1Bytes(text);
stream.Write(bytes, 0, bytes.Length);
if (appendWhitespace)
{
stream.WriteWhiteSpace();
}
}
public static void WriteHex(this Stream stream, byte[] bytes)

View File

@ -99,6 +99,7 @@
public static readonly NameToken CidFontType0 = new NameToken("CIDFontType0");
public static readonly NameToken CidFontType0C = new NameToken("CIDFontType0C");
public static readonly NameToken CidFontType2 = new NameToken("CIDFontType2");
public static readonly NameToken CidInit = new NameToken("CIDInit");
public static readonly NameToken CidToGidMap = new NameToken("CIDToGIDMap");
public static readonly NameToken CidSet = new NameToken("CIDSet");
public static readonly NameToken CidSystemInfo = new NameToken("CIDSystemInfo");
@ -108,6 +109,7 @@
public static readonly NameToken ClrFf = new NameToken("ClrFf");
public static readonly NameToken Cmap = new NameToken("CMap");
public static readonly NameToken Cmapname = new NameToken("CMapName");
public static readonly NameToken CmapType = new NameToken("CMapType");
public static readonly NameToken Cmyk = new NameToken("CMYK");
public static readonly NameToken Co = new NameToken("CO");
public static readonly NameToken ColorBurn = new NameToken("ColorBurn");

View File

@ -1,4 +1,4 @@
namespace UglyToad.PdfPig.Writer
namespace UglyToad.PdfPig.Writer.Fonts
{
using System.IO;
using Core;

View File

@ -1,13 +1,13 @@
namespace UglyToad.PdfPig.Writer
namespace UglyToad.PdfPig.Writer.Fonts
{
using System;
using System.Collections.Generic;
using System.IO;
using Core;
using Fonts;
using Fonts.Encodings;
using Geometry;
using Tokens;
using UglyToad.PdfPig.Fonts;
using UglyToad.PdfPig.Fonts.Encodings;
internal class Standard14WritingFont : IWritingFont
{

View File

@ -0,0 +1,114 @@
namespace UglyToad.PdfPig.Writer.Fonts
{
    using System.Collections.Generic;
    using System.IO;
    using Graphics.Operations;
    using Tokens;
    using Util;

    /// <summary>
    /// Writes a ToUnicode CMap program which maps single byte character codes back to
    /// the unicode characters they were assigned for.
    /// </summary>
    internal static class ToUnicodeCMapBuilder
    {
        private const string BeginToken = "begin";
        private const string BeginCMapToken = "begincmap";
        private const string DefToken = "def";
        private const string DictToken = "dict";
        private const string FindResourceToken = "findresource";

        /// <summary>
        /// Adobe's CMap specification allows at most 100 entries in a single
        /// beginbfchar/endbfchar block, larger mappings must be split over several blocks.
        /// </summary>
        private const int MaximumEntriesPerBlock = 100;

        /// <summary>
        /// Generates the (uncompressed) bytes of a ToUnicode CMap for the provided mapping.
        /// </summary>
        /// <param name="unicodeToCharacterCode">
        /// The single byte character code assigned to each unicode character. Every entry is written as
        /// a bfchar mapping from the one byte code to the two byte (big-endian) value of the character.
        /// </param>
        /// <returns>The bytes of the CMap program.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="unicodeToCharacterCode"/> is null.</exception>
        public static IReadOnlyList<byte> ConvertToCMapStream(IReadOnlyDictionary<char, byte> unicodeToCharacterCode)
        {
            if (unicodeToCharacterCode == null)
            {
                throw new System.ArgumentNullException(nameof(unicodeToCharacterCode));
            }

            using (var memoryStream = new MemoryStream())
            {
                // Header: /CIDInit /ProcSet findresource begin
                TokenWriter.WriteToken(NameToken.CidInit, memoryStream);
                TokenWriter.WriteToken(NameToken.ProcSet, memoryStream);
                memoryStream.WriteText(FindResourceToken, true);
                memoryStream.WriteText(BeginToken);
                memoryStream.WriteNewLine();

                // 12 dict begin
                memoryStream.WriteDecimal(12);
                memoryStream.WriteWhiteSpace();
                memoryStream.WriteText(DictToken, true);
                memoryStream.WriteText(BeginToken);
                memoryStream.WriteNewLine();

                memoryStream.WriteText(BeginCMapToken);
                memoryStream.WriteNewLine();

                // /CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def
                TokenWriter.WriteToken(NameToken.CidSystemInfo, memoryStream);

                var dictionary = new DictionaryToken(new Dictionary<NameToken, IToken>
                {
                    { NameToken.Registry, new StringToken("Adobe") },
                    { NameToken.Ordering, new StringToken("UCS") },
                    { NameToken.Supplement, new NumericToken(0) }
                });

                TokenWriter.WriteToken(dictionary, memoryStream);
                memoryStream.WriteWhiteSpace();
                memoryStream.WriteText(DefToken);
                memoryStream.WriteNewLine();

                // /CMapName /Adobe-Identity-UCS def
                TokenWriter.WriteToken(NameToken.Cmapname, memoryStream);
                TokenWriter.WriteToken(NameToken.Create("Adobe-Identity-UCS"), memoryStream);
                memoryStream.WriteText(DefToken);
                memoryStream.WriteNewLine();

                // /CMapType 2 def
                TokenWriter.WriteToken(NameToken.CmapType, memoryStream);
                memoryStream.WriteNumberText(2, DefToken);

                // A single one byte codespace range covering <00> to <FF>.
                memoryStream.WriteNumberText(1, "begincodespacerange");
                TokenWriter.WriteToken(new HexToken(new[] {'0', '0'}), memoryStream);
                TokenWriter.WriteToken(new HexToken(new[] {'F', 'F'}), memoryStream);
                memoryStream.WriteNewLine();
                memoryStream.WriteText("endcodespacerange");
                memoryStream.WriteNewLine();

                WriteBfCharBlocks(unicodeToCharacterCode, memoryStream);

                memoryStream.WriteText("endcmap");
                memoryStream.WriteNewLine();

                memoryStream.WriteText("CMapName currentdict /CMap defineresource pop");
                memoryStream.WriteNewLine();

                // Close the two dictionaries opened by the "begin" operators above.
                memoryStream.WriteText("end");
                memoryStream.WriteNewLine();
                memoryStream.WriteText("end");
                memoryStream.WriteNewLine();

                return memoryStream.ToArray();
            }
        }

        /// <summary>
        /// Writes the character code to unicode mappings as one or more beginbfchar/endbfchar blocks,
        /// each containing no more than <see cref="MaximumEntriesPerBlock"/> entries.
        /// </summary>
        private static void WriteBfCharBlocks(IReadOnlyDictionary<char, byte> unicodeToCharacterCode, MemoryStream memoryStream)
        {
            var entries = new List<KeyValuePair<char, byte>>(unicodeToCharacterCode);

            var offset = 0;

            // do/while so that an empty mapping still produces a single (empty) block,
            // keeping the output unchanged for any mapping of 100 entries or fewer.
            do
            {
                var remaining = entries.Count - offset;
                var blockSize = remaining > MaximumEntriesPerBlock ? MaximumEntriesPerBlock : remaining;

                memoryStream.WriteNumberText(blockSize, "beginbfchar");

                for (var i = offset; i < offset + blockSize; i++)
                {
                    var keyValuePair = entries[i];

                    // Split the UTF-16 code unit into its high and low bytes, written high byte first.
                    var unicodeInt = (ushort) keyValuePair.Key;
                    var low = (byte) (unicodeInt >> 0);
                    var high = (byte) (unicodeInt >> 8);

                    var from = Hex.GetString(new[] {keyValuePair.Value});
                    var to = Hex.GetString(new[] {high, low});

                    TokenWriter.WriteToken(new HexToken(from.ToCharArray()), memoryStream);
                    TokenWriter.WriteToken(new HexToken(to.ToCharArray()), memoryStream);

                    memoryStream.WriteNewLine();
                }

                memoryStream.WriteText("endbfchar");
                memoryStream.WriteNewLine();

                offset += blockSize;
            } while (offset < entries.Count);
        }
    }
}

View File

@ -1,4 +1,4 @@
namespace UglyToad.PdfPig.Writer
namespace UglyToad.PdfPig.Writer.Fonts
{
using System;
using System.Collections.Generic;
@ -6,20 +6,23 @@
using System.Linq;
using Core;
using Filters;
using Fonts;
using Fonts.Encodings;
using Fonts.Exceptions;
using Fonts.TrueType;
using Fonts.TrueType.Tables;
using Geometry;
using Logging;
using Tokens;
using UglyToad.PdfPig.Fonts;
using UglyToad.PdfPig.Fonts.Exceptions;
using UglyToad.PdfPig.Fonts.TrueType;
using UglyToad.PdfPig.Fonts.TrueType.Tables;
internal class TrueTypeWritingFont : IWritingFont
{
private readonly TrueTypeFontProgram font;
private readonly IReadOnlyList<byte> fontFileBytes;
private readonly object mappingLock = new object();
private readonly Dictionary<char, byte> characterMapping = new Dictionary<char, byte>();
private int characterMappingCounter = 1;
public bool HasWidths { get; } = true;
public string Name => font.Name;
@ -48,7 +51,10 @@
public ObjectToken WriteFont(NameToken fontKeyName, Stream outputStream, BuilderContext context)
{
var bytes = CompressBytes();
// TODO: unfortunately we need to subset the font in order to support custom encoding.
// A symbolic font (one which contains characters not in the standard latin set) -
// should contain a MacRoman (1, 0) or Windows Symbolic (3,0) cmap subtable which maps character codes to glyph id.
var bytes = CompressBytes(fontFileBytes);
var embeddedFile = new StreamToken(new DictionaryToken(new Dictionary<NameToken, IToken>
{
{ NameToken.Length, new NumericToken(bytes.Length) },
@ -59,9 +65,7 @@
var fileRef = context.WriteObject(outputStream, embeddedFile);
var baseFont = NameToken.Create(font.TableRegister.NameTable.GetPostscriptName());
var charCodeToGlyphId = new CharacterCodeToGlyphIdMapper(font);
var postscript = font.TableRegister.PostScriptTable;
var hhead = font.TableRegister.HorizontalHeaderTable;
@ -97,11 +101,31 @@
descriptorDictionary[NameToken.StemV] = new NumericToken(((decimal)bbox.Width) * scaling * 0.13m);
var metrics = charCodeToGlyphId.GetMetrics(scaling);
var lastCharacter = 0;
var widths = new List<NumericToken> { NumericToken.Zero };
foreach (var kvp in characterMapping)
{
if (kvp.Value > lastCharacter)
{
lastCharacter = kvp.Value;
}
var widthsRef = context.WriteObject(outputStream, metrics.Widths);
var glyphId = font.WindowsUnicodeCMap.CharacterCodeToGlyphIndex(kvp.Key);
var width = font.TableRegister.HorizontalMetricsTable.GetAdvanceWidth(glyphId) * scaling;
widths.Add(new NumericToken(width));
}
var descriptor = context.WriteObject(outputStream, new DictionaryToken(descriptorDictionary));
var toUnicodeCMap = ToUnicodeCMapBuilder.ConvertToCMapStream(characterMapping);
var compressedToUnicodeCMap = CompressBytes(toUnicodeCMap);
var toUnicode = context.WriteObject(outputStream, new StreamToken(new DictionaryToken(new Dictionary<NameToken, IToken>()
{
{NameToken.Length, new NumericToken(compressedToUnicodeCMap.Length)},
{NameToken.Length1, new NumericToken(toUnicodeCMap.Count)},
{NameToken.Filter, new ArrayToken(new[] {NameToken.FlateDecode})}
}), compressedToUnicodeCMap));
var dictionary = new Dictionary<NameToken, IToken>
{
@ -109,10 +133,10 @@
{ NameToken.Subtype, NameToken.TrueType },
{ NameToken.BaseFont, baseFont },
{ NameToken.FontDescriptor, new IndirectReferenceToken(descriptor.Number) },
{ NameToken.FirstChar, metrics.FirstChar },
{ NameToken.LastChar, metrics.LastChar },
{ NameToken.Widths, new IndirectReferenceToken(widthsRef.Number) },
{ NameToken.Encoding, NameToken.WinAnsiEncoding }
{ NameToken.FirstChar, new NumericToken(0) },
{ NameToken.LastChar, new NumericToken(lastCharacter) },
{ NameToken.Widths, new ArrayToken(widths) },
{NameToken.ToUnicode, new IndirectReferenceToken(toUnicode.Number) }
};
var token = new DictionaryToken(dictionary);
@ -124,17 +148,38 @@
public byte GetValueForCharacter(char character)
{
return (byte) character;
lock (mappingLock)
{
if (characterMapping.TryGetValue(character, out var result))
{
return result;
}
if (characterMappingCounter > byte.MaxValue)
{
throw new NotSupportedException("Cannot support more than 255 separate characters in a simple TrueType font, please" +
" submit an issue since we will need to add support for composite fonts with multi-byte" +
" character identifiers.");
}
var value = (byte) characterMappingCounter++;
characterMapping[character] = value;
result = value;
return result;
}
}
private byte[] CompressBytes()
private static byte[] CompressBytes(IReadOnlyList<byte> bytes)
{
using (var memoryStream = new MemoryStream(fontFileBytes.ToArray()))
using (var memoryStream = new MemoryStream(bytes.ToArray()))
{
var parameters = new DictionaryToken(new Dictionary<NameToken, IToken>());
var flater = new FlateFilter(new DecodeParameterResolver(new NoOpLog()), new PngPredictor(), new NoOpLog());
var bytes = flater.Encode(memoryStream, parameters, 0);
return bytes;
var result = flater.Encode(memoryStream, parameters, 0);
return result;
}
}
@ -148,90 +193,5 @@
new NumericToken((decimal)boundingBox.Top * scaling)
});
}
private class CharacterCodeToGlyphIdMapper
{
private readonly TrueTypeFontProgram font;
public CharacterCodeToGlyphIdMapper(TrueTypeFontProgram font)
{
this.font = font ?? throw new ArgumentNullException(nameof(font));
}
public FontDictionaryMetrics GetMetrics(decimal scaling)
{
// TODO: differences array
var encoding = WinAnsiEncoding.Instance;
var firstCharacter = encoding.CodeToNameMap.Keys.Min();
var lastCharacter = encoding.CodeToNameMap.Keys.Max();
var glyphList = GlyphList.AdobeGlyphList;
var length = lastCharacter - firstCharacter + 1;
var widths = Enumerable.Range(0, length).Select(x => new NumericToken(0)).ToList();
foreach (var pair in encoding.CodeToNameMap)
{
var unicode = glyphList.NameToUnicode(pair.Value);
if (unicode == null)
{
continue;
}
var characterCode = (int) unicode[0];
if (characterCode < firstCharacter || characterCode > lastCharacter)
{
continue;
}
if (!font.TryGetBoundingAdvancedWidth(characterCode, out var width))
{
width = font.TableRegister.HorizontalMetricsTable.HorizontalMetrics[0].AdvanceWidth;
}
widths[pair.Key - firstCharacter] = new NumericToken((decimal)width * scaling);
}
return new FontDictionaryMetrics
{
FirstChar = new NumericToken(firstCharacter),
LastChar = new NumericToken(lastCharacter),
Widths = new ArrayToken(widths)
};
}
private Encoding ReadFontEncoding()
{
var codeToName = new Dictionary<int, string>();
var postscript = font.TableRegister.PostScriptTable;
for (var i = 0; i <= 256; i++)
{
if (!font.TableRegister.CMapTable.TryGetGlyphIndex(i, out var glyphIndex))
{
continue;
}
var name = postscript.GlyphNames[glyphIndex];
if (GlyphList.AdobeGlyphList.NameToUnicode(name) == null)
{
continue;
}
codeToName[i] = name;
}
return new BuiltInEncoding(codeToName);
}
}
private class FontDictionaryMetrics
{
public ArrayToken Widths { get; set; }
public NumericToken FirstChar { get; set; }
public NumericToken LastChar { get; set; }
}
}
}

View File

@ -6,11 +6,12 @@
using System.Linq;
using Content;
using Fonts;
using Fonts.TrueType;
using Fonts.TrueType.Parser;
using PdfPig.Fonts.TrueType;
using PdfPig.Fonts.TrueType.Parser;
using Geometry;
using Graphics.Operations;
using IO;
using PdfPig.Fonts;
using Tokens;
using Util;
using Util.JetBrains.Annotations;

View File

@ -4,6 +4,7 @@
using System.Collections.Generic;
using Content;
using Core;
using Fonts;
using Geometry;
using Graphics.Colors;
using Graphics.Operations;
@ -14,8 +15,6 @@
using Graphics.Operations.TextPositioning;
using Graphics.Operations.TextShowing;
using Graphics.Operations.TextState;
using Tokens;
using Util;
/// <summary>
/// A builder used to construct a page in a PDF document.