implement glyph list for mapping from character code to name to unicode

This commit is contained in:
Eliot Jones
2018-01-01 17:23:32 +00:00
parent c34bdac92a
commit d7b9a9d559
10 changed files with 5002 additions and 4 deletions

View File

@@ -0,0 +1,18 @@
namespace UglyToad.Pdf.Tests.Fonts.Encodings
{
using Pdf.Fonts.Encodings;
using Xunit;
/// <summary>
/// Tests for <see cref="GlyphListFactory"/>.
/// </summary>
public class GlyphListFactoryTests
{
    /// <summary>
    /// Loads the list named "glyphlist" through the factory and checks that the
    /// glyph name "H" resolves to the string "H".
    /// </summary>
    [Fact]
    public void CanGetAdobeGlyphList()
    {
        var glyphList = GlyphListFactory.Get("glyphlist");

        var unicode = glyphList.NameToUnicode("H");

        Assert.Equal("H", unicode);
    }
}
}

View File

@@ -0,0 +1,36 @@
namespace UglyToad.Pdf.Tests.Fonts.Encodings
{
using Pdf.Fonts.Encodings;
using Xunit;
/// <summary>
/// Tests for the shared <see cref="GlyphList"/> instances exposed as static properties.
/// </summary>
public class GlyphListTests
{
    /// <summary>
    /// The Adobe glyph list resolves the name "Acute" to U+F6C9.
    /// </summary>
    [Fact]
    public void CanLoadAdobeGlyphList()
    {
        var unicode = GlyphList.AdobeGlyphList.NameToUnicode("Acute");

        Assert.Equal("\uF6C9", unicode);
    }

    /// <summary>
    /// The Zapf Dingbats glyph list resolves the name "a69" to U+274A.
    /// </summary>
    [Fact]
    public void CanLoadZapfDingbatsGlyphList()
    {
        var unicode = GlyphList.ZapfDingbats.NameToUnicode("a69");

        Assert.Equal("\u274A", unicode);
    }

    /// <summary>
    /// The reverse lookup maps code point 79 back to the glyph name "O".
    /// </summary>
    [Fact]
    public void UnicodeToNameWorks()
    {
        var adobeGlyphList = GlyphList.AdobeGlyphList;

        var glyphName = adobeGlyphList.UnicodeCodePointToName(79);

        Assert.Equal("O", glyphName);
    }
}
}

View File

@@ -0,0 +1,129 @@
namespace UglyToad.Pdf.Fonts.Encodings
{
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using Exceptions;
/// <summary>
/// A two-way mapping between glyph (character) names and Unicode strings,
/// built from a name-to-unicode table such as the ones produced by <see cref="GlyphListFactory"/>.
/// </summary>
internal class GlyphList
{
    /// <summary>
    /// The name returned by <see cref="UnicodeCodePointToName"/> when no glyph name is known.
    /// </summary>
    private const string NotDefined = ".notdef";

    private readonly IReadOnlyDictionary<string, string> nameToUnicode;

    private readonly IReadOnlyDictionary<string, string> unicodeToName;

    // Remembers the results for names which are not in the table but were resolved by the
    // fallback rules in NameToUnicode (suffix removal, uniXXXX, uXXXX).
    // NOTE(review): this is a plain Dictionary written from NameToUnicode and the static
    // instances below are shared - confirm callers never use it from several threads at once.
    private readonly Dictionary<string, string> oddNameToUnicodeCache = new Dictionary<string, string>();

    private static readonly Lazy<GlyphList> LazyAdobeGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("glyphlist"));

    /// <summary>
    /// The glyph list loaded (on first use) from the embedded "glyphlist" resource.
    /// </summary>
    public static GlyphList AdobeGlyphList => LazyAdobeGlyphList.Value;

    private static readonly Lazy<GlyphList> LazyZapfDingbatsGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("zapfdingbats"));

    /// <summary>
    /// The glyph list loaded (on first use) from the embedded "zapfdingbats" resource.
    /// </summary>
    public static GlyphList ZapfDingbats => LazyZapfDingbatsGlyphList.Value;

    /// <summary>
    /// Creates a glyph list from a name-to-unicode table and derives the reverse table from it.
    /// </summary>
    /// <param name="namesToUnicode">Maps each glyph name to its Unicode string.</param>
    /// <exception cref="ArgumentNullException"><paramref name="namesToUnicode"/> is null.</exception>
    public GlyphList(IReadOnlyDictionary<string, string> namesToUnicode)
    {
        nameToUnicode = namesToUnicode ?? throw new ArgumentNullException(nameof(namesToUnicode));

        var unicodeToNameTemp = new Dictionary<string, string>();

        foreach (var pair in namesToUnicode)
        {
            var glyphName = pair.Key;

            // Several names can share one Unicode value. The first name seen is kept unless a
            // later name is known to one of the predefined encodings, which then takes precedence.
            var forceOverride =
                WinAnsiEncoding.Instance.ContainsName(glyphName) ||
                MacRomanEncoding.Instance.ContainsName(glyphName) ||
                MacExpertEncoding.Instance.ContainsName(glyphName) ||
                SymbolEncoding.Instance.ContainsName(glyphName) ||
                ZapfDingbatsEncoding.Instance.ContainsName(glyphName);

            if (forceOverride || !unicodeToNameTemp.ContainsKey(pair.Value))
            {
                unicodeToNameTemp[pair.Value] = glyphName;
            }
        }

        unicodeToName = unicodeToNameTemp;
    }

    /// <summary>
    /// Gets the glyph name for a Unicode code point.
    /// </summary>
    /// <param name="unicodeValue">The Unicode code point to look up.</param>
    /// <returns>The glyph name, or ".notdef" if the code point has no name in this list.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// Thrown by <see cref="char.ConvertFromUtf32"/> if <paramref name="unicodeValue"/> is not a valid code point.
    /// </exception>
    public string UnicodeCodePointToName(int unicodeValue)
    {
        var value = char.ConvertFromUtf32(unicodeValue);

        return unicodeToName.TryGetValue(value, out var result) ? result : NotDefined;
    }

    /// <summary>
    /// Gets the Unicode string for a glyph name.
    /// </summary>
    /// <remarks>
    /// Names missing from the table are resolved, in this order, by: removing everything from the
    /// first '.' onwards and looking up the remainder; reading the name as "uniXXXX"; reading the
    /// name as "uXXXX", where XXXX are four hexadecimal digits.
    /// </remarks>
    /// <param name="name">The glyph name.</param>
    /// <returns>The Unicode string for the name, or null if <paramref name="name"/> is null.</returns>
    /// <exception cref="InvalidFontFormatException">
    /// The name could not be resolved, or it encodes a code point in the surrogate range (D800 to DFFF).
    /// </exception>
    public string NameToUnicode(string name)
    {
        if (name == null)
        {
            return null;
        }

        if (nameToUnicode.TryGetValue(name, out var unicodeValue))
        {
            return unicodeValue;
        }

        if (oddNameToUnicodeCache.TryGetValue(name, out var cached))
        {
            return cached;
        }

        string unicode;

        var suffixStart = name.IndexOf('.');

        if (suffixStart > 0)
        {
            // Remove suffixes such as ".alt" and resolve the base name.
            unicode = NameToUnicode(name.Substring(0, suffixStart));
        }
        else if (name.Length == 7
                 && name.StartsWith("uni", StringComparison.Ordinal)
                 && TryParseHex(name.Substring(3, 4), out var uniCodePoint))
        {
            // Unicode name in the format uniXXXX where X is hex.
            // Substring takes (start, length) so the four digits are Substring(3, 4).
            ThrowIfSurrogate(uniCodePoint, name);
            unicode = ((char)uniCodePoint).ToString();
        }
        else if (name.Length == 5
                 && name.StartsWith("u", StringComparison.Ordinal)
                 && TryParseHex(name.Substring(1), out var uCodePoint))
        {
            // Alternate Unicode name representation uXXXX.
            ThrowIfSurrogate(uCodePoint, name);
            unicode = char.ConvertFromUtf32(uCodePoint);
        }
        else
        {
            throw new InvalidFontFormatException($"Could not find the unicode glyph for the name {name}.");
        }

        oddNameToUnicodeCache[name] = unicode;

        return unicode;
    }

    /// <summary>
    /// Parses a string made up only of hexadecimal digits, without throwing for other input.
    /// </summary>
    private static bool TryParseHex(string digits, out int codePoint)
    {
        return int.TryParse(digits, NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture, out codePoint);
    }

    /// <summary>
    /// Rejects code points in the surrogate range, which cannot stand alone as a character.
    /// </summary>
    private static void ThrowIfSurrogate(int codePoint, string name)
    {
        if (codePoint > 0xD7FF && codePoint < 0xE000)
        {
            throw new InvalidFontFormatException(
                $"Unicode character name with disallowed code area: {name}");
        }
    }
}
}

View File

@@ -0,0 +1,65 @@
namespace UglyToad.Pdf.Fonts.Encodings
{
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
/// <summary>
/// Builds <see cref="GlyphList"/> instances from the glyph list files embedded in this assembly.
/// </summary>
internal class GlyphListFactory
{
    private static readonly char[] FieldSeparator = { ';' };

    private static readonly char[] CodePointSeparator = { ' ' };

    /// <summary>
    /// Reads the embedded resource "UglyToad.Pdf.Resources.GlyphList.{listName}" and parses it into a glyph list.
    /// </summary>
    /// <remarks>
    /// Blank lines and lines starting with '#' are skipped. Every other line must be
    /// "name;codes" where codes is one or more space separated hexadecimal code points.
    /// If a name occurs more than once the last occurrence is used.
    /// </remarks>
    /// <param name="listName">The name of the embedded glyph list, for example "glyphlist".</param>
    /// <returns>The glyph list containing every mapping in the resource.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="listName"/> is null.</exception>
    /// <exception cref="ArgumentException">No embedded resource exists for <paramref name="listName"/>.</exception>
    /// <exception cref="InvalidOperationException">A line does not have the "name;codes" format.</exception>
    public static GlyphList Get(string listName)
    {
        if (listName == null)
        {
            throw new ArgumentNullException(nameof(listName));
        }

        var result = new Dictionary<string, string>();

        using (var resource = typeof(GlyphListFactory).Assembly.GetManifestResourceStream($"UglyToad.Pdf.Resources.GlyphList.{listName}"))
        {
            if (resource == null)
            {
                throw new ArgumentException($"No embedded glyph list resource was found with the name {listName}.", nameof(listName));
            }

            using (var reader = new StreamReader(resource))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    if (string.IsNullOrWhiteSpace(line) || line[0] == '#')
                    {
                        continue;
                    }

                    var parts = line.Split(FieldSeparator);

                    if (parts.Length != 2)
                    {
                        throw new InvalidOperationException(
                            $"The line in the glyph list did not match the expected format. Line was: {line}");
                    }

                    // Dropping empty entries tolerates trailing or repeated spaces between code points.
                    var codes = parts[1].Split(CodePointSeparator, StringSplitOptions.RemoveEmptyEntries);

                    if (codes.Length == 0)
                    {
                        throw new InvalidOperationException(
                            $"The line in the glyph list did not match the expected format. Line was: {line}");
                    }

                    var characters = new string[codes.Length];

                    for (var i = 0; i < codes.Length; i++)
                    {
                        var code = int.Parse(codes[i], NumberStyles.HexNumber, CultureInfo.InvariantCulture);
                        characters[i] = char.ConvertFromUtf32(code);
                    }

                    result[parts[0]] = string.Concat(characters);
                }
            }
        }

        return new GlyphList(result);
    }
}
}

View File

@@ -89,7 +89,7 @@
}
}
return new TrueTypeSimpleFont(name, firstCharacter, lastCharacter, widths, descriptor, toUnicodeCMap);
return new TrueTypeSimpleFont(name, firstCharacter, lastCharacter, widths, descriptor, toUnicodeCMap, encoding);
}
private static int GetFirstCharacter(PdfDictionary dictionary)

View File

@@ -4,6 +4,7 @@
using Cmap;
using Composite;
using Cos;
using Encodings;
using Geometry;
using IO;
using Util.JetBrains.Annotations;
@@ -14,6 +15,8 @@
private readonly int lastCharacterCode;
private readonly decimal[] widths;
private readonly FontDescriptor descriptor;
[CanBeNull]
private readonly Encoding encoding;
public CosName Name { get; }
@@ -24,12 +27,14 @@
public TrueTypeSimpleFont(CosName name, int firstCharacterCode, int lastCharacterCode, decimal[] widths,
FontDescriptor descriptor,
[CanBeNull]CMap toUnicodeCMap)
[CanBeNull]CMap toUnicodeCMap,
[CanBeNull]Encoding encoding)
{
this.firstCharacterCode = firstCharacterCode;
this.lastCharacterCode = lastCharacterCode;
this.widths = widths;
this.descriptor = descriptor;
this.encoding = encoding;
Name = name;
IsVertical = false;
@@ -46,12 +51,30 @@
{
value = null;
// Behaviour specified by the Extraction of Text Content section of the specification.
// If the font contains a ToUnicode CMap use that.
if (ToUnicode.CanMapToUnicode)
{
return ToUnicode.TryGet(characterCode, out value);
}
// If the font is a simple font that uses one of the predefined encodings MacRomanEncoding, MacExpertEncoding, or WinAnsiEncoding...
// Map the character code to a character name.
var encodedCharacterName = encoding.GetName(characterCode);
// Look up the character name in the Adobe Glyph List.
if (!ToUnicode.CanMapToUnicode)
{
// For now just cast to character
try
{
if (encoding != null)
{
}
value = ((char) characterCode).ToString();
return true;

View File

@@ -0,0 +1,146 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Format: Semicolon-delimited fields:
# (1) glyph name
# (2) Unicode scalar value
#
# These mappings are missing in glyphlist.txt
#
angbracketleft;3008
angbracketright;3009
circlecopyrt;00A9
controlNULL;0000
#
# TeX-related mappings using named values
#
angbracketleftbig;2329
angbracketleftBig;2329
angbracketleftbigg;2329
angbracketleftBigg;2329
angbracketrightBig;232A
angbracketrightbig;232A
angbracketrightBigg;232A
angbracketrightbigg;232A
arrowhookleft;21AA
arrowhookright;21A9
arrowlefttophalf;21BC
arrowleftbothalf;21BD
arrownortheast;2197
arrownorthwest;2196
arrowrighttophalf;21C0
arrowrightbothalf;21C1
arrowsoutheast;2198
arrowsouthwest;2199
backslashbig;2216
backslashBig;2216
backslashBigg;2216
backslashbigg;2216
bardbl;2016
bracehtipdownleft;FE37
bracehtipdownright;FE37
bracehtipupleft;FE38
bracehtipupright;FE38
braceleftBig;007B
braceleftbig;007B
braceleftbigg;007B
braceleftBigg;007B
bracerightBig;007D
bracerightbig;007D
bracerightbigg;007D
bracerightBigg;007D
bracketleftbig;005B
bracketleftBig;005B
bracketleftbigg;005B
bracketleftBigg;005B
bracketrightBig;005D
bracketrightbig;005D
bracketrightbigg;005D
bracketrightBigg;005D
ceilingleftbig;2308
ceilingleftBig;2308
ceilingleftBigg;2308
ceilingleftbigg;2308
ceilingrightbig;2309
ceilingrightBig;2309
ceilingrightbigg;2309
ceilingrightBigg;2309
circledotdisplay;2299
circledottext;2299
circlemultiplydisplay;2297
circlemultiplytext;2297
circleplusdisplay;2295
circleplustext;2295
contintegraldisplay;222E
contintegraltext;222E
coproductdisplay;2210
coproducttext;2210
floorleftBig;230A
floorleftbig;230A
floorleftbigg;230A
floorleftBigg;230A
floorrightbig;230B
floorrightBig;230B
floorrightBigg;230B
floorrightbigg;230B
hatwide;0302
hatwider;0302
hatwidest;0302
intercal;1D40
integraldisplay;222B
integraltext;222B
intersectiondisplay;22C2
intersectiontext;22C2
logicalanddisplay;2227
logicalandtext;2227
logicalordisplay;2228
logicalortext;2228
parenleftBig;0028
parenleftbig;0028
parenleftBigg;0028
parenleftbigg;0028
parenrightBig;0029
parenrightbig;0029
parenrightBigg;0029
parenrightbigg;0029
prime;2032
productdisplay;220F
producttext;220F
radicalbig;221A
radicalBig;221A
radicalBigg;221A
radicalbigg;221A
radicalbt;221A
radicaltp;221A
radicalvertex;221A
slashbig;002F
slashBig;002F
slashBigg;002F
slashbigg;002F
summationdisplay;2211
summationtext;2211
tildewide;02DC
tildewider;02DC
tildewidest;02DC
uniondisplay;22C3
unionmultidisplay;228E
unionmultitext;228E
unionsqdisplay;2294
unionsqtext;2294
uniontext;22C3
vextenddouble;2225
vextendsingle;2223
#END

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,248 @@
# -----------------------------------------------------------
# Copyright 2002, 2010 Adobe Systems Incorporated.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the
# following conditions are met:
#
# Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer.
#
# Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# Neither the name of Adobe Systems Incorporated nor the names
# of its contributors may be used to endorse or promote
# products derived from this software without specific prior
# written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------
# Name: ITC Zapf Dingbats Glyph List
# Table version: 2.0
# Date: September 20, 2002
# URL: http://sourceforge.net/adobe/aglfn/
#
# Format: two semicolon-delimited fields:
# (1) glyph name--upper/lowercase letters and digits
# (2) Unicode scalar value--four uppercase hexadecimal digits
#
a100;275E
a101;2761
a102;2762
a103;2763
a104;2764
a105;2710
a106;2765
a107;2766
a108;2767
a109;2660
a10;2721
a110;2665
a111;2666
a112;2663
a117;2709
a118;2708
a119;2707
a11;261B
a120;2460
a121;2461
a122;2462
a123;2463
a124;2464
a125;2465
a126;2466
a127;2467
a128;2468
a129;2469
a12;261E
a130;2776
a131;2777
a132;2778
a133;2779
a134;277A
a135;277B
a136;277C
a137;277D
a138;277E
a139;277F
a13;270C
a140;2780
a141;2781
a142;2782
a143;2783
a144;2784
a145;2785
a146;2786
a147;2787
a148;2788
a149;2789
a14;270D
a150;278A
a151;278B
a152;278C
a153;278D
a154;278E
a155;278F
a156;2790
a157;2791
a158;2792
a159;2793
a15;270E
a160;2794
a161;2192
a162;27A3
a163;2194
a164;2195
a165;2799
a166;279B
a167;279C
a168;279D
a169;279E
a16;270F
a170;279F
a171;27A0
a172;27A1
a173;27A2
a174;27A4
a175;27A5
a176;27A6
a177;27A7
a178;27A8
a179;27A9
a17;2711
a180;27AB
a181;27AD
a182;27AF
a183;27B2
a184;27B3
a185;27B5
a186;27B8
a187;27BA
a188;27BB
a189;27BC
a18;2712
a190;27BD
a191;27BE
a192;279A
a193;27AA
a194;27B6
a195;27B9
a196;2798
a197;27B4
a198;27B7
a199;27AC
a19;2713
a1;2701
a200;27AE
a201;27B1
a202;2703
a203;2750
a204;2752
a205;276E
a206;2770
a20;2714
a21;2715
a22;2716
a23;2717
a24;2718
a25;2719
a26;271A
a27;271B
a28;271C
a29;2722
a2;2702
a30;2723
a31;2724
a32;2725
a33;2726
a34;2727
a35;2605
a36;2729
a37;272A
a38;272B
a39;272C
a3;2704
a40;272D
a41;272E
a42;272F
a43;2730
a44;2731
a45;2732
a46;2733
a47;2734
a48;2735
a49;2736
a4;260E
a50;2737
a51;2738
a52;2739
a53;273A
a54;273B
a55;273C
a56;273D
a57;273E
a58;273F
a59;2740
a5;2706
a60;2741
a61;2742
a62;2743
a63;2744
a64;2745
a65;2746
a66;2747
a67;2748
a68;2749
a69;274A
a6;271D
a70;274B
a71;25CF
a72;274D
a73;25A0
a74;274F
a75;2751
a76;25B2
a77;25BC
a78;25C6
a79;2756
a7;271E
a81;25D7
a82;2758
a83;2759
a84;275A
a85;276F
a86;2771
a87;2772
a88;2773
a89;2768
a8;271F
a90;2769
a91;276C
a92;276D
a93;276A
a94;276B
a95;2774
a96;2775
a97;275B
a98;275C
a99;275D
a9;2720
space;0020
#END

View File

@@ -112,6 +112,9 @@
<None Remove="Resources\CMap\UniKS-UTF16-H" />
<None Remove="Resources\CMap\UniKS-UTF16-V" />
<None Remove="Resources\CMap\V" />
<None Remove="Resources\GlyphList\additional" />
<None Remove="Resources\GlyphList\glyphlist" />
<None Remove="Resources\GlyphList\zapfdingbats" />
</ItemGroup>
<ItemGroup>
@@ -226,6 +229,9 @@
<EmbeddedResource Include="Resources\CMap\UniKS-UTF16-H" />
<EmbeddedResource Include="Resources\CMap\UniKS-UTF16-V" />
<EmbeddedResource Include="Resources\CMap\V" />
<EmbeddedResource Include="Resources\GlyphList\additional" />
<EmbeddedResource Include="Resources\GlyphList\glyphlist" />
<EmbeddedResource Include="Resources\GlyphList\zapfdingbats" />
</ItemGroup>
</Project>