mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-08-20 09:37:44 +08:00
Handle odd ligatures names and fix #945
This commit is contained in:
parent
20804245d0
commit
de8f39eb7a
@ -13,7 +13,7 @@
|
||||
public class GlyphList
|
||||
{
|
||||
/// <summary>
|
||||
/// <c>.notdef</c>.
|
||||
/// <c>.notdef</c> name.
|
||||
/// </summary>
|
||||
public const string NotDefined = ".notdef";
|
||||
|
||||
@ -37,7 +37,7 @@
|
||||
public static GlyphList AdditionalGlyphList => LazyAdditionalGlyphList.Value;
|
||||
|
||||
private static readonly Lazy<GlyphList> LazyZapfDingbatsGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("zapfdingbats"));
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Zapf Dingbats.
|
||||
/// </summary>
|
||||
@ -84,6 +84,7 @@
|
||||
|
||||
/// <summary>
|
||||
/// Get the unicode value for the glyph name.
|
||||
/// See <see href="https://github.com/adobe-type-tools/agl-specification"/>.
|
||||
/// </summary>
|
||||
public string NameToUnicode(string name)
|
||||
{
|
||||
@ -103,25 +104,47 @@
|
||||
}
|
||||
|
||||
string unicode;
|
||||
// Remove suffixes
|
||||
// 1. Drop all the characters from the glyph name starting with the first occurrence of a period (U+002E FULL STOP), if any.
|
||||
if (name.IndexOf('.') > 0)
|
||||
{
|
||||
unicode = NameToUnicode(name.Substring(0, name.IndexOf('.')));
|
||||
}
|
||||
else if (name.StartsWith("uni") && name.Length == 7)
|
||||
// 2. Split the remaining string into a sequence of components, using underscore (U+005F LOW LINE) as the delimiter.
|
||||
else if (name.IndexOf('_') > 0)
|
||||
{
|
||||
/*
|
||||
* MOZILLA-3136-0.pdf
|
||||
* 68-1990-01_A.pdf
|
||||
* TIKA-2054-0.pdf
|
||||
*/
|
||||
var sb = new StringBuilder();
|
||||
foreach (var s in name.Split('_'))
|
||||
{
|
||||
sb.Append(NameToUnicode(s));
|
||||
}
|
||||
|
||||
unicode = sb.ToString();
|
||||
}
|
||||
// Otherwise, if the component is of the form ‘uni’ (U+0075, U+006E, and U+0069) followed by a sequence of uppercase hexadecimal
|
||||
// digits (0–9 and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046), if the length of that sequence is a multiple
|
||||
// of four, and if each group of four digits represents a value in the ranges 0000 through D7FF or E000 through FFFF, then
|
||||
// interpret each as a Unicode scalar value and map the component to the string made of those scalar values. Note that the range
|
||||
// and digit-length restrictions mean that the ‘uni’ glyph name prefix can be used only with UVs in the Basic Multilingual Plane (BMP).
|
||||
else if (name.StartsWith("uni") && (name.Length - 3) % 4 == 0)
|
||||
{
|
||||
// test for Unicode name in the format uniXXXX where X is hex
|
||||
int nameLength = name.Length;
|
||||
|
||||
var uniStr = new StringBuilder();
|
||||
|
||||
var foundUnicode = true;
|
||||
for (int chPos = 3; chPos + 4 <= nameLength; chPos += 4)
|
||||
{
|
||||
if (!int.TryParse(name.AsSpanOrSubstring(chPos, 4), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var codePoint))
|
||||
if (!int.TryParse(name.AsSpanOrSubstring(chPos, 4),
|
||||
NumberStyles.HexNumber,
|
||||
CultureInfo.InvariantCulture,
|
||||
out var codePoint))
|
||||
{
|
||||
foundUnicode = false;
|
||||
break;
|
||||
return null;
|
||||
}
|
||||
|
||||
if (codePoint > 0xD7FF && codePoint < 0xE000)
|
||||
@ -132,33 +155,30 @@
|
||||
uniStr.Append((char)codePoint);
|
||||
}
|
||||
|
||||
if (!foundUnicode)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
unicode = uniStr.ToString();
|
||||
}
|
||||
else if (name.StartsWith("u", StringComparison.Ordinal) && name.Length == 5)
|
||||
// Otherwise, if the component is of the form ‘u’ (U+0075) followed by a sequence of four to six uppercase hexadecimal digits (0–9
|
||||
// and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046), and those digits represents a value in the ranges 0000 through
|
||||
// D7FF or E000 through 10FFFF, then interpret it as a Unicode scalar value and map the component to the string made of this scalar value.
|
||||
else if (name.StartsWith("u", StringComparison.Ordinal) && name.Length >= 5 && name.Length <= 7)
|
||||
{
|
||||
// test for an alternate Unicode name representation uXXXX
|
||||
var codePoint = int.Parse(name.AsSpanOrSubstring(1), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
|
||||
|
||||
if (codePoint > 0xD7FF && codePoint < 0xE000)
|
||||
{
|
||||
throw new InvalidFontFormatException(
|
||||
$"Unicode character name with disallowed code area: {name}");
|
||||
throw new InvalidFontFormatException($"Unicode character name with disallowed code area: {name}");
|
||||
}
|
||||
|
||||
unicode = char.ConvertFromUtf32(codePoint);
|
||||
}
|
||||
// Ad-hoc special cases
|
||||
else if (name.StartsWith("c", StringComparison.OrdinalIgnoreCase) && name.Length >= 3 && name.Length <= 4)
|
||||
{
|
||||
// name representation cXXX
|
||||
var codePoint = int.Parse(name.AsSpanOrSubstring(1), NumberStyles.Integer, CultureInfo.InvariantCulture);
|
||||
System.Diagnostics.Debug.Assert(codePoint > 0);
|
||||
unicode = char.ConvertFromUtf32(codePoint);
|
||||
}
|
||||
// Otherwise, map the component to an empty string.
|
||||
else
|
||||
{
|
||||
return null;
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
{
|
||||
var list = new GlyphList(new Dictionary<string, string>
|
||||
{
|
||||
{"Boris", "B"}
|
||||
{ "Boris", "B" }
|
||||
});
|
||||
|
||||
var result = list.NameToUnicode("Boris.Special");
|
||||
@ -70,7 +70,7 @@
|
||||
{
|
||||
var list = new GlyphList(new Dictionary<string, string>
|
||||
{
|
||||
{"B", "X"}
|
||||
{ "B", "X" }
|
||||
});
|
||||
|
||||
var result = list.NameToUnicode("uni0042");
|
||||
@ -83,12 +83,27 @@
|
||||
{
|
||||
var list = new GlyphList(new Dictionary<string, string>
|
||||
{
|
||||
{"E", "Æ"}
|
||||
{ "E", "Æ" }
|
||||
});
|
||||
|
||||
var result = list.NameToUnicode("u0045");
|
||||
|
||||
Assert.Equal("E", result);
|
||||
}
|
||||
|
||||
|
||||
[Fact]
public void NameToUnicodeConvertAglSpecification()
{
    // Worked example taken from the Adobe Glyph List specification:
    // https://github.com/adobe-type-tools/agl-specification?tab=readme-ov-file#3-examples
    // The name exercises suffix removal ('.alternate'), underscore-separated components,
    // a glyph-list lookup ('Lcommaaccent'), a 'uni' component with two code units and a
    // 'u' component with a five-digit (supplementary plane) value.
    var list = new GlyphList(new Dictionary<string, string>
    {
        { "Lcommaaccent", "\u013B" }
    });

    var result = list.NameToUnicode("Lcommaaccent_uni20AC0308_u1040C.alternate");

    // U+1040C lies outside the Basic Multilingual Plane, so it must be written with the
    // eight-digit \U escape. "\u1040C" would be parsed as U+1040 followed by a literal 'C',
    // which is why the expected and actual strings previously did not match.
    Assert.Equal("\u013B\u20AC\u0308\U0001040C", result);
}
|
||||
}
|
||||
}
|
||||
|
||||
BIN
src/UglyToad.PdfPig.Tests/Integration/Documents/TIKA-2054-0.pdf
Normal file
BIN
src/UglyToad.PdfPig.Tests/Integration/Documents/TIKA-2054-0.pdf
Normal file
Binary file not shown.
@ -6,6 +6,41 @@
|
||||
|
||||
public class GithubIssuesTests
|
||||
{
|
||||
[Fact]
public void Issue945()
{
    // Odd ligatures names: each document below must expose the listed ligature text
    // among the letter values of the given page.
    void AssertPageHasLetter(PdfDocument pdf, int pageNumber, string expectedValue)
    {
        var letterValues = pdf.GetPage(pageNumber).Letters.Select(letter => letter.Value);
        Assert.Contains(expectedValue, letterValues);
    }

    using (var mozilla = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("MOZILLA-3136-0.pdf")))
    {
        AssertPageHasLetter(mozilla, 2, "ff");
    }

    using (var report = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf")))
    {
        AssertPageHasLetter(report, 7, "fi");
    }

    // The TIKA document is opened once and checked on several pages, in the same order as before.
    using (var tika = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("TIKA-2054-0.pdf")))
    {
        AssertPageHasLetter(tika, 3, "fi");
        AssertPageHasLetter(tika, 4, "ff");
        AssertPageHasLetter(tika, 6, "fl");
        AssertPageHasLetter(tika, 16, "ffi");
    }
}
|
||||
|
||||
[Fact]
|
||||
public void Issue943()
|
||||
{
|
||||
|
||||
Loading…
Reference in New Issue
Block a user