Handle alternate Unicode name representation cXXX and fix #943

This commit is contained in:
BobLd 2024-11-24 20:16:20 +00:00
parent bcc8ccecbe
commit d12afb0b8d
2 changed files with 30 additions and 0 deletions

View File

@ -152,6 +152,13 @@
unicode = char.ConvertFromUtf32(codePoint);
}
else if (name.StartsWith("c", StringComparison.OrdinalIgnoreCase) && name.Length >= 3 && name.Length <= 4)
{
    // Alternate name representation cXXX: the letter 'c' (either case) followed by
    // a two or three character decimal code point.
    //
    // The shape test above also matches ordinary short names that merely start with
    // 'c' (e.g. "cat"), so the suffix is parsed defensively. A name whose suffix is
    // not a positive integer is reported as unknown (null), the same result the final
    // 'else' branch gives, instead of throwing FormatException from int.Parse or
    // ArgumentOutOfRangeException from char.ConvertFromUtf32 (signed input such as "c-12").
    if (!int.TryParse(name.AsSpanOrSubstring(1), NumberStyles.Integer, CultureInfo.InvariantCulture, out var codePoint)
        || codePoint <= 0)
    {
        return null;
    }

    // At most three digits remain, so the value is in 1..999 and is always a valid,
    // non-surrogate code point for ConvertFromUtf32.
    unicode = char.ConvertFromUtf32(codePoint);
}
else
{
return null;

View File

@ -1,9 +1,32 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using Content;
using DocumentLayoutAnalysis.PageSegmenter;
using DocumentLayoutAnalysis.WordExtractor;
public class GithubIssuesTests
{
[Fact]
public void Issue943()
{
    // Regression test for GitHub issue #943: after word extraction and Docstrum
    // block segmentation, the first block of page 1 must start with the two
    // expected title lines.
    var pdfPath = IntegrationHelpers.GetDocumentPath("MOZILLA-10225-0.pdf");

    using (var pdf = PdfDocument.Open(pdfPath))
    {
        var firstPage = pdf.GetPage(1);
        Assert.NotNull(firstPage);

        var pageLetters = firstPage.Letters;
        Assert.NotNull(pageLetters);

        // Letters -> words -> blocks, using the default extractor/segmenter instances.
        var extractedWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);
        var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(extractedWords);

        var leadingBlock = textBlocks[0];
        Assert.Equal("Rocket and Spacecraft Propulsion", leadingBlock.TextLines[0].Text);
        Assert.Equal("Principles, Practice and New Developments (Second Edition)", leadingBlock.TextLines[1].Text);
    }
}
[Fact]
public void Issue736()
{