mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-07-16 11:59:51 +08:00
Merge pull request #526 from fnatzke/Issue455-Issue_extracting_unicode_from_CJK_file
Fix #455 extracting unicode from CJK file
This commit is contained in:
commit
7b891edb69
Binary file not shown.
99
src/UglyToad.PdfPig.Tests/Integration/Type0_CJK_FontTests.cs
Normal file
99
src/UglyToad.PdfPig.Tests/Integration/Type0_CJK_FontTests.cs
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
namespace UglyToad.PdfPig.Tests.Integration
|
||||||
|
{
|
||||||
|
using System.IO;
|
||||||
|
using System.Linq;
|
||||||
|
using Content;
|
||||||
|
using Xunit;
|
||||||
|
|
||||||
|
/// <summary>
/// Integration tests for the "Type0_CJK_Font" sample document, which uses Type0 (composite)
/// fonts with the GBK-EUC-H encoding to render Chinese text.
/// </summary>
public class Type0_CJK_FontTests
{
    /// <summary>
    /// Resolves the path of the sample document through <see cref="IntegrationHelpers"/>.
    /// </summary>
    /// <returns>The value returned by <c>IntegrationHelpers.GetDocumentPath</c> for "Type0_CJK_Font".</returns>
    private static string GetFilename()
    {
        return IntegrationHelpers.GetDocumentPath("Type0_CJK_Font");
    }

    /// <summary>
    /// Opens the document from an in-memory byte array and checks the reported page count.
    /// </summary>
    [Fact]
    public void HasCorrectNumberOfPages()
    {
        var file = GetFilename();

        using (var document = PdfDocument.Open(File.ReadAllBytes(file)))
        {
            Assert.Equal(95, document.NumberOfPages);
        }
    }

    /// <summary>
    /// Opens the document by path and checks that the letters of page 1, concatenated in
    /// the order the page reports them, contain the expected Chinese phrases.
    /// </summary>
    [Fact]
    public void HasCorrectChineseCharacters()
    {
        using (var document = PdfDocument.Open(GetFilename()))
        {
            var page = document.GetPage(1);

            var text = string.Join(string.Empty, page.Letters.Select(x => x.Value));

            // Assert.Contains reports the missing substring and the actual text on failure,
            // unlike Assert.True(bool). Ordinal comparison matches string.Contains(string).
            Assert.Contains("中航动力控制股份有限公司", text, System.StringComparison.Ordinal);
            Assert.Contains("年半年度报告", text, System.StringComparison.Ordinal);
            Assert.Contains("中航动力控制股份有限公司董事会", text, System.StringComparison.Ordinal);
            Assert.Contains("2010年8月17日", text, System.StringComparison.Ordinal);

            //charcode, cid, unicode, char,0xc1a6,0x09ef,\u529b,力
            //charcode, cid, unicode, char,0xbfd8,0x0965,\u63a7,控
            //charcode, cid, unicode, char,0xd6c6,0x11c5,\u5236,制
            //charcode, cid, unicode, char,0xb9c9,0x0722,\u80a1,股
            //charcode, cid, unicode, char,0xb7dd,0x067a,\u4efd,份
            //charcode, cid, unicode, char,0xd3d0,0x10b5,\u6709,有
            //charcode, cid, unicode, char,0xcfde,0x0f4b,\u9650,限
            //charcode, cid, unicode, char,0xb9ab,0x0704,\u516c,公
            //charcode, cid, unicode, char,0xcbbe,0x0db3,\u53f8,司
            //charcode, cid, unicode, char,0xb6ad,0x05ec,\u8463,董
            //charcode, cid, unicode, char,0xcac2,0x0d59,\u4e8b,事
            //charcode, cid, unicode, char,0xbbe1,0x07f6,\u4f1a,会
            //charcode, cid, unicode, char,0xc4ea,0x0b4d,\u5e74,年
            //charcode, cid, unicode, char,0xd4c2,0x1105,\u6708,月
            //charcode, cid, unicode, char,0xc8d5,0x0cb0,\u65e5,日

            /*
            Font Dictionary of Page 1
            Name    References
            TT2     321
            TT4     322
            TT6     327
            TT7     329
            TT9     330
            TT10    322

            Font Details
            TT2     {[BaseFont, {/TimesNewRoman}]} {[Encoding, {/WinAnsiEncoding}]}
            TT4     {[BaseFont, {/TimesNewRoman,Bold}]} {[Encoding, {/WinAnsiEncoding}]}
            TT6     {[BaseFont, {/KaiTi_GB2312}]} {[Encoding, {/GBK-EUC-H}]}
                    DescendantFont {[BaseFont, {/KaiTi_GB2312}]}
                                   {[Subtype, {/CIDFontType2}]}
                                   {[CIDSystemInfo, {<Registry, (Adobe)>, <Ordering, (GB1)>, <Supplement, 2>}]}

            TT7     {[BaseFont, {/KaiTi_GB2312+2}]}
                    {[Subtype, {/TrueType}]}
                    {[Encoding, {/WinAnsiEncoding}]}
                    {[BaseFont, {/KaiTi_GB2312+2}]}


            TT9     {[BaseFont, {/SimHei}]}
                    {[Subtype, {/TrueType}]}
                    {[Encoding, {/WinAnsiEncoding}]}

            TT10    {[BaseFont, {/SimHei+2}]}
                    {[Encoding, {/GBK-EUC-H}]}
                    {[Subtype, {/Type0}]}
                    DescendantFont
                                   {[Subtype, {/CIDFontType2}]}
                                   {[CIDSystemInfo, {<Registry, (Adobe)>, <Ordering, (GB1)>, <Supplement, 2>}]}
                                   {[BaseFont, {/SimHei+2}]}
            */
        }
    }
}
|
||||||
|
}
|
@ -8,7 +8,7 @@
|
|||||||
using Geometry;
|
using Geometry;
|
||||||
using Tokens;
|
using Tokens;
|
||||||
using Util.JetBrains.Annotations;
|
using Util.JetBrains.Annotations;
|
||||||
|
using Debug = System.Diagnostics.Debug;
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Defines glyphs using a CIDFont
|
/// Defines glyphs using a CIDFont
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@ -66,16 +66,37 @@
|
|||||||
|
|
||||||
public bool TryGetUnicode(int characterCode, out string value)
|
public bool TryGetUnicode(int characterCode, out string value)
|
||||||
{
|
{
|
||||||
value = null;
|
value = null;
|
||||||
|
|
||||||
if (!ToUnicode.CanMapToUnicode)
|
var HaveCMap = ToUnicode.CanMapToUnicode;
|
||||||
|
if (HaveCMap == false)
|
||||||
{
|
{
|
||||||
if (ucs2CMap != null && ucs2CMap.TryConvertToUnicode(characterCode, out value))
|
var HaveUnicode2CMap = (ucs2CMap is null == false);
|
||||||
{
|
if (HaveUnicode2CMap)
|
||||||
return value != null;
|
{
|
||||||
}
|
// Have both ucs2Map and CMap convert to unicode by
|
||||||
|
// characterCode ----by CMAP---> CID ---ucs2Map---> Unicode
|
||||||
|
var CID = CMap.ConvertToCid(characterCode);
|
||||||
|
if (CID == 0)
|
||||||
|
{
|
||||||
|
Debug.WriteLine($"Warning: No mapping from characterCode (0x{characterCode:X} to CID by ucs2Map.");
|
||||||
|
return false; // No mapping from characterCode to CID.
|
||||||
|
}
|
||||||
|
// CID ---ucs2Map---> Unicode
|
||||||
|
if (ucs2CMap.TryConvertToUnicode(CID, out value))
|
||||||
|
{
|
||||||
|
return value != null;
|
||||||
|
}
|
||||||
|
|
||||||
return false;
|
}
|
||||||
|
if (HaveUnicode2CMap) // 2022-12-24 @fnatzke left as fall-back. Possible?
|
||||||
|
{
|
||||||
|
// characterCode ---ucs2Map---> Unicode (?) @fnatzke possible?
|
||||||
|
if (ucs2CMap.TryConvertToUnicode(characterCode, out value))
|
||||||
|
{
|
||||||
|
return value != null;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// According to PdfBox certain providers incorrectly using Identity CMaps as ToUnicode.
|
// According to PdfBox certain providers incorrectly using Identity CMaps as ToUnicode.
|
||||||
|
Loading…
Reference in New Issue
Block a user