Fix windows-1252 encoding not working on net6.0 and net8.0

This commit is contained in:
Jason Nelson 2024-04-14 19:48:00 -07:00
parent 7c36e203d0
commit ec720b1c91
4 changed files with 28 additions and 4 deletions

View File

@ -5,6 +5,22 @@
public class EncodingsTests
{
[Fact]
public void Windows1252Encoding()
{
    // Regression pin: this value only guarantees that every target framework extracts
    // the same text from the document. It has not been verified as the "true" content,
    // so correct or update it if the characters turn out to be wrong.
    const string expected = "ҘҹЧѥЧКጹѝঐܮ̂ҥ҇ҁӃ࿋\u0c0dҀғҊ˺෨ཌආр෨ཌ̂ҘҹЧѥЧКጹѝঐܮ̂ҥ҇ҁӃ࿋\u0c0dҀғҊ˺෨ཌආр෨ཌ̂ݰႺࢥ༢࣭\u089aѽ̔ҫһҐ̔ݰႺࢥ༢࣭\u089aѽ̔ҫһҐ̔";

    string documentPath = IntegrationHelpers.GetDocumentPath("GHOSTSCRIPT-698363-0.pdf");

    // The document stays open until the end of the method, i.e. until after the assertion.
    using var pdf = PdfDocument.Open(documentPath);
    var firstPage = pdf.GetPage(1);

    // Join the value of every letter on page 1 into one string, in page order.
    string actual = string.Join(string.Empty, firstPage.Letters.Select(letter => letter.Value));

    Assert.Equal(expected, actual);
}
[Fact]
public void Issue688()
{

View File

@ -3,10 +3,11 @@
public class IntegrationDocumentTests
{
// Absolute path of the "Integration/Documents" folder, resolved lazily on first access
// by walking three levels up from the app domain's base directory.
private static readonly Lazy<string> DocumentFolder = new Lazy<string>(() => Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents")));
// NOTE(review): this listing appears to be a rendered diff. The declaration directly below
// looks like the removed (pre-change) side and is superseded by the collection-expression
// declaration that follows it; confirm before treating both as live code.
private static readonly HashSet<string> _documentsToIgnore = new HashSet<string>()
{
"issue_671.pdf"
};
// File names of documents to ignore. How the set is consumed is not visible in this
// excerpt; presumably the GetAllDocuments member data filters on it (TODO confirm).
private static readonly HashSet<string> _documentsToIgnore =
[
"issue_671.pdf",
"GHOSTSCRIPT-698363-0.pdf"
];
[Theory]
[MemberData(nameof(GetAllDocuments))]

View File

@ -11,6 +11,13 @@
internal class NameTokenizer : ITokenizer
{
/// <summary>
/// Type initializer. On .NET 6 and later it registers the code-pages encoding provider
/// before any other member of <see cref="NameTokenizer"/> is used; on other targets it
/// does nothing.
/// </summary>
static NameTokenizer()
{
#if NET6_0_OR_GREATER
    // Code-page encodings such as windows-1252 are only resolvable on these runtimes
    // once the provider has been registered.
    var codePagesProvider = CodePagesEncodingProvider.Instance;
    Encoding.RegisterProvider(codePagesProvider);
#endif
}
/// <summary>
/// Always <c>true</c> for this tokenizer.
/// NOTE(review): presumably part of the <c>ITokenizer</c> contract; confirm against the interface.
/// </summary>
public bool ReadsNextByte => true;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)