From 0432f703c419ef80b9bf21facc282fd487f1192c Mon Sep 17 00:00:00 2001 From: Richard Webb Date: Sun, 23 Jun 2019 01:18:48 +0100 Subject: [PATCH 1/2] extend HexToken to support UTF-16BE encoded hex strings --- src/UglyToad.PdfPig/Tokens/HexToken.cs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/UglyToad.PdfPig/Tokens/HexToken.cs b/src/UglyToad.PdfPig/Tokens/HexToken.cs index 7978491f..d652dc62 100644 --- a/src/UglyToad.PdfPig/Tokens/HexToken.cs +++ b/src/UglyToad.PdfPig/Tokens/HexToken.cs @@ -62,7 +62,6 @@ namespace UglyToad.PdfPig.Tokens } var bytes = new List<byte>(); - var builder = new StringBuilder(); for (var i = 0; i < characters.Count; i += 2) { @@ -79,15 +78,29 @@ namespace UglyToad.PdfPig.Tokens var b = Convert(high, low); bytes.Add(b); + } - if (b != '\0') + // Handle UTF-16BE format strings. + if (bytes.Count >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) + { + Data = Encoding.BigEndianUnicode.GetString(bytes.ToArray(), 2, bytes.Count - 2); + } + else + { + var builder = new StringBuilder(); + + foreach (var b in bytes) { - builder.Append((char)b); + if (b != '\0') + { + builder.Append((char)b); + } } + + Data = builder.ToString(); } Bytes = bytes; - Data = builder.ToString(); } /// From b5b862e63f6417b0134636d0e7d7a607176d65a1 Mon Sep 17 00:00:00 2001 From: Richard Webb Date: Sun, 23 Jun 2019 01:19:43 +0100 Subject: [PATCH 2/2] unit tests for tokenizing UTF16 encoded hex strings. 
--- .../Tokenization/HexTokenizerTests.cs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/HexTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/HexTokenizerTests.cs index 659c1f74..b74d3de8 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/HexTokenizerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/HexTokenizerTests.cs @@ -37,6 +37,19 @@ Assert.Equal(expected, AssertHexToken(token).Data); } + [Theory] + [InlineData("<FEFF004C0069006200720065004F0066006600690063006500200036002E0031>", "LibreOffice 6.1")] + [InlineData("<FEFF30533093306B3061306F4E16754C>", "こんにちは世界")] + public void HandlesUtf16Strings(string s, string expected) + { + var input = StringBytesTestConverter.Convert(s); + + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); + + Assert.True(result); + Assert.Equal(expected, AssertHexToken(token).Data); + } + private static HexToken AssertHexToken(IToken token) { Assert.NotNull(token);