diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/HexTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/HexTokenizerTests.cs
index 659c1f74..b74d3de8 100644
--- a/src/UglyToad.PdfPig.Tests/Tokenization/HexTokenizerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/HexTokenizerTests.cs
@@ -37,6 +37,19 @@
             Assert.Equal(expected, AssertHexToken(token).Data);
         }
 
+        [Theory]
+        [InlineData("<FEFF004C0069006200720065004F0066006600690063006500200036002E0031>", "LibreOffice 6.1")]
+        [InlineData("<FEFF30533093306B3061306F4E16754C>", "こんにちは世界")]
+        public void HandlesUtf16Strings(string s, string expected)
+        {
+            var input = StringBytesTestConverter.Convert(s);
+
+            var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token);
+
+            Assert.True(result);
+            Assert.Equal(expected, AssertHexToken(token).Data);
+        }
+
         private static HexToken AssertHexToken(IToken token)
         {
             Assert.NotNull(token);
diff --git a/src/UglyToad.PdfPig/Tokens/HexToken.cs b/src/UglyToad.PdfPig/Tokens/HexToken.cs
index 7978491f..d652dc62 100644
--- a/src/UglyToad.PdfPig/Tokens/HexToken.cs
+++ b/src/UglyToad.PdfPig/Tokens/HexToken.cs
@@ -62,7 +62,6 @@ namespace UglyToad.PdfPig.Tokens
             }
 
             var bytes = new List<byte>();
-            var builder = new StringBuilder();
 
             for (var i = 0; i < characters.Count; i += 2)
             {
@@ -79,15 +78,29 @@ namespace UglyToad.PdfPig.Tokens
 
                 var b = Convert(high, low);
                 bytes.Add(b);
+            }
 
-                if (b != '\0')
+            // Handle UTF-16BE format strings.
+            if (bytes.Count >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
+            {
+                Data = Encoding.BigEndianUnicode.GetString(bytes.ToArray(), 2, bytes.Count - 2);
+            }
+            else
+            {
+                var builder = new StringBuilder();
+
+                foreach (var b in bytes)
                 {
-                    builder.Append((char)b);
+                    if (b != '\0')
+                    {
+                        builder.Append((char)b);
+                    }
                 }
+
+                Data = builder.ToString();
             }
 
             Bytes = bytes;
-            Data = builder.ToString();
         }
 
         /// <summary>