Merge pull request #40 from Numpsy/rw/unicode_hex_strings

Add UTF-16 parsing support to HexToken
This commit is contained in:
Eliot Jones
2019-06-23 12:38:44 +01:00
committed by GitHub
2 changed files with 30 additions and 4 deletions

View File

@@ -37,6 +37,19 @@
Assert.Equal(expected, AssertHexToken(token).Data);
}
/// <summary>
/// Checks that a hex string whose payload starts with the bytes FE FF is
/// tokenized successfully and that the resulting hex token exposes the
/// expected decoded text through its Data property.
/// </summary>
[Theory]
[InlineData("<FEFF004C0069006200720065004F0066006600690063006500200036002E0031>", "LibreOffice 6.1")]
[InlineData("<FEFF30533093306B3061306F4E16754C>", "こんにちは世界")]
public void HandlesUtf16Strings(string s, string expected)
{
    // Arrange: turn the raw test string into the tokenizer's input form.
    var converted = StringBytesTestConverter.Convert(s);

    // Act
    var succeeded = tokenizer.TryTokenize(converted.First, converted.Bytes, out var parsed);

    // Assert: tokenization must succeed before the token contents are inspected.
    Assert.True(succeeded);

    var hexToken = AssertHexToken(parsed);
    Assert.Equal(expected, hexToken.Data);
}
private static HexToken AssertHexToken(IToken token)
{
Assert.NotNull(token);

View File

@@ -62,7 +62,6 @@ namespace UglyToad.PdfPig.Tokens
}
var bytes = new List<byte>();
var builder = new StringBuilder();
for (var i = 0; i < characters.Count; i += 2)
{
@@ -79,15 +78,29 @@ namespace UglyToad.PdfPig.Tokens
var b = Convert(high, low);
bytes.Add(b);
}
if (b != '\0')
// Handle UTF-16BE format strings.
if (bytes.Count >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
{
Data = Encoding.BigEndianUnicode.GetString(bytes.ToArray(), 2, bytes.Count - 2);
}
else
{
var builder = new StringBuilder();
foreach (var b in bytes)
{
builder.Append((char)b);
if (b != '\0')
{
builder.Append((char)b);
}
}
Data = builder.ToString();
}
Bytes = bytes;
Data = builder.ToString();
}
/// <summary>