From 39d05e6a4776add7079112890237699eb9c83cc5 Mon Sep 17 00:00:00 2001
From: Eliot Jones
Date: Wed, 5 Jun 2019 18:02:24 +0100
Subject: [PATCH] support big endian and little endian utf 16 in string tokens #32

---
 .../Tokenization/StringTokenizerTests.cs      | 17 +++++++++++-
 .../Tokenization/StringTokenizer.cs           | 27 ++++++++++++++++++-
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs
index 22c653ae..16a52d02 100644
--- a/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs
@@ -18,7 +18,6 @@
         }
 
         [Theory]
-        [InlineData(')')]
         [InlineData('<')]
         [InlineData('\\')]
         [InlineData('A')]
@@ -261,6 +260,22 @@ are the same.)";
             Assert.Equal(@" (sleep 1; printf ""QUIT\r\n"") | ", AssertStringToken(token).Data);
         }
 
+        [Fact]
+        public void HandlesUtf16Strings()
+        {
+            var input = new ByteArrayInputBytes(new byte[]
+            {
+                0xFE, 0xFF, 0x00, 0x4D, 0x00, 0x69, 0x00,
+                0x63, 0x29
+            });
+
+            var result = tokenizer.TryTokenize(0x28, input, out var token);
+
+            Assert.True(result);
+
+            Assert.Equal(@"Mic", AssertStringToken(token).Data);
+        }
+
         private static StringToken AssertStringToken(IToken token)
         {
             Assert.NotNull(token);
diff --git a/src/UglyToad.PdfPig/Tokenization/StringTokenizer.cs b/src/UglyToad.PdfPig/Tokenization/StringTokenizer.cs
index 6fa3447b..d8a16617 100644
--- a/src/UglyToad.PdfPig/Tokenization/StringTokenizer.cs
+++ b/src/UglyToad.PdfPig/Tokenization/StringTokenizer.cs
@@ -146,7 +146,32 @@
                 }
             }
 
-            token = new StringToken(builder.ToString());
+            string tokenStr;
+            if (builder.Length >= 2)
+            {
+                if (builder[0] == 0xFE && builder[1] == 0xFF)
+                {
+                    var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
+                    // UTF-16BE: GetString keeps a BOM as U+FEFF, so decode after the 2 BOM bytes (assumes 1 byte per char - confirm).
+                    tokenStr = Encoding.BigEndianUnicode.GetString(rawBytes, 2, rawBytes.Length - 2);
+                }
+                else if (builder[0] == 0xFF && builder[1] == 0xFE)
+                {
+                    var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
+                    // UTF-16LE: GetString keeps a BOM as U+FEFF, so decode after the 2 BOM bytes (assumes 1 byte per char - confirm).
+                    tokenStr = Encoding.Unicode.GetString(rawBytes, 2, rawBytes.Length - 2);
+                }
+                else
+                {
+                    tokenStr = builder.ToString();
+                }
+            }
+            else
+            {
+                tokenStr = builder.ToString();
+            }
+
+            token = new StringToken(tokenStr);
             return true;
         }
 