diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs
index 16a52d02..fc213903 100644
--- a/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs
@@ -273,7 +273,25 @@ are the same.)";
 
             Assert.True(result);
 
-            Assert.Equal(@"Mic", AssertStringToken(token).Data);
+            Assert.Equal(@"Mic", AssertStringToken(token).Data);
+        }
+
+        [Fact]
+        public void HandlesUtf16LittleEndianStrings()
+        {
+            // 0xFF 0xFE is the UTF-16 little-endian byte order mark; "Mic" follows
+            // low byte first and 0x29 ')' closes the string opened by 0x28 '('.
+            var input = new ByteArrayInputBytes(new byte[]
+            {
+                0xFF, 0xFE, 0x4D, 0x00, 0x69, 0x00, 0x63,
+                0x00, 0x29
+            });
+
+            var result = tokenizer.TryTokenize(0x28, input, out var token);
+
+            Assert.True(result);
+
+            Assert.Equal(@"Mic", AssertStringToken(token).Data);
         }
 
         private static StringToken AssertStringToken(IToken token)
diff --git a/src/UglyToad.PdfPig/Tokenization/StringTokenizer.cs b/src/UglyToad.PdfPig/Tokenization/StringTokenizer.cs
index d8a16617..24756fb8 100644
--- a/src/UglyToad.PdfPig/Tokenization/StringTokenizer.cs
+++ b/src/UglyToad.PdfPig/Tokenization/StringTokenizer.cs
@@ -153,13 +153,15 @@
                 {
                     var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
 
-                    tokenStr = Encoding.BigEndianUnicode.GetString(rawBytes);
+                    // GetString keeps the byte order mark as a leading U+FEFF; drop it.
+                    tokenStr = Encoding.BigEndianUnicode.GetString(rawBytes).Substring(1);
                 }
                 else if (builder[0] == 0xFF && builder[1] == 0xFE)
                 {
                     var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
 
-                    tokenStr = Encoding.Unicode.GetString(rawBytes);
+                    // GetString keeps the byte order mark as a leading U+FEFF; drop it.
+                    tokenStr = Encoding.Unicode.GetString(rawBytes).Substring(1);
                 }
                 else
                 {
@@ -188,34 +190,6 @@
             octals[0] = value;
         }
 
-        //private static int CheckForEndOfString(IRandomAccessRead reader, int bracesParameter)
-        //{
-        //    int braces = bracesParameter;
-        //    byte[] nextThreeBytes = new byte[3];
-        //    int amountRead = reader.Read(nextThreeBytes);
-
-        //    // Check the next 3 bytes if available
-        //    // The following cases are valid indicators for the end of the string
-        //    // 1. Next line contains another COSObject: CR + LF + '/'
-        //    // 2. CosDictionary ends in the next line: CR + LF + '>'
-        //    // 3. Next line contains another COSObject: CR + '/'
-        //    // 4. CosDictionary ends in the next line: CR + '>'
-        //    if (amountRead == 3 && nextThreeBytes[0] == ReadHelper.AsciiCarriageReturn)
-        //    {
-        //        if (nextThreeBytes[1] == ReadHelper.AsciiLineFeed && nextThreeBytes[2] == '/' || nextThreeBytes[2] == '>'
-        //            || nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>')
-        //        {
-        //            braces = 0;
-        //        }
-        //    }
-        //    if (amountRead > 0)
-        //    {
-        //        reader.Unread(nextThreeBytes, 0, amountRead);
-        //    }
-        //    return braces;
-        //}
-        //}
-
         private static void ProcessEscapedCharacter(char c, StringBuilder builder, short[] octal, ref bool isOctalActive,
             ref int octalsRead, ref bool isLineBreaking)
         {