String tokens record the encoding used to create them.

In order to recreate the valid bytes for use in decryption, it is necessary to know which encoding was used to read a string token. This is because a UTF-16BE encoded string begins with a byte-order mark, which should be included in the resulting bytes.
2025-10-14 19:05:01 +08:00 · 2020-01-26 17:07:58 +00:00
parent 693a3d5958
commit 6cf257a331
5 changed files with 345 additions and 4 deletions
--- a/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs
@@ -144,6 +144,7 @@
                }
            }

+            StringToken.Encoding encodedWith;
            string tokenStr;
            if (builder.Length >= 2)
            {
@@ -152,24 +153,32 @@
                    var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());

                    tokenStr = Encoding.BigEndianUnicode.GetString(rawBytes).Substring(1);
+
+                    encodedWith = StringToken.Encoding.Utf16BE;
                }
                else if (builder[0] == 0xFF && builder[1] == 0xFE)
                {
                    var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());

                    tokenStr = Encoding.Unicode.GetString(rawBytes).Substring(1);
+
+                    encodedWith = StringToken.Encoding.Utf16;
                }
                else
                {
                    tokenStr = builder.ToString();
+
+                    encodedWith = StringToken.Encoding.Iso88591;
                }
            }
            else
            {
                tokenStr = builder.ToString();
+
+                encodedWith = StringToken.Encoding.Iso88591;
            }

-            token = new StringToken(tokenStr);
+            token = new StringToken(tokenStr, encodedWith);

            return true;
        }