mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
Strings record the encoding used to create them.
In order to recreate the valid bytes for use in decryption, it is necessary to know which encoding was used to read a string token. This is because UTF-16BE encoded strings begin with a byte-order mark, which should be included in the resulting bytes.
This commit is contained in:
@@ -57,5 +57,280 @@
|
|||||||
|
|
||||||
return Iso88591.GetString(bytes);
|
return Iso88591.GetString(bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// The PDF Doc Encoding used for strings in a PDF file: a single-byte encoding in which
/// every byte code (0-255) is assigned one Unicode character.
/// </summary>
public static class PdfDocEncoding
{
    /// <summary>
    /// Character assigned to the codes the table treats as undefined (127, 159 and 173).
    /// </summary>
    private const char UndefinedCharacter = '\uFFFD';

    /// <summary>
    /// Characters for codes 24 to 31, in code order.
    /// </summary>
    private static readonly char[] Codes24To31 =
    {
        '\u02D8', '\u02C7', '\u02C6', '\u02D9', '\u02DD', '\u02DB', '\u02DA', '\u02DC'
    };

    /// <summary>
    /// Characters for codes 128 to 158, in code order.
    /// </summary>
    private static readonly char[] Codes128To158 =
    {
        '\u2022', '\u2020', '\u2021', '\u2026', '\u2014', '\u2013', '\u0192', '\u2044',
        '\u2039', '\u203A', '\u2212', '\u2030', '\u201E', '\u201C', '\u201D', '\u2018',
        '\u2019', '\u201A', '\u2122', '\uFB01', '\uFB02', '\u0141', '\u0152', '\u0160',
        '\u0178', '\u017D', '\u0131', '\u0142', '\u0153', '\u0161', '\u017E'
    };

    // Declared after the arrays above because static field initializers run in textual order
    // and the builder reads those arrays.
    private static readonly IReadOnlyDictionary<byte, char> CodeToUnicode = BuildCodeToUnicode();

    /// <summary>
    /// Whether the PDF Doc Encoding contains a corresponding character.
    /// </summary>
    /// <param name="c">The character to look for among the mapped values.</param>
    /// <returns>
    /// <see langword="true"/> if at least one byte code maps to <paramref name="c"/>,
    /// otherwise <see langword="false"/>.
    /// </returns>
    public static bool ContainsChar(char c)
    {
        foreach (var mapped in CodeToUnicode.Values)
        {
            if (mapped == c)
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Builds the complete code to character table: every code starts out mapped to the Unicode
    /// code point of the same value and the codes which differ are then overwritten.
    /// </summary>
    private static Dictionary<byte, char> BuildCodeToUnicode()
    {
        var table = new Dictionary<byte, char>(256);

        for (var code = 0; code <= byte.MaxValue; code++)
        {
            table[(byte)code] = (char)code;
        }

        // Code 22 is mapped to U+0017, the same character as code 23, so U+0016 is not in the table.
        // NOTE(review): presumably this mirrors the published encoding table - confirm before changing.
        table[22] = '\u0017';

        for (var i = 0; i < Codes24To31.Length; i++)
        {
            table[(byte)(24 + i)] = Codes24To31[i];
        }

        for (var i = 0; i < Codes128To158.Length; i++)
        {
            table[(byte)(128 + i)] = Codes128To158[i];
        }

        // Code 160 is the euro sign rather than U+00A0.
        table[160] = '\u20AC';

        // Undefined codes.
        table[127] = UndefinedCharacter;
        table[159] = UndefinedCharacter;
        table[173] = UndefinedCharacter;

        return table;
    }
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -144,6 +144,7 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
StringToken.Encoding encodedWith;
|
||||||
string tokenStr;
|
string tokenStr;
|
||||||
if (builder.Length >= 2)
|
if (builder.Length >= 2)
|
||||||
{
|
{
|
||||||
@@ -152,24 +153,32 @@
|
|||||||
var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
|
var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
|
||||||
|
|
||||||
tokenStr = Encoding.BigEndianUnicode.GetString(rawBytes).Substring(1);
|
tokenStr = Encoding.BigEndianUnicode.GetString(rawBytes).Substring(1);
|
||||||
|
|
||||||
|
encodedWith = StringToken.Encoding.Utf16BE;
|
||||||
}
|
}
|
||||||
else if (builder[0] == 0xFF && builder[1] == 0xFE)
|
else if (builder[0] == 0xFF && builder[1] == 0xFE)
|
||||||
{
|
{
|
||||||
var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
|
var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString());
|
||||||
|
|
||||||
tokenStr = Encoding.Unicode.GetString(rawBytes).Substring(1);
|
tokenStr = Encoding.Unicode.GetString(rawBytes).Substring(1);
|
||||||
|
|
||||||
|
encodedWith = StringToken.Encoding.Utf16;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
tokenStr = builder.ToString();
|
tokenStr = builder.ToString();
|
||||||
|
|
||||||
|
encodedWith = StringToken.Encoding.Iso88591;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
tokenStr = builder.ToString();
|
tokenStr = builder.ToString();
|
||||||
|
|
||||||
|
encodedWith = StringToken.Encoding.Iso88591;
|
||||||
}
|
}
|
||||||
|
|
||||||
token = new StringToken(tokenStr);
|
token = new StringToken(tokenStr, encodedWith);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
namespace UglyToad.PdfPig.Tokens
|
namespace UglyToad.PdfPig.Tokens
|
||||||
{
|
{
|
||||||
using System;
|
using System;
|
||||||
|
using Core;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Represents a string of text contained in a PDF document.
|
/// Represents a string of text contained in a PDF document.
|
||||||
@@ -12,13 +13,49 @@ namespace UglyToad.PdfPig.Tokens
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public string Data { get; }
|
public string Data { get; }
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// The encoding used to generate the <see langword="string"/> in <see cref="Data"/>
|
||||||
|
/// from the bytes in the file.
|
||||||
|
/// </summary>
|
||||||
|
public Encoding EncodedWith { get; }
|
||||||
|
|
||||||
/// <summary>
/// Create a new <see cref="StringToken"/>.
/// </summary>
/// <param name="data">The text this token holds, must not be <see langword="null"/>.</param>
/// <param name="encodedWith">
/// The encoding which produced <paramref name="data"/> from the bytes in the file,
/// <see cref="Encoding.Iso88591"/> when not specified.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is <see langword="null"/>.</exception>
public StringToken(string data, Encoding encodedWith = Encoding.Iso88591)
{
    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    Data = data;
    EncodedWith = encodedWith;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Convert the <see langword="string"/> in <see cref="Data"/> back to bytes using the
/// encoding recorded in <see cref="EncodedWith"/>.
/// </summary>
/// <remarks>
/// When a UTF-16 string is read from a file its leading byte-order mark is removed from
/// <see cref="Data"/>. For both UTF-16 variants the mark is therefore written back in front of
/// the encoded text so that the returned bytes match the bytes the string was decoded from.
/// </remarks>
/// <returns>The encoded bytes, prefixed with a byte-order mark for the UTF-16 encodings.</returns>
public byte[] GetBytes()
{
    switch (EncodedWith)
    {
        case Encoding.Utf16BE:
            // Big endian mark: FE FF.
            return PrependByteOrderMark(System.Text.Encoding.BigEndianUnicode.GetBytes(Data), 0xFE, 0xFF);
        case Encoding.Utf16:
            // Little endian mark: FF FE. GetBytes on the encoding does not emit it itself.
            return PrependByteOrderMark(System.Text.Encoding.Unicode.GetBytes(Data), 0xFF, 0xFE);
        default:
            return OtherEncodings.StringAsLatin1Bytes(Data);
    }
}

/// <summary>
/// Returns a new array containing the two byte-order mark bytes followed by <paramref name="encoded"/>.
/// </summary>
private static byte[] PrependByteOrderMark(byte[] encoded, byte first, byte second)
{
    var result = new byte[encoded.Length + 2];
    result[0] = first;
    result[1] = second;

    Array.Copy(encoded, 0, result, 2, encoded.Length);

    return result;
}
|
||||||
|
|
||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
@@ -26,5 +63,24 @@ namespace UglyToad.PdfPig.Tokens
|
|||||||
{
|
{
|
||||||
return $"({Data})";
|
return $"({Data})";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Identifies which encoding converted the underlying bytes of the file into the string.
/// </summary>
public enum Encoding : byte
{
    /// <summary>
    /// The string was read with <see cref="OtherEncodings.Iso88591"/>.
    /// </summary>
    Iso88591 = 0,

    /// <summary>
    /// The string was read as UTF-16.
    /// </summary>
    Utf16 = 1,

    /// <summary>
    /// The string was read as UTF-16 Big Endian.
    /// </summary>
    Utf16BE = 2
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
|
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
|
||||||
|
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=BE/@EntryIndexedValue">BE</s:String>
|
||||||
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=CIE/@EntryIndexedValue">CIE</s:String>
|
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=CIE/@EntryIndexedValue">CIE</s:String>
|
||||||
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=CMYK/@EntryIndexedValue">CMYK</s:String>
|
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=CMYK/@EntryIndexedValue">CMYK</s:String>
|
||||||
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=ICC/@EntryIndexedValue">ICC</s:String>
|
<s:String x:Key="/Default/CodeStyle/Naming/CSharpNaming/Abbreviations/=ICC/@EntryIndexedValue">ICC</s:String>
|
||||||
|
|||||||
@@ -43,7 +43,7 @@
|
|||||||
{
|
{
|
||||||
if (ownerToken is StringToken ownerString)
|
if (ownerToken is StringToken ownerString)
|
||||||
{
|
{
|
||||||
ownerBytes = OtherEncodings.StringAsLatin1Bytes(ownerString.Data);
|
ownerBytes = ownerString.GetBytes();
|
||||||
}
|
}
|
||||||
else if (ownerToken is HexToken ownerHex)
|
else if (ownerToken is HexToken ownerHex)
|
||||||
{
|
{
|
||||||
@@ -56,7 +56,7 @@
|
|||||||
{
|
{
|
||||||
if (userToken is StringToken userString)
|
if (userToken is StringToken userString)
|
||||||
{
|
{
|
||||||
userBytes = OtherEncodings.StringAsLatin1Bytes(userString.Data);
|
userBytes = userString.GetBytes();
|
||||||
}
|
}
|
||||||
else if (userToken is HexToken userHex)
|
else if (userToken is HexToken userHex)
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user