mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-08-20 09:21:57 +08:00
use pdfdocencoding when parsing strings
This commit is contained in:
parent
7fe5fc2272
commit
6f59bed9a2
@ -57,280 +57,5 @@
|
||||
|
||||
return Iso88591.GetString(bytes);
|
||||
}
|
||||
|
||||
/// <summary>
/// The encoding for strings in a PDF file which encodes all of the ISO Latin 1 character set.
/// </summary>
/// <remarks>
/// Codes 0 - 23, 32 - 126, 161 - 172 and 174 - 255 map to the Unicode code point with the same numeric value.
/// Codes 24 - 31 and 128 - 160 map to the characters listed explicitly in the table below.
/// Codes 127, 159 and 173 are undefined and map to U+FFFD.
/// </remarks>
public static class PdfDocEncoding
{
    private static readonly IReadOnlyDictionary<byte, char> CodeToUnicode = BuildCodeToUnicode();

    // Every character that appears as a value in the table, so ContainsChar is a hash lookup
    // instead of a scan over all 256 entries. Must be declared after CodeToUnicode because
    // static field initializers run in textual order.
    private static readonly HashSet<char> MappedCharacters = new HashSet<char>(CodeToUnicode.Values);

    /// <summary>
    /// Whether the PDF Doc Encoding contains a corresponding character.
    /// </summary>
    /// <param name="c">The Unicode character to look for among the table's values.</param>
    /// <returns>
    /// <see langword="true"/> if any code maps to <paramref name="c"/>. Note that U+FFFD is reported
    /// as contained because the undefined codes 127, 159 and 173 map to it.
    /// </returns>
    public static bool ContainsChar(char c)
    {
        return MappedCharacters.Contains(c);
    }

    /// <summary>
    /// Builds the code to Unicode table: explicit entries for the codes which differ from their
    /// Unicode value, then the ranges where code and Unicode value are identical.
    /// </summary>
    private static Dictionary<byte, char> BuildCodeToUnicode()
    {
        var table = new Dictionary<byte, char>
        {
            {24, '\u02D8'},
            {25, '\u02C7'},
            {26, '\u02C6'},
            {27, '\u02D9'},
            {28, '\u02DD'},
            {29, '\u02DB'},
            {30, '\u02DA'},
            {31, '\u02DC'},
            {128, '\u2022'},
            {129, '\u2020'},
            {130, '\u2021'},
            {131, '\u2026'},
            {132, '\u2014'},
            {133, '\u2013'},
            {134, '\u0192'},
            {135, '\u2044'},
            {136, '\u2039'},
            {137, '\u203A'},
            {138, '\u2212'},
            {139, '\u2030'},
            {140, '\u201E'},
            {141, '\u201C'},
            {142, '\u201D'},
            {143, '\u2018'},
            {144, '\u2019'},
            {145, '\u201A'},
            {146, '\u2122'},
            {147, '\uFB01'},
            {148, '\uFB02'},
            {149, '\u0141'},
            {150, '\u0152'},
            {151, '\u0160'},
            {152, '\u0178'},
            {153, '\u017D'},
            {154, '\u0131'},
            {155, '\u0142'},
            {156, '\u0153'},
            {157, '\u0161'},
            {158, '\u017E'},
            {160, '\u20AC'},
            // Undefined codes follow.
            {127, '\uFFFD'},
            {159, '\uFFFD'},
            {173, '\uFFFD'}
        };

        // Code 22 must map to U+0016; mapping it to U+0017 would duplicate code 23 and leave
        // U+0016 without any code. Treating 0 - 23 as one identity range guarantees that.
        AddIdentityRange(table, 0, 23);
        AddIdentityRange(table, 32, 126);
        AddIdentityRange(table, 161, 172);
        AddIdentityRange(table, 174, 255);

        return table;
    }

    /// <summary>
    /// Adds an entry for every code in [first, last] mapping the code to the Unicode code point of the same value.
    /// </summary>
    private static void AddIdentityRange(Dictionary<byte, char> table, int first, int last)
    {
        // The counter is an int so that a range ending at 255 terminates instead of wrapping a byte.
        for (var code = first; code <= last; code++)
        {
            table.Add((byte)code, (char)code);
        }
    }
}
|
||||
}
|
||||
}
|
||||
|
||||
312
src/UglyToad.PdfPig.Core/PdfDocEncoding.cs
Normal file
312
src/UglyToad.PdfPig.Core/PdfDocEncoding.cs
Normal file
@ -0,0 +1,312 @@
|
||||
namespace UglyToad.PdfPig.Core
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// <para>
/// PDFDocEncoding, defined in the spec for text strings in PDF objects (but not content stream contents,
/// Type 1 font contents, etc).
/// </para>
/// <para>
/// Matches ASCII for code points 32 - 126. Codes 9, 10, 13, 161 - 172 and 174 - 255 also map to the Unicode
/// code point with the same numeric value; codes 24 - 31 and 128 - 160 map to the characters listed in the
/// table below. Every other code (0 - 8, 11, 12, 14 - 23, 127, 159 and 173) has no mapping.
/// </para>
/// </summary>
public static class PdfDocEncoding
{
    // Reverse lookup, filled from CodeToUnicode by the static constructor.
    private static readonly Dictionary<char, byte> UnicodeToCode = new Dictionary<char, byte>();

    // Only the codes whose character is listed explicitly; the ranges where the code equals the
    // Unicode value (32 - 126, 161 - 172, 174 - 255) are added by the static constructor.
    private static readonly Dictionary<byte, char> CodeToUnicode = new Dictionary<byte, char>
    {
        {9, '\u0009'},
        {10, '\u000A'},
        {13, '\u000D'},
        {24, '\u02D8'},
        {25, '\u02C7'},
        {26, '\u02C6'},
        {27, '\u02D9'},
        {28, '\u02DD'},
        {29, '\u02DB'},
        {30, '\u02DA'},
        {31, '\u02DC'},
        {128, '\u2022'},
        {129, '\u2020'},
        {130, '\u2021'},
        {131, '\u2026'},
        {132, '\u2014'},
        {133, '\u2013'},
        {134, '\u0192'},
        {135, '\u2044'},
        {136, '\u2039'},
        {137, '\u203A'},
        {138, '\u2212'},
        {139, '\u2030'},
        {140, '\u201E'},
        {141, '\u201C'},
        {142, '\u201D'},
        {143, '\u2018'},
        {144, '\u2019'},
        {145, '\u201A'},
        {146, '\u2122'},
        {147, '\uFB01'},
        {148, '\uFB02'},
        {149, '\u0141'},
        {150, '\u0152'},
        {151, '\u0160'},
        {152, '\u0178'},
        {153, '\u017D'},
        {154, '\u0131'},
        {155, '\u0142'},
        {156, '\u0153'},
        {157, '\u0161'},
        {158, '\u017E'},
        {160, '\u20AC'}
    };

    static PdfDocEncoding()
    {
        AddIdentityRange(32, 126);
        AddIdentityRange(161, 172);
        AddIdentityRange(174, 255);

        // Every value in the table is distinct, so Add cannot hit a duplicate key here.
        foreach (var pair in CodeToUnicode)
        {
            UnicodeToCode.Add(pair.Value, pair.Key);
        }
    }

    /// <summary>
    /// Try to decode raw PDFDocEncoding bytes to a string. If a byte with no mapping is encountered,
    /// meaning we cannot safely round-trip the value to bytes, this will instead return false.
    /// </summary>
    /// <param name="bytes">The bytes to decode; one byte per resulting character.</param>
    /// <param name="result">
    /// The decoded string (empty for empty input), or <see langword="null"/> when the method returns false.
    /// </param>
    /// <returns>Whether every byte had a mapping.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    public static bool TryConvertBytesToString(byte[] bytes, out string result)
    {
        if (bytes == null)
        {
            throw new System.ArgumentNullException(nameof(bytes));
        }

        result = null;

        if (bytes.Length == 0)
        {
            result = string.Empty;
            return true;
        }

        var decoded = new char[bytes.Length];

        for (var i = 0; i < bytes.Length; i++)
        {
            if (!CodeToUnicode.TryGetValue(bytes[i], out var c))
            {
                return false;
            }

            decoded[i] = c;
        }

        result = new string(decoded);
        return true;
    }

    /// <summary>
    /// Map from string back to bytes. This is not a reversible operation for all inputs.
    /// </summary>
    /// <param name="s">The string to encode; one byte is produced per UTF-16 code unit.</param>
    /// <returns>
    /// An array of the same length as <paramref name="s"/>. Characters with no PDFDocEncoding code
    /// are written as byte 0, which is why the operation is not reversible for all inputs.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="s"/> is null.</exception>
    public static byte[] StringToBytes(string s)
    {
        if (s == null)
        {
            throw new System.ArgumentNullException(nameof(s));
        }

        var result = new byte[s.Length];

        for (var i = 0; i < s.Length; i++)
        {
            // Unmapped characters keep the array's default value of 0.
            if (UnicodeToCode.TryGetValue(s[i], out var code))
            {
                result[i] = code;
            }
        }

        return result;
    }

    /// <summary>
    /// Adds an entry to <see cref="CodeToUnicode"/> for every code in [first, last], mapping the code
    /// to the Unicode code point of the same value.
    /// </summary>
    private static void AddIdentityRange(int first, int last)
    {
        // The counter is an int so that a range ending at 255 terminates instead of wrapping a byte.
        for (var code = first; code <= last; code++)
        {
            CodeToUnicode.Add((byte)code, (char)code);
        }
    }
}
|
||||
}
|
||||
@ -42,7 +42,7 @@
|
||||
inputBytes = new ByteArrayInputBytes(ascii);
|
||||
}
|
||||
|
||||
var scanner = new CoreTokenScanner(inputBytes);
|
||||
var scanner = new CoreTokenScanner(inputBytes, false);
|
||||
|
||||
if (!scanner.TryReadToken(out CommentToken comment) || !comment.Data.StartsWith("!"))
|
||||
{
|
||||
|
||||
@ -108,7 +108,7 @@
|
||||
{
|
||||
var parser = new CodespaceRangeParser();
|
||||
var byteArrayInput = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes("1 begincodespacerange\nendcodespacerange"));
|
||||
var tokenScanner = new CoreTokenScanner(byteArrayInput);
|
||||
var tokenScanner = new CoreTokenScanner(byteArrayInput, false);
|
||||
|
||||
Assert.True(tokenScanner.MoveNext());
|
||||
Assert.True(tokenScanner.CurrentToken is NumericToken);
|
||||
|
||||
@ -0,0 +1,38 @@
|
||||
namespace UglyToad.PdfPig.Tests.Integration;
|
||||
|
||||
using System.Linq;
|
||||
using Xunit;
|
||||
|
||||
/// <summary>
/// Integration test which reads bookmark titles containing accented characters from a sample document.
/// </summary>
public class AccentedCharactersInBookmarksTests
{
    /// <summary>
    /// Opens the sample document and checks the bookmark titles, in order, against the expected text.
    /// </summary>
    [Fact]
    public void CanReadAccentedBookmarksCorrectly()
    {
        var expectedTitles = new[]
        {
            "ž",
            "žč",
            "žđ",
            "žć",
            "žš",
            "ž ajklyghvbnmxcseqwuioprtzdf",
            "š",
            "šč",
            "šđ",
            "šć",
            "šž",
            "š ajklyghvbnmxcseqwuioprtzdf"
        };

        var documentPath = IntegrationHelpers.GetDocumentPath("bookmarks-with-accented-characters.pdf");

        using var document = PdfDocument.Open(documentPath);

        var hasBookmarks = document.TryGetBookmarks(out var bookmarks);

        Assert.True(hasBookmarks);

        var actualTitles = bookmarks.GetNodes()
            .Select(node => node.Title)
            .ToList();

        Assert.Equal(expectedTitles, actualTitles);
    }
}
|
||||
Binary file not shown.
@ -1,182 +1,182 @@
|
||||
namespace UglyToad.PdfPig.Tests.Parser.Parts
|
||||
{
|
||||
using System;
|
||||
using Logging;
|
||||
using PdfPig.Core;
|
||||
using PdfPig.Parser.FileStructure;
|
||||
using PdfPig.Tokenization.Scanner;
|
||||
using PdfPig.Tokens;
|
||||
using System.Linq;
|
||||
using Xunit;
|
||||
|
||||
public class FileHeaderParserTests
|
||||
{
|
||||
private readonly ILog log = new NoOpLog();
|
||||
[Fact]
|
||||
public void NullScannerThrows()
|
||||
{
|
||||
Action action = () => FileHeaderParser.Parse(null, null, false, log);
|
||||
|
||||
Assert.Throws<ArgumentNullException>(action);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData("PDF-1.0")]
|
||||
[InlineData("PDF-1.1")]
|
||||
[InlineData("PDF-1.7")]
|
||||
[InlineData("PDF-1.9")]
|
||||
[InlineData("FDF-1.0")]
|
||||
[InlineData("FDF-1.9")]
|
||||
public void ReadsConformingHeader(string format)
|
||||
{
|
||||
var input = $"%{format}\nany garbage";
|
||||
|
||||
var scanner = StringBytesTestConverter.Scanner(input);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(format, result.VersionString);
|
||||
Assert.Equal(0, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ReadsHeaderWithBlankSpaceBefore()
|
||||
{
|
||||
const string input = @"
|
||||
|
||||
%PDF-1.2";
|
||||
|
||||
var scanner = StringBytesTestConverter.Scanner(input);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(1.2m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void EmptyInputThrows()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner(string.Empty);
|
||||
|
||||
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HeaderPrecededByJunkNonLenientDoesNotThrow()
|
||||
namespace UglyToad.PdfPig.Tests.Parser.Parts
|
||||
{
|
||||
using System;
|
||||
using Logging;
|
||||
using PdfPig.Core;
|
||||
using PdfPig.Parser.FileStructure;
|
||||
using PdfPig.Tokenization.Scanner;
|
||||
using PdfPig.Tokens;
|
||||
using System.Linq;
|
||||
using Xunit;
|
||||
|
||||
public class FileHeaderParserTests
|
||||
{
|
||||
private readonly ILog log = new NoOpLog();
|
||||
[Fact]
|
||||
public void NullScannerThrows()
|
||||
{
|
||||
var input = @"one
|
||||
%PDF-1.2";
|
||||
var scanner = StringBytesTestConverter.Scanner(input);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(1.2m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HeaderPrecededByJunkLenientReads()
|
||||
Action action = () => FileHeaderParser.Parse(null, null, false, log);
|
||||
|
||||
Assert.Throws<ArgumentNullException>(action);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData("PDF-1.0")]
|
||||
[InlineData("PDF-1.1")]
|
||||
[InlineData("PDF-1.7")]
|
||||
[InlineData("PDF-1.9")]
|
||||
[InlineData("FDF-1.0")]
|
||||
[InlineData("FDF-1.9")]
|
||||
public void ReadsConformingHeader(string format)
|
||||
{
|
||||
var input = @"one
|
||||
%PDF-1.7";
|
||||
var scanner = StringBytesTestConverter.Scanner(input);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||
|
||||
Assert.Equal(1.7m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HeaderPrecededByJunkDoesNotThrow()
|
||||
var input = $"%{format}\nany garbage";
|
||||
|
||||
var scanner = StringBytesTestConverter.Scanner(input);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(format, result.VersionString);
|
||||
Assert.Equal(0, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ReadsHeaderWithBlankSpaceBefore()
|
||||
{
|
||||
var s = @"one two
|
||||
const string input = @"
|
||||
|
||||
%PDF-1.2";
|
||||
|
||||
var scanner = StringBytesTestConverter.Scanner(input);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(1.2m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void EmptyInputThrows()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner(string.Empty);
|
||||
|
||||
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HeaderPrecededByJunkNonLenientDoesNotThrow()
|
||||
{
|
||||
var input = @"one
|
||||
%PDF-1.2";
|
||||
var scanner = StringBytesTestConverter.Scanner(input);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(1.2m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HeaderPrecededByJunkLenientReads()
|
||||
{
|
||||
var input = @"one
|
||||
%PDF-1.7";
|
||||
var scanner = StringBytesTestConverter.Scanner(input);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||
|
||||
Assert.Equal(1.7m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HeaderPrecededByJunkDoesNotThrow()
|
||||
{
|
||||
var s = @"one two
|
||||
three %PDF-1.6";
|
||||
|
||||
var scanner = StringBytesTestConverter.Scanner(s);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||
|
||||
Assert.Equal(1.6m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 14 : 15, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void JunkThenEndThrows()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner(@"one two");
|
||||
|
||||
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void VersionFormatInvalidNotLenientThrows()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
|
||||
|
||||
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void VersionFormatInvalidLenientDefaults1Point4()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||
|
||||
Assert.Equal(1.4m, result.Version);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ParsingResetsPosition()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner(@"%FDF-1.6");
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(0, scanner.scanner.CurrentPosition);
|
||||
Assert.Equal(0, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Issue334()
|
||||
{
|
||||
var input = OtherEncodings.StringAsLatin1Bytes("%PDF-1.7\r\n%âãÏÓ\r\n1 0 obj\r\n<</Lang(en-US)>>\r\nendobj");
|
||||
|
||||
var bytes = new ByteArrayInputBytes(input);
|
||||
|
||||
var scanner = new CoreTokenScanner(bytes, ScannerScope.None);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner, bytes, false, log);
|
||||
|
||||
Assert.Equal(1.7m, result.Version);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Issue443()
|
||||
{
|
||||
const string hex =
|
||||
@"00 0F 4A 43 42 31 33 36 36 31 32 32 37 2E 70 64 66 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 50 44 46 20 43 41 52 4F 01 00 FF FF FF FF 00 00 00 00 00 04 DF 28 00 00 00 00 AF 51 7E 82 AF 52 D7 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 81 81 03 0D 00 00 25 50 44 46 2D 31 2E 31 0A 25 E2 E3 CF D3 0D 0A 31 20 30 20 6F 62 6A";
|
||||
|
||||
var bytes = hex.Split(' ', StringSplitOptions.RemoveEmptyEntries).Select(x => HexToken.Convert(x[0], x[1]));
|
||||
|
||||
var str = OtherEncodings.BytesAsLatin1String(bytes.ToArray());
|
||||
|
||||
var scanner = StringBytesTestConverter.Scanner(str);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(0, scanner.scanner.CurrentPosition);
|
||||
Assert.Equal(128, result.OffsetInFile);
|
||||
Assert.Equal(1.1m, result.Version);
|
||||
Assert.Equal("PDF-1.1", result.VersionString);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var scanner = StringBytesTestConverter.Scanner(s);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||
|
||||
Assert.Equal(1.6m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 14 : 15, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void JunkThenEndThrows()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner(@"one two");
|
||||
|
||||
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void VersionFormatInvalidNotLenientThrows()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
|
||||
|
||||
Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void VersionFormatInvalidLenientDefaults1Point4()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||
|
||||
Assert.Equal(1.4m, result.Version);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ParsingResetsPosition()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner(@"%FDF-1.6");
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(0, scanner.scanner.CurrentPosition);
|
||||
Assert.Equal(0, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Issue334()
|
||||
{
|
||||
var input = OtherEncodings.StringAsLatin1Bytes("%PDF-1.7\r\n%âãÏÓ\r\n1 0 obj\r\n<</Lang(en-US)>>\r\nendobj");
|
||||
|
||||
var bytes = new ByteArrayInputBytes(input);
|
||||
|
||||
var scanner = new CoreTokenScanner(bytes, true, ScannerScope.None);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner, bytes, false, log);
|
||||
|
||||
Assert.Equal(1.7m, result.Version);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Issue443()
|
||||
{
|
||||
const string hex =
|
||||
@"00 0F 4A 43 42 31 33 36 36 31 32 32 37 2E 70 64 66 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 50 44 46 20 43 41 52 4F 01 00 FF FF FF FF 00 00 00 00 00 04 DF 28 00 00 00 00 AF 51 7E 82 AF 52 D7 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 81 81 03 0D 00 00 25 50 44 46 2D 31 2E 31 0A 25 E2 E3 CF D3 0D 0A 31 20 30 20 6F 62 6A";
|
||||
|
||||
var bytes = hex.Split(' ', StringSplitOptions.RemoveEmptyEntries).Select(x => HexToken.Convert(x[0], x[1]));
|
||||
|
||||
var str = OtherEncodings.BytesAsLatin1String(bytes.ToArray());
|
||||
|
||||
var scanner = StringBytesTestConverter.Scanner(str);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(0, scanner.scanner.CurrentPosition);
|
||||
Assert.Equal(128, result.OffsetInFile);
|
||||
Assert.Equal(1.1m, result.Version);
|
||||
Assert.Equal("PDF-1.1", result.VersionString);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -23,7 +23,7 @@ startxref
|
||||
|
||||
%%EOF", false);
|
||||
|
||||
var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
|
||||
var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
|
||||
|
||||
Assert.Equal(456, result);
|
||||
}
|
||||
@ -49,7 +49,7 @@ startxref
|
||||
startxref
|
||||
17", false);
|
||||
|
||||
var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
|
||||
var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
|
||||
|
||||
Assert.Equal(17, result);
|
||||
}
|
||||
@ -75,7 +75,7 @@ startref
|
||||
start_rexf
|
||||
17", false);
|
||||
|
||||
Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
|
||||
Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
|
||||
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
@ -85,7 +85,7 @@ start_rexf
|
||||
{
|
||||
var input = StringBytesTestConverter.Convert("11 0 obj", false);
|
||||
|
||||
Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(input.Bytes), false);
|
||||
Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(input.Bytes, true), false);
|
||||
|
||||
Assert.Throws<ArgumentNullException>(action);
|
||||
}
|
||||
@ -111,7 +111,7 @@ startxref
|
||||
<< /Why (am i here?) >> 69
|
||||
%EOF", false);
|
||||
|
||||
Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
|
||||
Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
|
||||
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
@ -126,7 +126,7 @@ endobj
|
||||
startxref
|
||||
", false);
|
||||
|
||||
Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
|
||||
Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
|
||||
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
@ -152,7 +152,7 @@ startxref
|
||||
|
||||
%%EOF", false);
|
||||
|
||||
var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
|
||||
var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
|
||||
|
||||
Assert.Equal(1274665676543, result);
|
||||
}
|
||||
@ -166,7 +166,7 @@ startxref %Commented here
|
||||
|
||||
%%EOF", false);
|
||||
|
||||
var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
|
||||
var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
|
||||
|
||||
Assert.Equal(57695, result);
|
||||
}
|
||||
|
||||
@ -34,7 +34,7 @@
|
||||
internal static (CoreTokenScanner scanner, IInputBytes bytes) Scanner(string s)
|
||||
{
|
||||
var inputBytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s));
|
||||
var result = new CoreTokenScanner(inputBytes);
|
||||
var result = new CoreTokenScanner(inputBytes, true);
|
||||
|
||||
return (result, inputBytes);
|
||||
}
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
|
||||
public class ArrayTokenizerTests
|
||||
{
|
||||
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer();
|
||||
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true);
|
||||
|
||||
[Theory]
|
||||
[InlineData("]")]
|
||||
|
||||
@ -10,7 +10,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization
|
||||
|
||||
public class DictionaryTokenizerTests
|
||||
{
|
||||
private readonly DictionaryTokenizer tokenizer = new DictionaryTokenizer();
|
||||
private readonly DictionaryTokenizer tokenizer = new DictionaryTokenizer(true);
|
||||
|
||||
[Theory]
|
||||
[InlineData("[rjee]")]
|
||||
|
||||
@ -14,7 +14,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization.Scanner
|
||||
|
||||
public CoreTokenScannerTests()
|
||||
{
|
||||
scannerFactory = x => new CoreTokenScanner(x);
|
||||
scannerFactory = x => new CoreTokenScanner(x, true);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
|
||||
public class StringTokenizerTests
|
||||
{
|
||||
private readonly StringTokenizer tokenizer = new StringTokenizer();
|
||||
private readonly StringTokenizer tokenizer = new StringTokenizer(true);
|
||||
|
||||
[Fact]
|
||||
public void NullInput_ReturnsFalse()
|
||||
|
||||
@ -7,8 +7,15 @@
|
||||
|
||||
internal class ArrayTokenizer : ITokenizer
|
||||
{
|
||||
private readonly bool usePdfDocEncoding;
|
||||
|
||||
public bool ReadsNextByte { get; } = false;
|
||||
|
||||
public ArrayTokenizer(bool usePdfDocEncoding)
|
||||
{
|
||||
this.usePdfDocEncoding = usePdfDocEncoding;
|
||||
}
|
||||
|
||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
||||
{
|
||||
token = null;
|
||||
@ -18,7 +25,7 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
var scanner = new CoreTokenScanner(inputBytes, ScannerScope.Array);
|
||||
var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Array);
|
||||
|
||||
var contents = new List<IToken>();
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
|
||||
internal class DictionaryTokenizer : ITokenizer
|
||||
{
|
||||
private readonly bool usePdfDocEncoding;
|
||||
private readonly IReadOnlyList<NameToken> requiredKeys;
|
||||
|
||||
public bool ReadsNextByte { get; } = false;
|
||||
@ -14,12 +15,16 @@
|
||||
/// <summary>
|
||||
/// Create a new <see cref="DictionaryTokenizer"/>.
|
||||
/// </summary>
|
||||
/// <param name="usePdfDocEncoding">
|
||||
/// Whether to read strings using the PdfDocEncoding.
|
||||
/// </param>
|
||||
/// <param name="requiredKeys">
|
||||
/// Can be provided to recover from errors with missing dictionary end symbols if the
|
||||
/// set of keys expected in the dictionary are known.
|
||||
/// </param>
|
||||
public DictionaryTokenizer(IReadOnlyList<NameToken> requiredKeys = null)
|
||||
public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList<NameToken> requiredKeys = null)
|
||||
{
|
||||
this.usePdfDocEncoding = usePdfDocEncoding;
|
||||
this.requiredKeys = requiredKeys;
|
||||
}
|
||||
|
||||
@ -75,7 +80,7 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
var coreScanner = new CoreTokenScanner(inputBytes, ScannerScope.Dictionary);
|
||||
var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary);
|
||||
|
||||
var tokens = new List<IToken>();
|
||||
|
||||
|
||||
@ -10,9 +10,7 @@
|
||||
/// </summary>
|
||||
public class CoreTokenScanner : ISeekableTokenScanner
|
||||
{
|
||||
private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer();
|
||||
private static readonly CommentTokenizer CommentTokenizer = new CommentTokenizer();
|
||||
private static readonly DictionaryTokenizer DictionaryTokenizer = new DictionaryTokenizer();
|
||||
private static readonly HexTokenizer HexTokenizer = new HexTokenizer();
|
||||
private static readonly NameTokenizer NameTokenizer = new NameTokenizer();
|
||||
|
||||
@ -20,11 +18,14 @@
|
||||
// StringBuilder it re-uses.
|
||||
private readonly PlainTokenizer PlainTokenizer = new PlainTokenizer();
|
||||
private readonly NumericTokenizer NumericTokenizer = new NumericTokenizer();
|
||||
private readonly StringTokenizer StringTokenizer = new StringTokenizer();
|
||||
private readonly StringTokenizer stringTokenizer;
|
||||
private readonly ArrayTokenizer arrayTokenizer;
|
||||
private readonly DictionaryTokenizer dictionaryTokenizer;
|
||||
|
||||
private readonly ScannerScope scope;
|
||||
private readonly IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys;
|
||||
private readonly IInputBytes inputBytes;
|
||||
private readonly bool usePdfDocEncoding;
|
||||
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
|
||||
|
||||
/// <summary>
|
||||
@ -49,10 +50,15 @@
|
||||
/// </summary>
|
||||
public CoreTokenScanner(
|
||||
IInputBytes inputBytes,
|
||||
bool usePdfDocEncoding,
|
||||
ScannerScope scope = ScannerScope.None,
|
||||
IReadOnlyDictionary<NameToken, IReadOnlyList<NameToken>> namedDictionaryRequiredKeys = null)
|
||||
{
|
||||
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
|
||||
this.usePdfDocEncoding = usePdfDocEncoding;
|
||||
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
|
||||
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding);
|
||||
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding);
|
||||
this.scope = scope;
|
||||
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
|
||||
}
|
||||
@ -121,20 +127,20 @@
|
||||
switch (c)
|
||||
{
|
||||
case '(':
|
||||
tokenizer = StringTokenizer;
|
||||
tokenizer = stringTokenizer;
|
||||
break;
|
||||
case '<':
|
||||
var following = inputBytes.Peek();
|
||||
if (following == '<')
|
||||
{
|
||||
isSkippingSymbol = true;
|
||||
tokenizer = DictionaryTokenizer;
|
||||
tokenizer = dictionaryTokenizer;
|
||||
|
||||
if (namedDictionaryRequiredKeys != null
|
||||
&& CurrentToken is NameToken name
|
||||
&& namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys))
|
||||
{
|
||||
tokenizer = new DictionaryTokenizer(requiredKeys);
|
||||
tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -150,7 +156,7 @@
|
||||
}
|
||||
break;
|
||||
case '[':
|
||||
tokenizer = ArrayTokenizer;
|
||||
tokenizer = arrayTokenizer;
|
||||
break;
|
||||
case ']' when scope == ScannerScope.Array:
|
||||
return false;
|
||||
|
||||
@ -6,9 +6,17 @@
|
||||
|
||||
internal class StringTokenizer : ITokenizer
|
||||
{
|
||||
private readonly bool usePdfDocEncoding;
|
||||
|
||||
private readonly StringBuilder stringBuilder = new StringBuilder();
|
||||
|
||||
public bool ReadsNextByte { get; } = false;
|
||||
|
||||
public StringTokenizer(bool usePdfDocEncoding)
|
||||
{
|
||||
this.usePdfDocEncoding = usePdfDocEncoding;
|
||||
}
|
||||
|
||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
||||
{
|
||||
token = null;
|
||||
@ -164,6 +172,21 @@
|
||||
|
||||
encodedWith = StringToken.Encoding.Utf16;
|
||||
}
|
||||
else if (usePdfDocEncoding)
|
||||
{
|
||||
var builtStr = builder.ToString();
|
||||
var rawBytes = OtherEncodings.StringAsLatin1Bytes(builtStr);
|
||||
if (PdfDocEncoding.TryConvertBytesToString(rawBytes, out var str))
|
||||
{
|
||||
tokenStr = str;
|
||||
encodedWith = StringToken.Encoding.PdfDocEncoding;
|
||||
}
|
||||
else
|
||||
{
|
||||
tokenStr = builtStr;
|
||||
encodedWith = StringToken.Encoding.Iso88591;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
tokenStr = builder.ToString();
|
||||
@ -171,6 +194,21 @@
|
||||
encodedWith = StringToken.Encoding.Iso88591;
|
||||
}
|
||||
}
|
||||
else if (usePdfDocEncoding)
|
||||
{
|
||||
var builtStr = builder.ToString();
|
||||
var rawBytes = OtherEncodings.StringAsLatin1Bytes(builtStr);
|
||||
if (PdfDocEncoding.TryConvertBytesToString(rawBytes, out var str))
|
||||
{
|
||||
tokenStr = str;
|
||||
encodedWith = StringToken.Encoding.PdfDocEncoding;
|
||||
}
|
||||
else
|
||||
{
|
||||
tokenStr = builtStr;
|
||||
encodedWith = StringToken.Encoding.Iso88591;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
tokenStr = builder.ToString();
|
||||
|
||||
@ -53,6 +53,8 @@ namespace UglyToad.PdfPig.Tokens
|
||||
{
|
||||
return System.Text.Encoding.Unicode.GetBytes(Data);
|
||||
}
|
||||
case Encoding.PdfDocEncoding:
|
||||
return PdfDocEncoding.StringToBytes(Data);
|
||||
default:
|
||||
return OtherEncodings.StringAsLatin1Bytes(Data);
|
||||
}
|
||||
@ -96,7 +98,11 @@ namespace UglyToad.PdfPig.Tokens
|
||||
/// <summary>
|
||||
/// UTF-16 Big Endian.
|
||||
/// </summary>
|
||||
Utf16BE = 2
|
||||
Utf16BE = 2,
|
||||
/// <summary>
|
||||
/// The PdfDocEncoding for strings in the body of a PDF file.
|
||||
/// </summary>
|
||||
PdfDocEncoding = 3,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -21,10 +21,12 @@
|
||||
this.operationFactory = operationFactory;
|
||||
}
|
||||
|
||||
public IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes,
|
||||
public IReadOnlyList<IGraphicsStateOperation> Parse(
|
||||
int pageNumber,
|
||||
IInputBytes inputBytes,
|
||||
ILog log)
|
||||
{
|
||||
var scanner = new CoreTokenScanner(inputBytes);
|
||||
var scanner = new CoreTokenScanner(inputBytes, false);
|
||||
|
||||
var precedingTokens = new List<IToken>();
|
||||
var graphicsStateOperations = new List<IGraphicsStateOperation>();
|
||||
|
||||
@ -70,7 +70,7 @@
|
||||
{
|
||||
var isLenientParsing = options?.UseLenientParsing ?? true;
|
||||
|
||||
var tokenScanner = new CoreTokenScanner(inputBytes);
|
||||
var tokenScanner = new CoreTokenScanner(inputBytes, true);
|
||||
|
||||
var passwords = new List<string>();
|
||||
|
||||
|
||||
@ -22,6 +22,7 @@
|
||||
public CMap Parse(IInputBytes inputBytes)
|
||||
{
|
||||
var scanner = new CoreTokenScanner(inputBytes,
|
||||
false,
|
||||
namedDictionaryRequiredKeys: new Dictionary<NameToken, IReadOnlyList<NameToken>>
|
||||
{
|
||||
{ NameToken.CidSystemInfo, new[] { NameToken.Registry, NameToken.Ordering, NameToken.Supplement } }
|
||||
|
||||
@ -57,7 +57,7 @@
|
||||
this.objectLocationProvider = objectLocationProvider;
|
||||
this.filterProvider = filterProvider;
|
||||
this.encryptionHandler = encryptionHandler;
|
||||
coreTokenScanner = new CoreTokenScanner(inputBytes);
|
||||
coreTokenScanner = new CoreTokenScanner(inputBytes, true);
|
||||
}
|
||||
|
||||
public void UpdateEncryptionHandler(IEncryptionHandler newHandler)
|
||||
@ -797,7 +797,7 @@
|
||||
// Read the N integers
|
||||
var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider, this));
|
||||
|
||||
var scanner = new CoreTokenScanner(bytes);
|
||||
var scanner = new CoreTokenScanner(bytes, true);
|
||||
|
||||
var objects = new List<Tuple<long, long>>();
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user