diff --git a/src/UglyToad.PdfPig.Core/OtherEncodings.cs b/src/UglyToad.PdfPig.Core/OtherEncodings.cs
index c54cf4ad..2c4cdf1b 100644
--- a/src/UglyToad.PdfPig.Core/OtherEncodings.cs
+++ b/src/UglyToad.PdfPig.Core/OtherEncodings.cs
@@ -57,280 +57,5 @@
return Iso88591.GetString(bytes);
}
-
- ///
- /// The encoding for strings in a PDF file which encodes all of the ISO Latin 1 character set.
- ///
- public static class PdfDocEncoding
- {
- private static readonly IReadOnlyDictionary CodeToUnicode = new Dictionary
- {
- {0, '\u0000'},
- {1, '\u0001'},
- {2, '\u0002'},
- {3, '\u0003'},
- {4, '\u0004'},
- {5, '\u0005'},
- {6, '\u0006'},
- {7, '\u0007'},
- {8, '\u0008'},
- {9, '\u0009'},
- {10, '\u000A'},
- {11, '\u000B'},
- {12, '\u000C'},
- {13, '\u000D'},
- {14, '\u000E'},
- {15, '\u000F'},
- {16, '\u0010'},
- {17, '\u0011'},
- {18, '\u0012'},
- {19, '\u0013'},
- {20, '\u0014'},
- {21, '\u0015'},
- {22, '\u0017'},
- {23, '\u0017'},
- {24, '\u02D8'},
- {25, '\u02C7'},
- {26, '\u02C6'},
- {27, '\u02D9'},
- {28, '\u02DD'},
- {29, '\u02DB'},
- {30, '\u02DA'},
- {31, '\u02DC'},
- {32, '\u0020'},
- {33, '\u0021'},
- {34, '\u0022'},
- {35, '\u0023'},
- {36, '\u0024'},
- {37, '\u0025'},
- {38, '\u0026'},
- {39, '\u0027'},
- {40, '\u0028'},
- {41, '\u0029'},
- {42, '\u002A'},
- {43, '\u002B'},
- {44, '\u002C'},
- {45, '\u002D'},
- {46, '\u002E'},
- {47, '\u002F'},
- {48, '\u0030'},
- {49, '\u0031'},
- {50, '\u0032'},
- {51, '\u0033'},
- {52, '\u0034'},
- {53, '\u0035'},
- {54, '\u0036'},
- {55, '\u0037'},
- {56, '\u0038'},
- {57, '\u0039'},
- {58, '\u003A'},
- {59, '\u003B'},
- {60, '\u003C'},
- {61, '\u003D'},
- {62, '\u003E'},
- {63, '\u003F'},
- {64, '\u0040'},
- {65, '\u0041'},
- {66, '\u0042'},
- {67, '\u0043'},
- {68, '\u0044'},
- {69, '\u0045'},
- {70, '\u0046'},
- {71, '\u0047'},
- {72, '\u0048'},
- {73, '\u0049'},
- {74, '\u004A'},
- {75, '\u004B'},
- {76, '\u004C'},
- {77, '\u004D'},
- {78, '\u004E'},
- {79, '\u004F'},
- {80, '\u0050'},
- {81, '\u0051'},
- {82, '\u0052'},
- {83, '\u0053'},
- {84, '\u0054'},
- {85, '\u0055'},
- {86, '\u0056'},
- {87, '\u0057'},
- {88, '\u0058'},
- {89, '\u0059'},
- {90, '\u005A'},
- {91, '\u005B'},
- {92, '\u005C'},
- {93, '\u005D'},
- {94, '\u005E'},
- {95, '\u005F'},
- {96, '\u0060'},
- {97, '\u0061'},
- {98, '\u0062'},
- {99, '\u0063'},
- {100, '\u0064'},
- {101, '\u0065'},
- {102, '\u0066'},
- {103, '\u0067'},
- {104, '\u0068'},
- {105, '\u0069'},
- {106, '\u006A'},
- {107, '\u006B'},
- {108, '\u006C'},
- {109, '\u006D'},
- {110, '\u006E'},
- {111, '\u006F'},
- {112, '\u0070'},
- {113, '\u0071'},
- {114, '\u0072'},
- {115, '\u0073'},
- {116, '\u0074'},
- {117, '\u0075'},
- {118, '\u0076'},
- {119, '\u0077'},
- {120, '\u0078'},
- {121, '\u0079'},
- {122, '\u007A'},
- {123, '\u007B'},
- {124, '\u007C'},
- {125, '\u007D'},
- {126, '\u007E'},
- {128, '\u2022'},
- {129, '\u2020'},
- {130, '\u2021'},
- {131, '\u2026'},
- {132, '\u2014'},
- {133, '\u2013'},
- {134, '\u0192'},
- {135, '\u2044'},
- {136, '\u2039'},
- {137, '\u203A'},
- {138, '\u2212'},
- {139, '\u2030'},
- {140, '\u201E'},
- {141, '\u201C'},
- {142, '\u201D'},
- {143, '\u2018'},
- {144, '\u2019'},
- {145, '\u201A'},
- {146, '\u2122'},
- {147, '\uFB01'},
- {148, '\uFB02'},
- {149, '\u0141'},
- {150, '\u0152'},
- {151, '\u0160'},
- {152, '\u0178'},
- {153, '\u017D'},
- {154, '\u0131'},
- {155, '\u0142'},
- {156, '\u0153'},
- {157, '\u0161'},
- {158, '\u017E'},
- {160, '\u20AC'},
- {161, '\u00A1'},
- {162, '\u00A2'},
- {163, '\u00A3'},
- {164, '\u00A4'},
- {165, '\u00A5'},
- {166, '\u00A6'},
- {167, '\u00A7'},
- {168, '\u00A8'},
- {169, '\u00A9'},
- {170, '\u00AA'},
- {171, '\u00AB'},
- {172, '\u00AC'},
- {174, '\u00AE'},
- {175, '\u00AF'},
- {176, '\u00B0'},
- {177, '\u00B1'},
- {178, '\u00B2'},
- {179, '\u00B3'},
- {180, '\u00B4'},
- {181, '\u00B5'},
- {182, '\u00B6'},
- {183, '\u00B7'},
- {184, '\u00B8'},
- {185, '\u00B9'},
- {186, '\u00BA'},
- {187, '\u00BB'},
- {188, '\u00BC'},
- {189, '\u00BD'},
- {190, '\u00BE'},
- {191, '\u00BF'},
- {192, '\u00C0'},
- {193, '\u00C1'},
- {194, '\u00C2'},
- {195, '\u00C3'},
- {196, '\u00C4'},
- {197, '\u00C5'},
- {198, '\u00C6'},
- {199, '\u00C7'},
- {200, '\u00C8'},
- {201, '\u00C9'},
- {202, '\u00CA'},
- {203, '\u00CB'},
- {204, '\u00CC'},
- {205, '\u00CD'},
- {206, '\u00CE'},
- {207, '\u00CF'},
- {208, '\u00D0'},
- {209, '\u00D1'},
- {210, '\u00D2'},
- {211, '\u00D3'},
- {212, '\u00D4'},
- {213, '\u00D5'},
- {214, '\u00D6'},
- {215, '\u00D7'},
- {216, '\u00D8'},
- {217, '\u00D9'},
- {218, '\u00DA'},
- {219, '\u00DB'},
- {220, '\u00DC'},
- {221, '\u00DD'},
- {222, '\u00DE'},
- {223, '\u00DF'},
- {224, '\u00E0'},
- {225, '\u00E1'},
- {226, '\u00E2'},
- {227, '\u00E3'},
- {228, '\u00E4'},
- {229, '\u00E5'},
- {230, '\u00E6'},
- {231, '\u00E7'},
- {232, '\u00E8'},
- {233, '\u00E9'},
- {234, '\u00EA'},
- {235, '\u00EB'},
- {236, '\u00EC'},
- {237, '\u00ED'},
- {238, '\u00EE'},
- {239, '\u00EF'},
- {240, '\u00F0'},
- {241, '\u00F1'},
- {242, '\u00F2'},
- {243, '\u00F3'},
- {244, '\u00F4'},
- {245, '\u00F5'},
- {246, '\u00F6'},
- {247, '\u00F7'},
- {248, '\u00F8'},
- {249, '\u00F9'},
- {250, '\u00FA'},
- {251, '\u00FB'},
- {252, '\u00FC'},
- {253, '\u00FD'},
- {254, '\u00FE'},
- {255, '\u00FF'},
- // Undefined codes follow.
- {127, '\uFFFD'},
- {159, '\uFFFD'},
- {173, '\uFFFD'}
- };
-
- ///
- /// Whether the PDF Doc Encoding contains a corresponding character.
- ///
- public static bool ContainsChar(char c)
- {
- return CodeToUnicode.Any(x => x.Value == c);
- }
- }
}
}
diff --git a/src/UglyToad.PdfPig.Core/PdfDocEncoding.cs b/src/UglyToad.PdfPig.Core/PdfDocEncoding.cs
new file mode 100644
index 00000000..e58ad0f7
--- /dev/null
+++ b/src/UglyToad.PdfPig.Core/PdfDocEncoding.cs
@@ -0,0 +1,312 @@
+namespace UglyToad.PdfPig.Core
+{
+ using System.Collections.Generic;
+
+ /// <summary>
+ /// <para>
+ /// PDFDocEncoding, defined in the spec for text strings in PDF objects (but not content stream contents,
+ /// Type 1 font contents, etc).
+ /// </para>
+ /// <para>
+ /// Matches ASCII for code points 32 - 126.
+ /// </para>
+ /// </summary>
+ public static class PdfDocEncoding
+ {
+ private static readonly Dictionary<char, byte> UnicodeToCode = new Dictionary<char, byte>(); // reverse lookup (character -> code), filled by the static constructor
+ private static readonly Dictionary<byte, char> CodeToUnicode = new Dictionary<byte, char> // forward lookup (code -> character); codes 0-8, 11, 12, 14-23, 127, 159 and 173 have no entry
+ {
+ {9, '\u0009'}, // horizontal tab
+ {10, '\u000A'}, // line feed
+ {13, '\u000D'}, // carriage return
+ {24, '\u02D8'}, // breve
+ {25, '\u02C7'}, // caron
+ {26, '\u02C6'}, // modifier letter circumflex accent
+ {27, '\u02D9'}, // dot above
+ {28, '\u02DD'}, // double acute accent
+ {29, '\u02DB'}, // ogonek
+ {30, '\u02DA'}, // ring above
+ {31, '\u02DC'}, // small tilde
+ {32, '\u0020'}, // codes 32-126 map to the identical ASCII code point
+ {33, '\u0021'},
+ {34, '\u0022'},
+ {35, '\u0023'},
+ {36, '\u0024'},
+ {37, '\u0025'},
+ {38, '\u0026'},
+ {39, '\u0027'},
+ {40, '\u0028'},
+ {41, '\u0029'},
+ {42, '\u002A'},
+ {43, '\u002B'},
+ {44, '\u002C'},
+ {45, '\u002D'},
+ {46, '\u002E'},
+ {47, '\u002F'},
+ {48, '\u0030'},
+ {49, '\u0031'},
+ {50, '\u0032'},
+ {51, '\u0033'},
+ {52, '\u0034'},
+ {53, '\u0035'},
+ {54, '\u0036'},
+ {55, '\u0037'},
+ {56, '\u0038'},
+ {57, '\u0039'},
+ {58, '\u003A'},
+ {59, '\u003B'},
+ {60, '\u003C'},
+ {61, '\u003D'},
+ {62, '\u003E'},
+ {63, '\u003F'},
+ {64, '\u0040'},
+ {65, '\u0041'},
+ {66, '\u0042'},
+ {67, '\u0043'},
+ {68, '\u0044'},
+ {69, '\u0045'},
+ {70, '\u0046'},
+ {71, '\u0047'},
+ {72, '\u0048'},
+ {73, '\u0049'},
+ {74, '\u004A'},
+ {75, '\u004B'},
+ {76, '\u004C'},
+ {77, '\u004D'},
+ {78, '\u004E'},
+ {79, '\u004F'},
+ {80, '\u0050'},
+ {81, '\u0051'},
+ {82, '\u0052'},
+ {83, '\u0053'},
+ {84, '\u0054'},
+ {85, '\u0055'},
+ {86, '\u0056'},
+ {87, '\u0057'},
+ {88, '\u0058'},
+ {89, '\u0059'},
+ {90, '\u005A'},
+ {91, '\u005B'},
+ {92, '\u005C'},
+ {93, '\u005D'},
+ {94, '\u005E'},
+ {95, '\u005F'},
+ {96, '\u0060'},
+ {97, '\u0061'},
+ {98, '\u0062'},
+ {99, '\u0063'},
+ {100, '\u0064'},
+ {101, '\u0065'},
+ {102, '\u0066'},
+ {103, '\u0067'},
+ {104, '\u0068'},
+ {105, '\u0069'},
+ {106, '\u006A'},
+ {107, '\u006B'},
+ {108, '\u006C'},
+ {109, '\u006D'},
+ {110, '\u006E'},
+ {111, '\u006F'},
+ {112, '\u0070'},
+ {113, '\u0071'},
+ {114, '\u0072'},
+ {115, '\u0073'},
+ {116, '\u0074'},
+ {117, '\u0075'},
+ {118, '\u0076'},
+ {119, '\u0077'},
+ {120, '\u0078'},
+ {121, '\u0079'},
+ {122, '\u007A'},
+ {123, '\u007B'},
+ {124, '\u007C'},
+ {125, '\u007D'},
+ {126, '\u007E'},
+ {128, '\u2022'}, // bullet
+ {129, '\u2020'}, // dagger
+ {130, '\u2021'}, // double dagger
+ {131, '\u2026'}, // horizontal ellipsis
+ {132, '\u2014'}, // em dash
+ {133, '\u2013'}, // en dash
+ {134, '\u0192'}, // f with hook (florin)
+ {135, '\u2044'}, // fraction slash
+ {136, '\u2039'}, // single left-pointing angle quotation mark
+ {137, '\u203A'}, // single right-pointing angle quotation mark
+ {138, '\u2212'}, // minus sign
+ {139, '\u2030'}, // per mille sign
+ {140, '\u201E'}, // double low-9 quotation mark
+ {141, '\u201C'}, // left double quotation mark
+ {142, '\u201D'}, // right double quotation mark
+ {143, '\u2018'}, // left single quotation mark
+ {144, '\u2019'}, // right single quotation mark
+ {145, '\u201A'}, // single low-9 quotation mark
+ {146, '\u2122'}, // trade mark sign
+ {147, '\uFB01'}, // fi ligature
+ {148, '\uFB02'}, // fl ligature
+ {149, '\u0141'}, // L with stroke
+ {150, '\u0152'}, // OE ligature
+ {151, '\u0160'}, // S with caron
+ {152, '\u0178'}, // Y with diaeresis
+ {153, '\u017D'}, // Z with caron
+ {154, '\u0131'}, // dotless i
+ {155, '\u0142'}, // l with stroke
+ {156, '\u0153'}, // oe ligature
+ {157, '\u0161'}, // s with caron
+ {158, '\u017E'}, // z with caron
+ {160, '\u20AC'}, // euro sign
+ {161, '\u00A1'}, // codes 161-255 (173 is absent) map to the identical Latin-1 code point
+ {162, '\u00A2'},
+ {163, '\u00A3'},
+ {164, '\u00A4'},
+ {165, '\u00A5'},
+ {166, '\u00A6'},
+ {167, '\u00A7'},
+ {168, '\u00A8'},
+ {169, '\u00A9'},
+ {170, '\u00AA'},
+ {171, '\u00AB'},
+ {172, '\u00AC'},
+ {174, '\u00AE'},
+ {175, '\u00AF'},
+ {176, '\u00B0'},
+ {177, '\u00B1'},
+ {178, '\u00B2'},
+ {179, '\u00B3'},
+ {180, '\u00B4'},
+ {181, '\u00B5'},
+ {182, '\u00B6'},
+ {183, '\u00B7'},
+ {184, '\u00B8'},
+ {185, '\u00B9'},
+ {186, '\u00BA'},
+ {187, '\u00BB'},
+ {188, '\u00BC'},
+ {189, '\u00BD'},
+ {190, '\u00BE'},
+ {191, '\u00BF'},
+ {192, '\u00C0'},
+ {193, '\u00C1'},
+ {194, '\u00C2'},
+ {195, '\u00C3'},
+ {196, '\u00C4'},
+ {197, '\u00C5'},
+ {198, '\u00C6'},
+ {199, '\u00C7'},
+ {200, '\u00C8'},
+ {201, '\u00C9'},
+ {202, '\u00CA'},
+ {203, '\u00CB'},
+ {204, '\u00CC'},
+ {205, '\u00CD'},
+ {206, '\u00CE'},
+ {207, '\u00CF'},
+ {208, '\u00D0'},
+ {209, '\u00D1'},
+ {210, '\u00D2'},
+ {211, '\u00D3'},
+ {212, '\u00D4'},
+ {213, '\u00D5'},
+ {214, '\u00D6'},
+ {215, '\u00D7'},
+ {216, '\u00D8'},
+ {217, '\u00D9'},
+ {218, '\u00DA'},
+ {219, '\u00DB'},
+ {220, '\u00DC'},
+ {221, '\u00DD'},
+ {222, '\u00DE'},
+ {223, '\u00DF'},
+ {224, '\u00E0'},
+ {225, '\u00E1'},
+ {226, '\u00E2'},
+ {227, '\u00E3'},
+ {228, '\u00E4'},
+ {229, '\u00E5'},
+ {230, '\u00E6'},
+ {231, '\u00E7'},
+ {232, '\u00E8'},
+ {233, '\u00E9'},
+ {234, '\u00EA'},
+ {235, '\u00EB'},
+ {236, '\u00EC'},
+ {237, '\u00ED'},
+ {238, '\u00EE'},
+ {239, '\u00EF'},
+ {240, '\u00F0'},
+ {241, '\u00F1'},
+ {242, '\u00F2'},
+ {243, '\u00F3'},
+ {244, '\u00F4'},
+ {245, '\u00F5'},
+ {246, '\u00F6'},
+ {247, '\u00F7'},
+ {248, '\u00F8'},
+ {249, '\u00F9'},
+ {250, '\u00FA'},
+ {251, '\u00FB'},
+ {252, '\u00FC'},
+ {253, '\u00FD'},
+ {254, '\u00FE'},
+ {255, '\u00FF'}
+ };
+
+ static PdfDocEncoding() // populates the reverse (character -> code) table from CodeToUnicode
+ {
+ foreach (var c in CodeToUnicode)
+ {
+ UnicodeToCode.Add(c.Value, c.Key); // Add throws on a repeated key, so every character in CodeToUnicode must be unique
+ }
+ }
+
+
+ /// <summary>
+ /// Try to decode PdfDocEncoding bytes into a string. If a byte with no defined character is encountered,
+ /// meaning we cannot safely round-trip the value to bytes, this returns false and the result is null.
+ /// </summary>
+ public static bool TryConvertBytesToString(byte[] bytes, out string result)
+ {
+ result = null; // stays null on the failure path below
+ if (bytes.Length == 0)
+ {
+ result = string.Empty;
+ return true;
+ }
+
+ var arr = new char[bytes.Length]; // one output character per input byte
+
+ for (var i = 0; i < bytes.Length; i++)
+ {
+ var b = bytes[i];
+
+ if (!CodeToUnicode.TryGetValue(b, out var c))
+ {
+ return false; // this byte has no entry in the table, so the whole input is rejected
+ }
+
+ arr[i] = c;
+ }
+
+ result = new string(arr);
+ return true;
+ }
+
+ /// <summary>
+ /// Map from string back to bytes. Characters without a PdfDocEncoding code are left as a zero byte, so this is not a reversible operation for all inputs.
+ /// </summary>
+ public static byte[] StringToBytes(string s)
+ {
+ var result = new byte[s.Length]; // zero-initialized, one byte per character of the input
+ for (int i = 0; i < s.Length; i++)
+ {
+ var c = s[i];
+
+ if (UnicodeToCode.TryGetValue(c, out var b))
+ {
+ result[i] = b;
+ } // no else branch: an unmapped character leaves result[i] at 0
+ }
+
+ return result;
+ }
+ }
+}
diff --git a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs
index 38c6d22c..a6be5f62 100644
--- a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs
+++ b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs
@@ -42,7 +42,7 @@
inputBytes = new ByteArrayInputBytes(ascii);
}
- var scanner = new CoreTokenScanner(inputBytes);
+ var scanner = new CoreTokenScanner(inputBytes, false);
if (!scanner.TryReadToken(out CommentToken comment) || !comment.Data.StartsWith("!"))
{
diff --git a/src/UglyToad.PdfPig.Tests/Fonts/Cmap/CodespaceRangeTests.cs b/src/UglyToad.PdfPig.Tests/Fonts/Cmap/CodespaceRangeTests.cs
index 2aa35af5..48c6a730 100644
--- a/src/UglyToad.PdfPig.Tests/Fonts/Cmap/CodespaceRangeTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Fonts/Cmap/CodespaceRangeTests.cs
@@ -108,7 +108,7 @@
{
var parser = new CodespaceRangeParser();
var byteArrayInput = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes("1 begincodespacerange\nendcodespacerange"));
- var tokenScanner = new CoreTokenScanner(byteArrayInput);
+ var tokenScanner = new CoreTokenScanner(byteArrayInput, false);
Assert.True(tokenScanner.MoveNext());
Assert.True(tokenScanner.CurrentToken is NumericToken);
diff --git a/src/UglyToad.PdfPig.Tests/Integration/AccentedCharactersInBookmarksTests.cs b/src/UglyToad.PdfPig.Tests/Integration/AccentedCharactersInBookmarksTests.cs
new file mode 100644
index 00000000..2608e6c2
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Integration/AccentedCharactersInBookmarksTests.cs
@@ -0,0 +1,38 @@
+namespace UglyToad.PdfPig.Tests.Integration;
+
+using System.Linq;
+using Xunit;
+
+public class AccentedCharactersInBookmarksTests // integration test for bookmark titles containing accented letters
+{
+ [Fact]
+ public void CanReadAccentedBookmarksCorrectly() // opens the sample document and compares every bookmark title with the expected text
+ {
+ var path = IntegrationHelpers.GetDocumentPath("bookmarks-with-accented-characters.pdf");
+
+ using var document = PdfDocument.Open(path);
+
+ var isFound = document.TryGetBookmarks(out var bookmarks);
+
+ Assert.True(isFound);
+
+ var nodes = bookmarks.GetNodes().Select(x => x.Title).ToList(); // only the titles, in the order GetNodes() yields them
+
+ Assert.Equal(new[]
+ {
+ "ž",
+ "žč",
+ "žđ",
+ "žć",
+ "žš",
+ "ž ajklyghvbnmxcseqwuioprtzdf",
+ "š",
+ "šč",
+ "šđ",
+ "šć",
+ "šž",
+ "š ajklyghvbnmxcseqwuioprtzdf"
+ },
+ nodes);
+ }
+}
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/bookmarks-with-accented-characters.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/bookmarks-with-accented-characters.pdf
new file mode 100644
index 00000000..e03d8c42
Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/bookmarks-with-accented-characters.pdf differ
diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs
index 6a50d0c6..33fc5187 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs
@@ -1,182 +1,182 @@
-namespace UglyToad.PdfPig.Tests.Parser.Parts
-{
- using System;
- using Logging;
- using PdfPig.Core;
- using PdfPig.Parser.FileStructure;
- using PdfPig.Tokenization.Scanner;
- using PdfPig.Tokens;
- using System.Linq;
- using Xunit;
-
- public class FileHeaderParserTests
- {
- private readonly ILog log = new NoOpLog();
- [Fact]
- public void NullScannerThrows()
- {
- Action action = () => FileHeaderParser.Parse(null, null, false, log);
-
- Assert.Throws(action);
- }
-
- [Theory]
- [InlineData("PDF-1.0")]
- [InlineData("PDF-1.1")]
- [InlineData("PDF-1.7")]
- [InlineData("PDF-1.9")]
- [InlineData("FDF-1.0")]
- [InlineData("FDF-1.9")]
- public void ReadsConformingHeader(string format)
- {
- var input = $"%{format}\nany garbage";
-
- var scanner = StringBytesTestConverter.Scanner(input);
-
- var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
-
- Assert.Equal(format, result.VersionString);
- Assert.Equal(0, result.OffsetInFile);
- }
-
- [Fact]
- public void ReadsHeaderWithBlankSpaceBefore()
- {
- const string input = @"
-
-%PDF-1.2";
-
- var scanner = StringBytesTestConverter.Scanner(input);
-
- var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
-
- Assert.Equal(1.2m, result.Version);
- Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile);
- }
-
- [Fact]
- public void EmptyInputThrows()
- {
- var scanner = StringBytesTestConverter.Scanner(string.Empty);
-
- Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
-
- Assert.Throws(action);
- }
-
- [Fact]
- public void HeaderPrecededByJunkNonLenientDoesNotThrow()
+namespace UglyToad.PdfPig.Tests.Parser.Parts
+{
+ using System;
+ using Logging;
+ using PdfPig.Core;
+ using PdfPig.Parser.FileStructure;
+ using PdfPig.Tokenization.Scanner;
+ using PdfPig.Tokens;
+ using System.Linq;
+ using Xunit;
+
+ public class FileHeaderParserTests
+ {
+ private readonly ILog log = new NoOpLog();
+ [Fact]
+ public void NullScannerThrows()
{
- var input = @"one
- %PDF-1.2";
- var scanner = StringBytesTestConverter.Scanner(input);
-
- var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
-
- Assert.Equal(1.2m, result.Version);
- Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
- }
-
- [Fact]
- public void HeaderPrecededByJunkLenientReads()
+ Action action = () => FileHeaderParser.Parse(null, null, false, log);
+
+ Assert.Throws(action);
+ }
+
+ [Theory]
+ [InlineData("PDF-1.0")]
+ [InlineData("PDF-1.1")]
+ [InlineData("PDF-1.7")]
+ [InlineData("PDF-1.9")]
+ [InlineData("FDF-1.0")]
+ [InlineData("FDF-1.9")]
+ public void ReadsConformingHeader(string format)
{
- var input = @"one
- %PDF-1.7";
- var scanner = StringBytesTestConverter.Scanner(input);
-
- var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
-
- Assert.Equal(1.7m, result.Version);
- Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
- }
-
- [Fact]
- public void HeaderPrecededByJunkDoesNotThrow()
+ var input = $"%{format}\nany garbage";
+
+ var scanner = StringBytesTestConverter.Scanner(input);
+
+ var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
+
+ Assert.Equal(format, result.VersionString);
+ Assert.Equal(0, result.OffsetInFile);
+ }
+
+ [Fact]
+ public void ReadsHeaderWithBlankSpaceBefore()
{
- var s = @"one two
+ const string input = @"
+
+%PDF-1.2";
+
+ var scanner = StringBytesTestConverter.Scanner(input);
+
+ var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
+
+ Assert.Equal(1.2m, result.Version);
+ Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile);
+ }
+
+ [Fact]
+ public void EmptyInputThrows()
+ {
+ var scanner = StringBytesTestConverter.Scanner(string.Empty);
+
+ Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
+
+ Assert.Throws(action);
+ }
+
+ [Fact]
+ public void HeaderPrecededByJunkNonLenientDoesNotThrow()
+ {
+ var input = @"one
+ %PDF-1.2";
+ var scanner = StringBytesTestConverter.Scanner(input);
+
+ var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
+
+ Assert.Equal(1.2m, result.Version);
+ Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
+ }
+
+ [Fact]
+ public void HeaderPrecededByJunkLenientReads()
+ {
+ var input = @"one
+ %PDF-1.7";
+ var scanner = StringBytesTestConverter.Scanner(input);
+
+ var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
+
+ Assert.Equal(1.7m, result.Version);
+ Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
+ }
+
+ [Fact]
+ public void HeaderPrecededByJunkDoesNotThrow()
+ {
+ var s = @"one two
three %PDF-1.6";
-
- var scanner = StringBytesTestConverter.Scanner(s);
-
- var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
-
- Assert.Equal(1.6m, result.Version);
- Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 14 : 15, result.OffsetInFile);
- }
-
- [Fact]
- public void JunkThenEndThrows()
- {
- var scanner = StringBytesTestConverter.Scanner(@"one two");
-
- Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
-
- Assert.Throws(action);
- }
-
- [Fact]
- public void VersionFormatInvalidNotLenientThrows()
- {
- var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
-
- Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
-
- Assert.Throws(action);
- }
-
- [Fact]
- public void VersionFormatInvalidLenientDefaults1Point4()
- {
- var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
-
- var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
-
- Assert.Equal(1.4m, result.Version);
- }
-
- [Fact]
- public void ParsingResetsPosition()
- {
- var scanner = StringBytesTestConverter.Scanner(@"%FDF-1.6");
-
- var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
-
- Assert.Equal(0, scanner.scanner.CurrentPosition);
- Assert.Equal(0, result.OffsetInFile);
- }
-
- [Fact]
- public void Issue334()
- {
- var input = OtherEncodings.StringAsLatin1Bytes("%PDF-1.7\r\n%âãÏÓ\r\n1 0 obj\r\n<>\r\nendobj");
-
- var bytes = new ByteArrayInputBytes(input);
-
- var scanner = new CoreTokenScanner(bytes, ScannerScope.None);
-
- var result = FileHeaderParser.Parse(scanner, bytes, false, log);
-
- Assert.Equal(1.7m, result.Version);
- }
-
- [Fact]
- public void Issue443()
- {
- const string hex =
- @"00 0F 4A 43 42 31 33 36 36 31 32 32 37 2E 70 64 66 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 50 44 46 20 43 41 52 4F 01 00 FF FF FF FF 00 00 00 00 00 04 DF 28 00 00 00 00 AF 51 7E 82 AF 52 D7 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 81 81 03 0D 00 00 25 50 44 46 2D 31 2E 31 0A 25 E2 E3 CF D3 0D 0A 31 20 30 20 6F 62 6A";
-
- var bytes = hex.Split(' ', StringSplitOptions.RemoveEmptyEntries).Select(x => HexToken.Convert(x[0], x[1]));
-
- var str = OtherEncodings.BytesAsLatin1String(bytes.ToArray());
-
- var scanner = StringBytesTestConverter.Scanner(str);
-
- var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
-
- Assert.Equal(0, scanner.scanner.CurrentPosition);
- Assert.Equal(128, result.OffsetInFile);
- Assert.Equal(1.1m, result.Version);
- Assert.Equal("PDF-1.1", result.VersionString);
- }
- }
-}
+
+ var scanner = StringBytesTestConverter.Scanner(s);
+
+ var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
+
+ Assert.Equal(1.6m, result.Version);
+ Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 14 : 15, result.OffsetInFile);
+ }
+
+ [Fact]
+ public void JunkThenEndThrows()
+ {
+ var scanner = StringBytesTestConverter.Scanner(@"one two");
+
+ Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
+
+ Assert.Throws(action);
+ }
+
+ [Fact]
+ public void VersionFormatInvalidNotLenientThrows()
+ {
+ var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
+
+ Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
+
+ Assert.Throws(action);
+ }
+
+ [Fact]
+ public void VersionFormatInvalidLenientDefaults1Point4()
+ {
+ var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69");
+
+ var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
+
+ Assert.Equal(1.4m, result.Version);
+ }
+
+ [Fact]
+ public void ParsingResetsPosition()
+ {
+ var scanner = StringBytesTestConverter.Scanner(@"%FDF-1.6");
+
+ var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
+
+ Assert.Equal(0, scanner.scanner.CurrentPosition);
+ Assert.Equal(0, result.OffsetInFile);
+ }
+
+ [Fact]
+ public void Issue334()
+ {
+ var input = OtherEncodings.StringAsLatin1Bytes("%PDF-1.7\r\n%âãÏÓ\r\n1 0 obj\r\n<>\r\nendobj");
+
+ var bytes = new ByteArrayInputBytes(input);
+
+ var scanner = new CoreTokenScanner(bytes, true, ScannerScope.None);
+
+ var result = FileHeaderParser.Parse(scanner, bytes, false, log);
+
+ Assert.Equal(1.7m, result.Version);
+ }
+
+ [Fact]
+ public void Issue443()
+ {
+ const string hex =
+ @"00 0F 4A 43 42 31 33 36 36 31 32 32 37 2E 70 64 66 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 50 44 46 20 43 41 52 4F 01 00 FF FF FF FF 00 00 00 00 00 04 DF 28 00 00 00 00 AF 51 7E 82 AF 52 D7 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 81 81 03 0D 00 00 25 50 44 46 2D 31 2E 31 0A 25 E2 E3 CF D3 0D 0A 31 20 30 20 6F 62 6A";
+
+ var bytes = hex.Split(' ', StringSplitOptions.RemoveEmptyEntries).Select(x => HexToken.Convert(x[0], x[1]));
+
+ var str = OtherEncodings.BytesAsLatin1String(bytes.ToArray());
+
+ var scanner = StringBytesTestConverter.Scanner(str);
+
+ var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
+
+ Assert.Equal(0, scanner.scanner.CurrentPosition);
+ Assert.Equal(128, result.OffsetInFile);
+ Assert.Equal(1.1m, result.Version);
+ Assert.Equal("PDF-1.1", result.VersionString);
+ }
+ }
+}
diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileTrailerParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileTrailerParserTests.cs
index 39e52aa7..6c614113 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileTrailerParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileTrailerParserTests.cs
@@ -23,7 +23,7 @@ startxref
%%EOF", false);
- var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+ var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
Assert.Equal(456, result);
}
@@ -49,7 +49,7 @@ startxref
startxref
17", false);
- var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+ var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
Assert.Equal(17, result);
}
@@ -75,7 +75,7 @@ startref
start_rexf
17", false);
- Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+ Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
Assert.Throws(action);
}
@@ -85,7 +85,7 @@ start_rexf
{
var input = StringBytesTestConverter.Convert("11 0 obj", false);
- Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(input.Bytes), false);
+ Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(input.Bytes, true), false);
Assert.Throws(action);
}
@@ -111,7 +111,7 @@ startxref
<< /Why (am i here?) >> 69
%EOF", false);
- Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+ Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
Assert.Throws(action);
}
@@ -126,7 +126,7 @@ endobj
startxref
", false);
- Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+ Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
Assert.Throws(action);
}
@@ -152,7 +152,7 @@ startxref
%%EOF", false);
- var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+ var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
Assert.Equal(1274665676543, result);
}
@@ -166,7 +166,7 @@ startxref %Commented here
%%EOF", false);
- var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+ var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false);
Assert.Equal(57695, result);
}
diff --git a/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs b/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs
index 4fa77915..bf01026e 100644
--- a/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs
+++ b/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs
@@ -34,7 +34,7 @@
internal static (CoreTokenScanner scanner, IInputBytes bytes) Scanner(string s)
{
var inputBytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s));
- var result = new CoreTokenScanner(inputBytes);
+ var result = new CoreTokenScanner(inputBytes, true);
return (result, inputBytes);
}
diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs
index 43aec37a..d2bf729d 100644
--- a/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs
@@ -7,7 +7,7 @@
public class ArrayTokenizerTests
{
- private readonly ArrayTokenizer tokenizer = new ArrayTokenizer();
+ private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true);
[Theory]
[InlineData("]")]
diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs
index 5416d00f..9020b9f5 100644
--- a/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs
@@ -10,7 +10,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization
public class DictionaryTokenizerTests
{
- private readonly DictionaryTokenizer tokenizer = new DictionaryTokenizer();
+ private readonly DictionaryTokenizer tokenizer = new DictionaryTokenizer(true);
[Theory]
[InlineData("[rjee]")]
diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs
index 87e58af6..ba117ac6 100644
--- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs
@@ -14,7 +14,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization.Scanner
public CoreTokenScannerTests()
{
- scannerFactory = x => new CoreTokenScanner(x);
+ scannerFactory = x => new CoreTokenScanner(x, true);
}
[Fact]
diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs
index 58266a46..b04597a6 100644
--- a/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs
@@ -7,7 +7,7 @@
public class StringTokenizerTests
{
- private readonly StringTokenizer tokenizer = new StringTokenizer();
+ private readonly StringTokenizer tokenizer = new StringTokenizer(true);
[Fact]
public void NullInput_ReturnsFalse()
diff --git a/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs
index 4aa2412e..323202bd 100644
--- a/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs
@@ -7,8 +7,15 @@
internal class ArrayTokenizer : ITokenizer
{
+ private readonly bool usePdfDocEncoding;
+
public bool ReadsNextByte { get; } = false;
+ public ArrayTokenizer(bool usePdfDocEncoding) // the flag is handed to the CoreTokenScanner created in TryTokenize
+ {
+ this.usePdfDocEncoding = usePdfDocEncoding;
+ }
+
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
token = null;
@@ -18,7 +25,7 @@
return false;
}
- var scanner = new CoreTokenScanner(inputBytes, ScannerScope.Array);
+ var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Array);
var contents = new List();
diff --git a/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs
index b2296b50..e715db98 100644
--- a/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs
@@ -7,6 +7,7 @@
internal class DictionaryTokenizer : ITokenizer
{
+ private readonly bool usePdfDocEncoding;
private readonly IReadOnlyList requiredKeys;
public bool ReadsNextByte { get; } = false;
@@ -14,12 +15,16 @@
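/// <summary>
/// Create a new <see cref="DictionaryTokenizer"/>.
/// </summary>
+ /// <param name="usePdfDocEncoding">
+ /// Whether to read strings using the PdfDocEncoding.
+ /// </param>
/// <param name="requiredKeys">
/// Can be provided to recover from errors with missing dictionary end symbols if the
/// set of keys expected in the dictionary are known.
/// </param>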
- public DictionaryTokenizer(IReadOnlyList requiredKeys = null)
+ public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList requiredKeys = null)
{
+ this.usePdfDocEncoding = usePdfDocEncoding;
this.requiredKeys = requiredKeys;
}
@@ -75,7 +80,7 @@
return false;
}
- var coreScanner = new CoreTokenScanner(inputBytes, ScannerScope.Dictionary);
+ var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary);
var tokens = new List();
diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
index 9d33f834..5536d319 100644
--- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
+++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
@@ -10,9 +10,7 @@
/// </summary>
public class CoreTokenScanner : ISeekableTokenScanner
{
- private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer();
private static readonly CommentTokenizer CommentTokenizer = new CommentTokenizer();
- private static readonly DictionaryTokenizer DictionaryTokenizer = new DictionaryTokenizer();
private static readonly HexTokenizer HexTokenizer = new HexTokenizer();
private static readonly NameTokenizer NameTokenizer = new NameTokenizer();
@@ -20,11 +18,14 @@
// StringBuilder it re-uses.
private readonly PlainTokenizer PlainTokenizer = new PlainTokenizer();
private readonly NumericTokenizer NumericTokenizer = new NumericTokenizer();
- private readonly StringTokenizer StringTokenizer = new StringTokenizer();
+ private readonly StringTokenizer stringTokenizer;
+ private readonly ArrayTokenizer arrayTokenizer;
+ private readonly DictionaryTokenizer dictionaryTokenizer;
private readonly ScannerScope scope;
private readonly IReadOnlyDictionary> namedDictionaryRequiredKeys;
private readonly IInputBytes inputBytes;
+ private readonly bool usePdfDocEncoding;
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
///
@@ -49,10 +50,15 @@
///
public CoreTokenScanner(
IInputBytes inputBytes,
+ bool usePdfDocEncoding,
ScannerScope scope = ScannerScope.None,
IReadOnlyDictionary> namedDictionaryRequiredKeys = null)
{
this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
+ this.usePdfDocEncoding = usePdfDocEncoding;
+ this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
+ this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding);
+ this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding);
this.scope = scope;
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
}
@@ -121,20 +127,20 @@
switch (c)
{
case '(':
- tokenizer = StringTokenizer;
+ tokenizer = stringTokenizer;
break;
case '<':
var following = inputBytes.Peek();
if (following == '<')
{
isSkippingSymbol = true;
- tokenizer = DictionaryTokenizer;
+ tokenizer = dictionaryTokenizer;
if (namedDictionaryRequiredKeys != null
&& CurrentToken is NameToken name
&& namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys))
{
- tokenizer = new DictionaryTokenizer(requiredKeys);
+ tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys);
}
}
else
@@ -150,7 +156,7 @@
}
break;
case '[':
- tokenizer = ArrayTokenizer;
+ tokenizer = arrayTokenizer;
break;
case ']' when scope == ScannerScope.Array:
return false;
diff --git a/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs
index 6592dddd..45fa3a01 100644
--- a/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs
@@ -6,9 +6,17 @@
internal class StringTokenizer : ITokenizer
{
+ private readonly bool usePdfDocEncoding;
+
private readonly StringBuilder stringBuilder = new StringBuilder();
+
public bool ReadsNextByte { get; } = false;
+ public StringTokenizer(bool usePdfDocEncoding)
+ {
+ this.usePdfDocEncoding = usePdfDocEncoding;
+ }
+
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{
token = null;
@@ -164,6 +172,21 @@
encodedWith = StringToken.Encoding.Utf16;
}
+ else if (usePdfDocEncoding)
+ {
+ var builtStr = builder.ToString();
+ var rawBytes = OtherEncodings.StringAsLatin1Bytes(builtStr);
+ if (PdfDocEncoding.TryConvertBytesToString(rawBytes, out var str))
+ {
+ tokenStr = str;
+ encodedWith = StringToken.Encoding.PdfDocEncoding;
+ }
+ else
+ {
+ tokenStr = builtStr;
+ encodedWith = StringToken.Encoding.Iso88591;
+ }
+ }
else
{
tokenStr = builder.ToString();
@@ -171,6 +194,21 @@
encodedWith = StringToken.Encoding.Iso88591;
}
}
+ else if (usePdfDocEncoding)
+ {
+ var builtStr = builder.ToString();
+ var rawBytes = OtherEncodings.StringAsLatin1Bytes(builtStr);
+ if (PdfDocEncoding.TryConvertBytesToString(rawBytes, out var str))
+ {
+ tokenStr = str;
+ encodedWith = StringToken.Encoding.PdfDocEncoding;
+ }
+ else
+ {
+ tokenStr = builtStr;
+ encodedWith = StringToken.Encoding.Iso88591;
+ }
+ }
else
{
tokenStr = builder.ToString();
diff --git a/src/UglyToad.PdfPig.Tokens/StringToken.cs b/src/UglyToad.PdfPig.Tokens/StringToken.cs
index 2b633526..f8e7bbd2 100644
--- a/src/UglyToad.PdfPig.Tokens/StringToken.cs
+++ b/src/UglyToad.PdfPig.Tokens/StringToken.cs
@@ -53,6 +53,8 @@ namespace UglyToad.PdfPig.Tokens
{
return System.Text.Encoding.Unicode.GetBytes(Data);
}
+ case Encoding.PdfDocEncoding:
+ return PdfDocEncoding.StringToBytes(Data);
default:
return OtherEncodings.StringAsLatin1Bytes(Data);
}
@@ -96,7 +98,11 @@ namespace UglyToad.PdfPig.Tokens
/// <summary>
/// UTF-16 Big Endian.
/// </summary>
- Utf16BE = 2
+ Utf16BE = 2,
+ /// <summary>
+ /// The PdfDocEncoding for strings in the body of a PDF file.
+ /// </summary>
+ PdfDocEncoding = 3,
}
}
}
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig/Encryption/EncryptionHandler.cs b/src/UglyToad.PdfPig/Encryption/EncryptionHandler.cs
index fb9757f8..0251ac6c 100644
--- a/src/UglyToad.PdfPig/Encryption/EncryptionHandler.cs
+++ b/src/UglyToad.PdfPig/Encryption/EncryptionHandler.cs
@@ -64,6 +64,9 @@
case HexToken hex:
documentIdBytes = hex.Bytes.ToArray();
break;
+ case StringToken str:
+ documentIdBytes = str.GetBytes();
+ break;
default:
documentIdBytes = OtherEncodings.StringAsLatin1Bytes(token.Data);
break;
@@ -398,7 +401,7 @@
return token;
}
- var data = OtherEncodings.StringAsLatin1Bytes(stringToken.Data);
+ var data = stringToken.GetBytes();
var decrypted = DecryptData(data, reference);
diff --git a/src/UglyToad.PdfPig/Parser/PageContentParser.cs b/src/UglyToad.PdfPig/Parser/PageContentParser.cs
index e6a8e5f5..bd253e7b 100644
--- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs
+++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs
@@ -21,10 +21,12 @@
this.operationFactory = operationFactory;
}
- public IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes,
+ public IReadOnlyList Parse(
+ int pageNumber,
+ IInputBytes inputBytes,
ILog log)
{
- var scanner = new CoreTokenScanner(inputBytes);
+ var scanner = new CoreTokenScanner(inputBytes, false);
var precedingTokens = new List();
var graphicsStateOperations = new List();
diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
index f717cf50..92f30d15 100644
--- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
@@ -70,7 +70,7 @@
{
var isLenientParsing = options?.UseLenientParsing ?? true;
- var tokenScanner = new CoreTokenScanner(inputBytes);
+ var tokenScanner = new CoreTokenScanner(inputBytes, true);
var passwords = new List();
diff --git a/src/UglyToad.PdfPig/PdfFonts/Parser/CMapParser.cs b/src/UglyToad.PdfPig/PdfFonts/Parser/CMapParser.cs
index 6e097b27..35f97dfe 100644
--- a/src/UglyToad.PdfPig/PdfFonts/Parser/CMapParser.cs
+++ b/src/UglyToad.PdfPig/PdfFonts/Parser/CMapParser.cs
@@ -22,6 +22,7 @@
public CMap Parse(IInputBytes inputBytes)
{
var scanner = new CoreTokenScanner(inputBytes,
+ false,
namedDictionaryRequiredKeys: new Dictionary>
{
{ NameToken.CidSystemInfo, new[] { NameToken.Registry, NameToken.Ordering, NameToken.Supplement } }
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
index 2d5a570c..34ac8cde 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -57,7 +57,7 @@
this.objectLocationProvider = objectLocationProvider;
this.filterProvider = filterProvider;
this.encryptionHandler = encryptionHandler;
- coreTokenScanner = new CoreTokenScanner(inputBytes);
+ coreTokenScanner = new CoreTokenScanner(inputBytes, true);
}
public void UpdateEncryptionHandler(IEncryptionHandler newHandler)
@@ -797,7 +797,7 @@
// Read the N integers
var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider, this));
- var scanner = new CoreTokenScanner(bytes);
+ var scanner = new CoreTokenScanner(bytes, true);
var objects = new List>();