diff --git a/src/UglyToad.PdfPig.Core/OtherEncodings.cs b/src/UglyToad.PdfPig.Core/OtherEncodings.cs index c54cf4ad..2c4cdf1b 100644 --- a/src/UglyToad.PdfPig.Core/OtherEncodings.cs +++ b/src/UglyToad.PdfPig.Core/OtherEncodings.cs @@ -57,280 +57,5 @@ return Iso88591.GetString(bytes); } - - /// - /// The encoding for strings in a PDF file which encodes all of the ISO Latin 1 character set. - /// - public static class PdfDocEncoding - { - private static readonly IReadOnlyDictionary CodeToUnicode = new Dictionary - { - {0, '\u0000'}, - {1, '\u0001'}, - {2, '\u0002'}, - {3, '\u0003'}, - {4, '\u0004'}, - {5, '\u0005'}, - {6, '\u0006'}, - {7, '\u0007'}, - {8, '\u0008'}, - {9, '\u0009'}, - {10, '\u000A'}, - {11, '\u000B'}, - {12, '\u000C'}, - {13, '\u000D'}, - {14, '\u000E'}, - {15, '\u000F'}, - {16, '\u0010'}, - {17, '\u0011'}, - {18, '\u0012'}, - {19, '\u0013'}, - {20, '\u0014'}, - {21, '\u0015'}, - {22, '\u0017'}, - {23, '\u0017'}, - {24, '\u02D8'}, - {25, '\u02C7'}, - {26, '\u02C6'}, - {27, '\u02D9'}, - {28, '\u02DD'}, - {29, '\u02DB'}, - {30, '\u02DA'}, - {31, '\u02DC'}, - {32, '\u0020'}, - {33, '\u0021'}, - {34, '\u0022'}, - {35, '\u0023'}, - {36, '\u0024'}, - {37, '\u0025'}, - {38, '\u0026'}, - {39, '\u0027'}, - {40, '\u0028'}, - {41, '\u0029'}, - {42, '\u002A'}, - {43, '\u002B'}, - {44, '\u002C'}, - {45, '\u002D'}, - {46, '\u002E'}, - {47, '\u002F'}, - {48, '\u0030'}, - {49, '\u0031'}, - {50, '\u0032'}, - {51, '\u0033'}, - {52, '\u0034'}, - {53, '\u0035'}, - {54, '\u0036'}, - {55, '\u0037'}, - {56, '\u0038'}, - {57, '\u0039'}, - {58, '\u003A'}, - {59, '\u003B'}, - {60, '\u003C'}, - {61, '\u003D'}, - {62, '\u003E'}, - {63, '\u003F'}, - {64, '\u0040'}, - {65, '\u0041'}, - {66, '\u0042'}, - {67, '\u0043'}, - {68, '\u0044'}, - {69, '\u0045'}, - {70, '\u0046'}, - {71, '\u0047'}, - {72, '\u0048'}, - {73, '\u0049'}, - {74, '\u004A'}, - {75, '\u004B'}, - {76, '\u004C'}, - {77, '\u004D'}, - {78, '\u004E'}, - {79, '\u004F'}, - {80, '\u0050'}, - {81, '\u0051'}, - {82, '\u0052'}, - {83, '\u0053'}, - {84, '\u0054'}, - {85, '\u0055'}, - {86, '\u0056'}, - {87, '\u0057'}, - {88, '\u0058'}, - {89, '\u0059'}, - {90, '\u005A'}, - {91, '\u005B'}, - {92, '\u005C'}, - {93, '\u005D'}, - {94, '\u005E'}, - {95, '\u005F'}, - {96, '\u0060'}, - {97, '\u0061'}, - {98, '\u0062'}, - {99, '\u0063'}, - {100, '\u0064'}, - {101, '\u0065'}, - {102, '\u0066'}, - {103, '\u0067'}, - {104, '\u0068'}, - {105, '\u0069'}, - {106, '\u006A'}, - {107, '\u006B'}, - {108, '\u006C'}, - {109, '\u006D'}, - {110, '\u006E'}, - {111, '\u006F'}, - {112, '\u0070'}, - {113, '\u0071'}, - {114, '\u0072'}, - {115, '\u0073'}, - {116, '\u0074'}, - {117, '\u0075'}, - {118, '\u0076'}, - {119, '\u0077'}, - {120, '\u0078'}, - {121, '\u0079'}, - {122, '\u007A'}, - {123, '\u007B'}, - {124, '\u007C'}, - {125, '\u007D'}, - {126, '\u007E'}, - {128, '\u2022'}, - {129, '\u2020'}, - {130, '\u2021'}, - {131, '\u2026'}, - {132, '\u2014'}, - {133, '\u2013'}, - {134, '\u0192'}, - {135, '\u2044'}, - {136, '\u2039'}, - {137, '\u203A'}, - {138, '\u2212'}, - {139, '\u2030'}, - {140, '\u201E'}, - {141, '\u201C'}, - {142, '\u201D'}, - {143, '\u2018'}, - {144, '\u2019'}, - {145, '\u201A'}, - {146, '\u2122'}, - {147, '\uFB01'}, - {148, '\uFB02'}, - {149, '\u0141'}, - {150, '\u0152'}, - {151, '\u0160'}, - {152, '\u0178'}, - {153, '\u017D'}, - {154, '\u0131'}, - {155, '\u0142'}, - {156, '\u0153'}, - {157, '\u0161'}, - {158, '\u017E'}, - {160, '\u20AC'}, - {161, '\u00A1'}, - {162, '\u00A2'}, - {163, '\u00A3'}, - {164, '\u00A4'}, - {165, '\u00A5'}, - {166, '\u00A6'}, - {167, '\u00A7'}, - {168, '\u00A8'}, - {169, '\u00A9'}, - {170, '\u00AA'}, - {171, '\u00AB'}, - {172, '\u00AC'}, - {174, '\u00AE'}, - {175, '\u00AF'}, - {176, '\u00B0'}, - {177, '\u00B1'}, - {178, '\u00B2'}, - {179, '\u00B3'}, - {180, '\u00B4'}, - {181, '\u00B5'}, - {182, '\u00B6'}, - {183, '\u00B7'}, - {184, '\u00B8'}, - {185, '\u00B9'}, - {186, '\u00BA'}, - {187, '\u00BB'}, - {188, '\u00BC'}, - {189, '\u00BD'}, - {190, '\u00BE'}, - {191, '\u00BF'}, - {192, '\u00C0'}, - {193, '\u00C1'}, - {194, '\u00C2'}, - {195, '\u00C3'}, - {196, '\u00C4'}, - {197, '\u00C5'}, - {198, '\u00C6'}, - {199, '\u00C7'}, - {200, '\u00C8'}, - {201, '\u00C9'}, - {202, '\u00CA'}, - {203, '\u00CB'}, - {204, '\u00CC'}, - {205, '\u00CD'}, - {206, '\u00CE'}, - {207, '\u00CF'}, - {208, '\u00D0'}, - {209, '\u00D1'}, - {210, '\u00D2'}, - {211, '\u00D3'}, - {212, '\u00D4'}, - {213, '\u00D5'}, - {214, '\u00D6'}, - {215, '\u00D7'}, - {216, '\u00D8'}, - {217, '\u00D9'}, - {218, '\u00DA'}, - {219, '\u00DB'}, - {220, '\u00DC'}, - {221, '\u00DD'}, - {222, '\u00DE'}, - {223, '\u00DF'}, - {224, '\u00E0'}, - {225, '\u00E1'}, - {226, '\u00E2'}, - {227, '\u00E3'}, - {228, '\u00E4'}, - {229, '\u00E5'}, - {230, '\u00E6'}, - {231, '\u00E7'}, - {232, '\u00E8'}, - {233, '\u00E9'}, - {234, '\u00EA'}, - {235, '\u00EB'}, - {236, '\u00EC'}, - {237, '\u00ED'}, - {238, '\u00EE'}, - {239, '\u00EF'}, - {240, '\u00F0'}, - {241, '\u00F1'}, - {242, '\u00F2'}, - {243, '\u00F3'}, - {244, '\u00F4'}, - {245, '\u00F5'}, - {246, '\u00F6'}, - {247, '\u00F7'}, - {248, '\u00F8'}, - {249, '\u00F9'}, - {250, '\u00FA'}, - {251, '\u00FB'}, - {252, '\u00FC'}, - {253, '\u00FD'}, - {254, '\u00FE'}, - {255, '\u00FF'}, - // Undefined codes follow. - {127, '\uFFFD'}, - {159, '\uFFFD'}, - {173, '\uFFFD'} - }; - - /// - /// Whether the PDF Doc Encoding contains a corresponding character. - /// - public static bool ContainsChar(char c) - { - return CodeToUnicode.Any(x => x.Value == c); - } - } } } diff --git a/src/UglyToad.PdfPig.Core/PdfDocEncoding.cs b/src/UglyToad.PdfPig.Core/PdfDocEncoding.cs new file mode 100644 index 00000000..e58ad0f7 --- /dev/null +++ b/src/UglyToad.PdfPig.Core/PdfDocEncoding.cs @@ -0,0 +1,312 @@ +namespace UglyToad.PdfPig.Core +{ + using System.Collections.Generic; + + /// + /// + /// PDFDocEncoding, defined in the spec for text strings in PDF objects (but not content stream contents, + /// Type 1 font contents, etc). + /// + /// + /// Matches ASCII for code points 32 - 126. + /// + /// + public static class PdfDocEncoding + { + private static readonly Dictionary UnicodeToCode = new Dictionary(); + private static readonly Dictionary CodeToUnicode = new Dictionary + { + {9, '\u0009'}, + {10, '\u000A'}, + {13, '\u000D'}, + {24, '\u02D8'}, + {25, '\u02C7'}, + {26, '\u02C6'}, + {27, '\u02D9'}, + {28, '\u02DD'}, + {29, '\u02DB'}, + {30, '\u02DA'}, + {31, '\u02DC'}, + {32, '\u0020'}, + {33, '\u0021'}, + {34, '\u0022'}, + {35, '\u0023'}, + {36, '\u0024'}, + {37, '\u0025'}, + {38, '\u0026'}, + {39, '\u0027'}, + {40, '\u0028'}, + {41, '\u0029'}, + {42, '\u002A'}, + {43, '\u002B'}, + {44, '\u002C'}, + {45, '\u002D'}, + {46, '\u002E'}, + {47, '\u002F'}, + {48, '\u0030'}, + {49, '\u0031'}, + {50, '\u0032'}, + {51, '\u0033'}, + {52, '\u0034'}, + {53, '\u0035'}, + {54, '\u0036'}, + {55, '\u0037'}, + {56, '\u0038'}, + {57, '\u0039'}, + {58, '\u003A'}, + {59, '\u003B'}, + {60, '\u003C'}, + {61, '\u003D'}, + {62, '\u003E'}, + {63, '\u003F'}, + {64, '\u0040'}, + {65, '\u0041'}, + {66, '\u0042'}, + {67, '\u0043'}, + {68, '\u0044'}, + {69, '\u0045'}, + {70, '\u0046'}, + {71, '\u0047'}, + {72, '\u0048'}, + {73, '\u0049'}, + {74, '\u004A'}, + {75, '\u004B'}, + {76, '\u004C'}, + {77, '\u004D'}, + {78, '\u004E'}, + {79, '\u004F'}, + {80, '\u0050'}, + {81, '\u0051'}, + {82, '\u0052'}, + {83, '\u0053'}, + {84, '\u0054'}, + {85, '\u0055'}, + {86, '\u0056'}, + {87, '\u0057'}, + {88, '\u0058'}, + {89, '\u0059'}, + {90, '\u005A'}, + {91, '\u005B'}, + {92, '\u005C'}, + {93, '\u005D'}, + {94, '\u005E'}, + {95, '\u005F'}, + {96, '\u0060'}, + {97, '\u0061'}, + {98, '\u0062'}, + {99, '\u0063'}, + {100, '\u0064'}, + {101, '\u0065'}, + {102, '\u0066'}, + {103, '\u0067'}, + {104, '\u0068'}, + {105, '\u0069'}, + {106, '\u006A'}, + {107, '\u006B'}, + {108, '\u006C'}, + {109, '\u006D'}, + {110, '\u006E'}, + {111, '\u006F'}, + {112, '\u0070'}, + {113, '\u0071'}, + {114, '\u0072'}, + {115, '\u0073'}, + {116, '\u0074'}, + {117, '\u0075'}, + {118, '\u0076'}, + {119, '\u0077'}, + {120, '\u0078'}, + {121, '\u0079'}, + {122, '\u007A'}, + {123, '\u007B'}, + {124, '\u007C'}, + {125, '\u007D'}, + {126, '\u007E'}, + {128, '\u2022'}, + {129, '\u2020'}, + {130, '\u2021'}, + {131, '\u2026'}, + {132, '\u2014'}, + {133, '\u2013'}, + {134, '\u0192'}, + {135, '\u2044'}, + {136, '\u2039'}, + {137, '\u203A'}, + {138, '\u2212'}, + {139, '\u2030'}, + {140, '\u201E'}, + {141, '\u201C'}, + {142, '\u201D'}, + {143, '\u2018'}, + {144, '\u2019'}, + {145, '\u201A'}, + {146, '\u2122'}, + {147, '\uFB01'}, + {148, '\uFB02'}, + {149, '\u0141'}, + {150, '\u0152'}, + {151, '\u0160'}, + {152, '\u0178'}, + {153, '\u017D'}, + {154, '\u0131'}, + {155, '\u0142'}, + {156, '\u0153'}, + {157, '\u0161'}, + {158, '\u017E'}, + {160, '\u20AC'}, + {161, '\u00A1'}, + {162, '\u00A2'}, + {163, '\u00A3'}, + {164, '\u00A4'}, + {165, '\u00A5'}, + {166, '\u00A6'}, + {167, '\u00A7'}, + {168, '\u00A8'}, + {169, '\u00A9'}, + {170, '\u00AA'}, + {171, '\u00AB'}, + {172, '\u00AC'}, + {174, '\u00AE'}, + {175, '\u00AF'}, + {176, '\u00B0'}, + {177, '\u00B1'}, + {178, '\u00B2'}, + {179, '\u00B3'}, + {180, '\u00B4'}, + {181, '\u00B5'}, + {182, '\u00B6'}, + {183, '\u00B7'}, + {184, '\u00B8'}, + {185, '\u00B9'}, + {186, '\u00BA'}, + {187, '\u00BB'}, + {188, '\u00BC'}, + {189, '\u00BD'}, + {190, '\u00BE'}, + {191, '\u00BF'}, + {192, '\u00C0'}, + {193, '\u00C1'}, + {194, '\u00C2'}, + {195, '\u00C3'}, + {196, '\u00C4'}, + {197, '\u00C5'}, + {198, '\u00C6'}, + {199, '\u00C7'}, + {200, '\u00C8'}, + {201, '\u00C9'}, + {202, '\u00CA'}, + {203, '\u00CB'}, + {204, '\u00CC'}, + {205, '\u00CD'}, + {206, '\u00CE'}, + {207, '\u00CF'}, + {208, '\u00D0'}, + {209, '\u00D1'}, + {210, '\u00D2'}, + {211, '\u00D3'}, + {212, '\u00D4'}, + {213, '\u00D5'}, + {214, '\u00D6'}, + {215, '\u00D7'}, + {216, '\u00D8'}, + {217, '\u00D9'}, + {218, '\u00DA'}, + {219, '\u00DB'}, + {220, '\u00DC'}, + {221, '\u00DD'}, + {222, '\u00DE'}, + {223, '\u00DF'}, + {224, '\u00E0'}, + {225, '\u00E1'}, + {226, '\u00E2'}, + {227, '\u00E3'}, + {228, '\u00E4'}, + {229, '\u00E5'}, + {230, '\u00E6'}, + {231, '\u00E7'}, + {232, '\u00E8'}, + {233, '\u00E9'}, + {234, '\u00EA'}, + {235, '\u00EB'}, + {236, '\u00EC'}, + {237, '\u00ED'}, + {238, '\u00EE'}, + {239, '\u00EF'}, + {240, '\u00F0'}, + {241, '\u00F1'}, + {242, '\u00F2'}, + {243, '\u00F3'}, + {244, '\u00F4'}, + {245, '\u00F5'}, + {246, '\u00F6'}, + {247, '\u00F7'}, + {248, '\u00F8'}, + {249, '\u00F9'}, + {250, '\u00FA'}, + {251, '\u00FB'}, + {252, '\u00FC'}, + {253, '\u00FD'}, + {254, '\u00FE'}, + {255, '\u00FF'} + }; + + static PdfDocEncoding() + { + foreach (var c in CodeToUnicode) + { + UnicodeToCode.Add(c.Value, c.Key); + } + } + + + /// + /// Try to convert raw bytes to a PdfDocEncoding encoded string. If unsupported characters are encountered + /// meaning we cannot safely round-trip the value to bytes this will instead return false. + /// + public static bool TryConvertBytesToString(byte[] bytes, out string result) + { + result = null; + if (bytes.Length == 0) + { + result = string.Empty; + return true; + } + + var arr = new char[bytes.Length]; + + for (var i = 0; i < bytes.Length; i++) + { + var b = bytes[i]; + + if (!CodeToUnicode.TryGetValue(b, out var c)) + { + return false; + } + + arr[i] = c; + } + + result = new string(arr); + return true; + } + + /// + /// Map from string back to bytes. This is not a reversible operation for all inputs. + /// + public static byte[] StringToBytes(string s) + { + var result = new byte[s.Length]; + for (int i = 0; i < s.Length; i++) + { + var c = s[i]; + + if (UnicodeToCode.TryGetValue(c, out var b)) + { + result[i] = b; + } + } + + return result; + } + } +} diff --git a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs index 38c6d22c..a6be5f62 100644 --- a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs +++ b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs @@ -42,7 +42,7 @@ inputBytes = new ByteArrayInputBytes(ascii); } - var scanner = new CoreTokenScanner(inputBytes); + var scanner = new CoreTokenScanner(inputBytes, false); if (!scanner.TryReadToken(out CommentToken comment) || !comment.Data.StartsWith("!")) { diff --git a/src/UglyToad.PdfPig.Tests/Fonts/Cmap/CodespaceRangeTests.cs b/src/UglyToad.PdfPig.Tests/Fonts/Cmap/CodespaceRangeTests.cs index 2aa35af5..48c6a730 100644 --- a/src/UglyToad.PdfPig.Tests/Fonts/Cmap/CodespaceRangeTests.cs +++ b/src/UglyToad.PdfPig.Tests/Fonts/Cmap/CodespaceRangeTests.cs @@ -108,7 +108,7 @@ { var parser = new CodespaceRangeParser(); var byteArrayInput = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes("1 begincodespacerange\nendcodespacerange")); - var tokenScanner = new CoreTokenScanner(byteArrayInput); + var tokenScanner = new CoreTokenScanner(byteArrayInput, false); Assert.True(tokenScanner.MoveNext()); Assert.True(tokenScanner.CurrentToken is NumericToken); diff --git a/src/UglyToad.PdfPig.Tests/Integration/AccentedCharactersInBookmarksTests.cs b/src/UglyToad.PdfPig.Tests/Integration/AccentedCharactersInBookmarksTests.cs new file mode 100644 index 00000000..2608e6c2 --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/AccentedCharactersInBookmarksTests.cs @@ -0,0 +1,38 @@ +namespace UglyToad.PdfPig.Tests.Integration; + +using System.Linq; +using Xunit; + +public class AccentedCharactersInBookmarksTests +{ + [Fact] + public void CanReadAccentedBookmarksCorrectly() + { + var path = IntegrationHelpers.GetDocumentPath("bookmarks-with-accented-characters.pdf"); + + using var document = PdfDocument.Open(path); + + var isFound = document.TryGetBookmarks(out var bookmarks); + + Assert.True(isFound); + + var nodes = bookmarks.GetNodes().Select(x => x.Title).ToList(); + + Assert.Equal(new[] + { + "ž", + "žč", + "žđ", + "žć", + "žš", + "ž ajklyghvbnmxcseqwuioprtzdf", + "š", + "šč", + "šđ", + "šć", + "šž", + "š ajklyghvbnmxcseqwuioprtzdf" + }, + nodes); + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/bookmarks-with-accented-characters.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/bookmarks-with-accented-characters.pdf new file mode 100644 index 00000000..e03d8c42 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/bookmarks-with-accented-characters.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs index 6a50d0c6..33fc5187 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs @@ -1,182 +1,182 @@ -namespace UglyToad.PdfPig.Tests.Parser.Parts -{ - using System; - using Logging; - using PdfPig.Core; - using PdfPig.Parser.FileStructure; - using PdfPig.Tokenization.Scanner; - using PdfPig.Tokens; - using System.Linq; - using Xunit; - - public class FileHeaderParserTests - { - private readonly ILog log = new NoOpLog(); - [Fact] - public void NullScannerThrows() - { - Action action = () => FileHeaderParser.Parse(null, null, false, log); - - Assert.Throws(action); - } - - [Theory] - [InlineData("PDF-1.0")] - [InlineData("PDF-1.1")] - [InlineData("PDF-1.7")] - [InlineData("PDF-1.9")] - [InlineData("FDF-1.0")] - [InlineData("FDF-1.9")] - public void ReadsConformingHeader(string format) - { - var input = $"%{format}\nany garbage"; - - var scanner = StringBytesTestConverter.Scanner(input); - - var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); - - Assert.Equal(format, result.VersionString); - Assert.Equal(0, result.OffsetInFile); - } - - [Fact] - public void ReadsHeaderWithBlankSpaceBefore() - { - const string input = @" - -%PDF-1.2"; - - var scanner = StringBytesTestConverter.Scanner(input); - - var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); - - Assert.Equal(1.2m, result.Version); - Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile); - } - - [Fact] - public void EmptyInputThrows() - { - var scanner = StringBytesTestConverter.Scanner(string.Empty); - - Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); - - Assert.Throws(action); - } - - [Fact] - public void HeaderPrecededByJunkNonLenientDoesNotThrow() +namespace UglyToad.PdfPig.Tests.Parser.Parts +{ + using System; + using Logging; + using PdfPig.Core; + using PdfPig.Parser.FileStructure; + using PdfPig.Tokenization.Scanner; + using PdfPig.Tokens; + using System.Linq; + using Xunit; + + public class FileHeaderParserTests + { + private readonly ILog log = new NoOpLog(); + [Fact] + public void NullScannerThrows() { - var input = @"one - %PDF-1.2"; - var scanner = StringBytesTestConverter.Scanner(input); - - var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); - - Assert.Equal(1.2m, result.Version); - Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile); - } - - [Fact] - public void HeaderPrecededByJunkLenientReads() + Action action = () => FileHeaderParser.Parse(null, null, false, log); + + Assert.Throws(action); + } + + [Theory] + [InlineData("PDF-1.0")] + [InlineData("PDF-1.1")] + [InlineData("PDF-1.7")] + [InlineData("PDF-1.9")] + [InlineData("FDF-1.0")] + [InlineData("FDF-1.9")] + public void ReadsConformingHeader(string format) { - var input = @"one - %PDF-1.7"; - var scanner = StringBytesTestConverter.Scanner(input); - - var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); - - Assert.Equal(1.7m, result.Version); - Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile); - } - - [Fact] - public void HeaderPrecededByJunkDoesNotThrow() + var input = $"%{format}\nany garbage"; + + var scanner = StringBytesTestConverter.Scanner(input); + + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); + + Assert.Equal(format, result.VersionString); + Assert.Equal(0, result.OffsetInFile); + } + + [Fact] + public void ReadsHeaderWithBlankSpaceBefore() { - var s = @"one two + const string input = @" + +%PDF-1.2"; + + var scanner = StringBytesTestConverter.Scanner(input); + + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); + + Assert.Equal(1.2m, result.Version); + Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile); + } + + [Fact] + public void EmptyInputThrows() + { + var scanner = StringBytesTestConverter.Scanner(string.Empty); + + Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); + + Assert.Throws(action); + } + + [Fact] + public void HeaderPrecededByJunkNonLenientDoesNotThrow() + { + var input = @"one + %PDF-1.2"; + var scanner = StringBytesTestConverter.Scanner(input); + + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); + + Assert.Equal(1.2m, result.Version); + Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile); + } + + [Fact] + public void HeaderPrecededByJunkLenientReads() + { + var input = @"one + %PDF-1.7"; + var scanner = StringBytesTestConverter.Scanner(input); + + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); + + Assert.Equal(1.7m, result.Version); + Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile); + } + + [Fact] + public void HeaderPrecededByJunkDoesNotThrow() + { + var s = @"one two three %PDF-1.6"; - - var scanner = StringBytesTestConverter.Scanner(s); - - var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); - - Assert.Equal(1.6m, result.Version); - Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 14 : 15, result.OffsetInFile); - } - - [Fact] - public void JunkThenEndThrows() - { - var scanner = StringBytesTestConverter.Scanner(@"one two"); - - Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); - - Assert.Throws(action); - } - - [Fact] - public void VersionFormatInvalidNotLenientThrows() - { - var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69"); - - Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); - - Assert.Throws(action); - } - - [Fact] - public void VersionFormatInvalidLenientDefaults1Point4() - { - var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69"); - - var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); - - Assert.Equal(1.4m, result.Version); - } - - [Fact] - public void ParsingResetsPosition() - { - var scanner = StringBytesTestConverter.Scanner(@"%FDF-1.6"); - - var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); - - Assert.Equal(0, scanner.scanner.CurrentPosition); - Assert.Equal(0, result.OffsetInFile); - } - - [Fact] - public void Issue334() - { - var input = OtherEncodings.StringAsLatin1Bytes("%PDF-1.7\r\n%âãÏÓ\r\n1 0 obj\r\n<>\r\nendobj"); - - var bytes = new ByteArrayInputBytes(input); - - var scanner = new CoreTokenScanner(bytes, ScannerScope.None); - - var result = FileHeaderParser.Parse(scanner, bytes, false, log); - - Assert.Equal(1.7m, result.Version); - } - - [Fact] - public void Issue443() - { - const string hex = - @"00 0F 4A 43 42 31 33 36 36 31 32 32 37 2E 70 64 66 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 50 44 46 20 43 41 52 4F 01 00 FF FF FF FF 00 00 00 00 00 04 DF 28 00 00 00 00 AF 51 7E 82 AF 52 D7 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 81 81 03 0D 00 00 25 50 44 46 2D 31 2E 31 0A 25 E2 E3 CF D3 0D 0A 31 20 30 20 6F 62 6A"; - - var bytes = hex.Split(' ', StringSplitOptions.RemoveEmptyEntries).Select(x => HexToken.Convert(x[0], x[1])); - - var str = OtherEncodings.BytesAsLatin1String(bytes.ToArray()); - - var scanner = StringBytesTestConverter.Scanner(str); - - var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); - - Assert.Equal(0, scanner.scanner.CurrentPosition); - Assert.Equal(128, result.OffsetInFile); - Assert.Equal(1.1m, result.Version); - Assert.Equal("PDF-1.1", result.VersionString); - } - } -} + + var scanner = StringBytesTestConverter.Scanner(s); + + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); + + Assert.Equal(1.6m, result.Version); + Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 14 : 15, result.OffsetInFile); + } + + [Fact] + public void JunkThenEndThrows() + { + var scanner = StringBytesTestConverter.Scanner(@"one two"); + + Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); + + Assert.Throws(action); + } + + [Fact] + public void VersionFormatInvalidNotLenientThrows() + { + var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69"); + + Action action = () => FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); + + Assert.Throws(action); + } + + [Fact] + public void VersionFormatInvalidLenientDefaults1Point4() + { + var scanner = StringBytesTestConverter.Scanner("%Pdeef-1.69"); + + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); + + Assert.Equal(1.4m, result.Version); + } + + [Fact] + public void ParsingResetsPosition() + { + var scanner = StringBytesTestConverter.Scanner(@"%FDF-1.6"); + + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); + + Assert.Equal(0, scanner.scanner.CurrentPosition); + Assert.Equal(0, result.OffsetInFile); + } + + [Fact] + public void Issue334() + { + var input = OtherEncodings.StringAsLatin1Bytes("%PDF-1.7\r\n%âãÏÓ\r\n1 0 obj\r\n<>\r\nendobj"); + + var bytes = new ByteArrayInputBytes(input); + + var scanner = new CoreTokenScanner(bytes, true, ScannerScope.None); + + var result = FileHeaderParser.Parse(scanner, bytes, false, log); + + Assert.Equal(1.7m, result.Version); + } + + [Fact] + public void Issue443() + { + const string hex = + @"00 0F 4A 43 42 31 33 36 36 31 32 32 37 2E 70 64 66 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 50 44 46 20 43 41 52 4F 01 00 FF FF FF FF 00 00 00 00 00 04 DF 28 00 00 00 00 AF 51 7E 82 AF 52 D7 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 81 81 03 0D 00 00 25 50 44 46 2D 31 2E 31 0A 25 E2 E3 CF D3 0D 0A 31 20 30 20 6F 62 6A"; + + var bytes = hex.Split(' ', StringSplitOptions.RemoveEmptyEntries).Select(x => HexToken.Convert(x[0], x[1])); + + var str = OtherEncodings.BytesAsLatin1String(bytes.ToArray()); + + var scanner = StringBytesTestConverter.Scanner(str); + + var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); + + Assert.Equal(0, scanner.scanner.CurrentPosition); + Assert.Equal(128, result.OffsetInFile); + Assert.Equal(1.1m, result.Version); + Assert.Equal("PDF-1.1", result.VersionString); + } + } +} diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileTrailerParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileTrailerParserTests.cs index 39e52aa7..6c614113 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileTrailerParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileTrailerParserTests.cs @@ -23,7 +23,7 @@ startxref %%EOF", false); - var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false); Assert.Equal(456, result); } @@ -49,7 +49,7 @@ startxref startxref 17", false); - var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false); Assert.Equal(17, result); } @@ -75,7 +75,7 @@ startref start_rexf 17", false); - Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false); Assert.Throws(action); } @@ -85,7 +85,7 @@ start_rexf { var input = StringBytesTestConverter.Convert("11 0 obj", false); - Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(input.Bytes), false); + Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(input.Bytes, true), false); Assert.Throws(action); } @@ -111,7 +111,7 @@ startxref << /Why (am i here?) >> 69 %EOF", false); - Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false); Assert.Throws(action); } @@ -126,7 +126,7 @@ endobj startxref ", false); - Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + Action action = () => FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false); Assert.Throws(action); } @@ -152,7 +152,7 @@ startxref %%EOF", false); - var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false); Assert.Equal(1274665676543, result); } @@ -166,7 +166,7 @@ startxref %Commented here %%EOF", false); - var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + var result = FileTrailerParser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes, true), false); Assert.Equal(57695, result); } diff --git a/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs b/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs index 4fa77915..bf01026e 100644 --- a/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs +++ b/src/UglyToad.PdfPig.Tests/StringBytesTestConverter.cs @@ -34,7 +34,7 @@ internal static (CoreTokenScanner scanner, IInputBytes bytes) Scanner(string s) { var inputBytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s)); - var result = new CoreTokenScanner(inputBytes); + var result = new CoreTokenScanner(inputBytes, true); return (result, inputBytes); } diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs index 43aec37a..d2bf729d 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs @@ -7,7 +7,7 @@ public class ArrayTokenizerTests { - private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(); + private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true); [Theory] [InlineData("]")] diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs index 5416d00f..9020b9f5 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs @@ -10,7 +10,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization public class DictionaryTokenizerTests { - private readonly DictionaryTokenizer tokenizer = new DictionaryTokenizer(); + private readonly DictionaryTokenizer tokenizer = new DictionaryTokenizer(true); [Theory] [InlineData("[rjee]")] diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs index 87e58af6..ba117ac6 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/CoreTokenScannerTests.cs @@ -14,7 +14,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization.Scanner public CoreTokenScannerTests() { - scannerFactory = x => new CoreTokenScanner(x); + scannerFactory = x => new CoreTokenScanner(x, true); } [Fact] diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs index 58266a46..b04597a6 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/StringTokenizerTests.cs @@ -7,7 +7,7 @@ public class StringTokenizerTests { - private readonly StringTokenizer tokenizer = new StringTokenizer(); + private readonly StringTokenizer tokenizer = new StringTokenizer(true); [Fact] public void NullInput_ReturnsFalse() diff --git a/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs index 4aa2412e..323202bd 100644 --- a/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs @@ -7,8 +7,15 @@ internal class ArrayTokenizer : ITokenizer { + private readonly bool usePdfDocEncoding; + public bool ReadsNextByte { get; } = false; + public ArrayTokenizer(bool usePdfDocEncoding) + { + this.usePdfDocEncoding = usePdfDocEncoding; + } + public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) { token = null; @@ -18,7 +25,7 @@ return false; } - var scanner = new CoreTokenScanner(inputBytes, ScannerScope.Array); + var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Array); var contents = new List(); diff --git a/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs index b2296b50..e715db98 100644 --- a/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs @@ -7,6 +7,7 @@ internal class DictionaryTokenizer : ITokenizer { + private readonly bool usePdfDocEncoding; private readonly IReadOnlyList requiredKeys; public bool ReadsNextByte { get; } = false; @@ -14,12 +15,16 @@ /// /// Create a new . /// + /// + /// Whether to read strings using the PdfDocEncoding. + /// /// /// Can be provided to recover from errors with missing dictionary end symbols if the /// set of keys expected in the dictionary are known. /// - public DictionaryTokenizer(IReadOnlyList requiredKeys = null) + public DictionaryTokenizer(bool usePdfDocEncoding, IReadOnlyList requiredKeys = null) { + this.usePdfDocEncoding = usePdfDocEncoding; this.requiredKeys = requiredKeys; } @@ -75,7 +80,7 @@ return false; } - var coreScanner = new CoreTokenScanner(inputBytes, ScannerScope.Dictionary); + var coreScanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, ScannerScope.Dictionary); var tokens = new List(); diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs index 9d33f834..5536d319 100644 --- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs +++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs @@ -10,9 +10,7 @@ /// public class CoreTokenScanner : ISeekableTokenScanner { - private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer(); private static readonly CommentTokenizer CommentTokenizer = new CommentTokenizer(); - private static readonly DictionaryTokenizer DictionaryTokenizer = new DictionaryTokenizer(); private static readonly HexTokenizer HexTokenizer = new HexTokenizer(); private static readonly NameTokenizer NameTokenizer = new NameTokenizer(); @@ -20,11 +18,14 @@ // StringBuilder it re-uses. private readonly PlainTokenizer PlainTokenizer = new PlainTokenizer(); private readonly NumericTokenizer NumericTokenizer = new NumericTokenizer(); - private readonly StringTokenizer StringTokenizer = new StringTokenizer(); + private readonly StringTokenizer stringTokenizer; + private readonly ArrayTokenizer arrayTokenizer; + private readonly DictionaryTokenizer dictionaryTokenizer; private readonly ScannerScope scope; private readonly IReadOnlyDictionary> namedDictionaryRequiredKeys; private readonly IInputBytes inputBytes; + private readonly bool usePdfDocEncoding; private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>(); /// @@ -49,10 +50,15 @@ /// public CoreTokenScanner( IInputBytes inputBytes, + bool usePdfDocEncoding, ScannerScope scope = ScannerScope.None, IReadOnlyDictionary> namedDictionaryRequiredKeys = null) { this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes)); + this.usePdfDocEncoding = usePdfDocEncoding; + this.stringTokenizer = new StringTokenizer(usePdfDocEncoding); + this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding); + this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding); this.scope = scope; this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys; } @@ -121,20 +127,20 @@ switch (c) { case '(': - tokenizer = StringTokenizer; + tokenizer = stringTokenizer; break; case '<': var following = inputBytes.Peek(); if (following == '<') { isSkippingSymbol = true; - tokenizer = DictionaryTokenizer; + tokenizer = dictionaryTokenizer; if (namedDictionaryRequiredKeys != null && CurrentToken is NameToken name && namedDictionaryRequiredKeys.TryGetValue(name, out var requiredKeys)) { - tokenizer = new DictionaryTokenizer(requiredKeys); + tokenizer = new DictionaryTokenizer(usePdfDocEncoding, requiredKeys); } } else @@ -150,7 +156,7 @@ } break; case '[': - tokenizer = ArrayTokenizer; + tokenizer = arrayTokenizer; break; case ']' when scope == ScannerScope.Array: return false; diff --git a/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs index 6592dddd..45fa3a01 100644 --- a/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs @@ -6,9 +6,17 @@ internal class StringTokenizer : ITokenizer { + private readonly bool usePdfDocEncoding; + private readonly StringBuilder stringBuilder = new StringBuilder(); + public bool ReadsNextByte { get; } = false; + public StringTokenizer(bool usePdfDocEncoding) + { + this.usePdfDocEncoding = usePdfDocEncoding; + } + public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) { token = null; @@ -164,6 +172,21 @@ encodedWith = StringToken.Encoding.Utf16; } + else if (usePdfDocEncoding) + { + var builtStr = builder.ToString(); + var rawBytes = OtherEncodings.StringAsLatin1Bytes(builtStr); + if (PdfDocEncoding.TryConvertBytesToString(rawBytes, out var str)) + { + tokenStr = str; + encodedWith = StringToken.Encoding.PdfDocEncoding; + } + else + { + tokenStr = builtStr; + encodedWith = StringToken.Encoding.Iso88591; + } + } else { tokenStr = builder.ToString(); @@ -171,6 +194,21 @@ encodedWith = StringToken.Encoding.Iso88591; } } + else if (usePdfDocEncoding) + { + var builtStr = builder.ToString(); + var rawBytes = OtherEncodings.StringAsLatin1Bytes(builtStr); + if (PdfDocEncoding.TryConvertBytesToString(rawBytes, out var str)) + { + tokenStr = str; + encodedWith = StringToken.Encoding.PdfDocEncoding; + } + else + { + tokenStr = builtStr; + encodedWith = StringToken.Encoding.Iso88591; + } + } else { tokenStr = builder.ToString(); diff --git a/src/UglyToad.PdfPig.Tokens/StringToken.cs b/src/UglyToad.PdfPig.Tokens/StringToken.cs index 2b633526..f8e7bbd2 100644 --- a/src/UglyToad.PdfPig.Tokens/StringToken.cs +++ b/src/UglyToad.PdfPig.Tokens/StringToken.cs @@ -53,6 +53,8 @@ namespace UglyToad.PdfPig.Tokens { return System.Text.Encoding.Unicode.GetBytes(Data); } + case Encoding.PdfDocEncoding: + return PdfDocEncoding.StringToBytes(Data); default: return OtherEncodings.StringAsLatin1Bytes(Data); } @@ -96,7 +98,11 @@ namespace UglyToad.PdfPig.Tokens /// /// UTF-16 Big Endian. /// - Utf16BE = 2 + Utf16BE = 2, + /// + /// The PdfDocEncoding for strings in the body of a PDF file. + /// + PdfDocEncoding = 3, } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Encryption/EncryptionHandler.cs b/src/UglyToad.PdfPig/Encryption/EncryptionHandler.cs index fb9757f8..0251ac6c 100644 --- a/src/UglyToad.PdfPig/Encryption/EncryptionHandler.cs +++ b/src/UglyToad.PdfPig/Encryption/EncryptionHandler.cs @@ -64,6 +64,9 @@ case HexToken hex: documentIdBytes = hex.Bytes.ToArray(); break; + case StringToken str: + documentIdBytes = str.GetBytes(); + break; default: documentIdBytes = OtherEncodings.StringAsLatin1Bytes(token.Data); break; @@ -398,7 +401,7 @@ return token; } - var data = OtherEncodings.StringAsLatin1Bytes(stringToken.Data); + var data = stringToken.GetBytes(); var decrypted = DecryptData(data, reference); diff --git a/src/UglyToad.PdfPig/Parser/PageContentParser.cs b/src/UglyToad.PdfPig/Parser/PageContentParser.cs index e6a8e5f5..bd253e7b 100644 --- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs +++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs @@ -21,10 +21,12 @@ this.operationFactory = operationFactory; } - public IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes, + public IReadOnlyList Parse( + int pageNumber, + IInputBytes inputBytes, ILog log) { - var scanner = new CoreTokenScanner(inputBytes); + var scanner = new CoreTokenScanner(inputBytes, false); var precedingTokens = new List(); var graphicsStateOperations = new List(); diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs index f717cf50..92f30d15 100644 --- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs @@ -70,7 +70,7 @@ { var isLenientParsing = options?.UseLenientParsing ?? true; - var tokenScanner = new CoreTokenScanner(inputBytes); + var tokenScanner = new CoreTokenScanner(inputBytes, true); var passwords = new List(); diff --git a/src/UglyToad.PdfPig/PdfFonts/Parser/CMapParser.cs b/src/UglyToad.PdfPig/PdfFonts/Parser/CMapParser.cs index 6e097b27..35f97dfe 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Parser/CMapParser.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Parser/CMapParser.cs @@ -22,6 +22,7 @@ public CMap Parse(IInputBytes inputBytes) { var scanner = new CoreTokenScanner(inputBytes, + false, namedDictionaryRequiredKeys: new Dictionary> { { NameToken.CidSystemInfo, new[] { NameToken.Registry, NameToken.Ordering, NameToken.Supplement } } diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index 2d5a570c..34ac8cde 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -57,7 +57,7 @@ this.objectLocationProvider = objectLocationProvider; this.filterProvider = filterProvider; this.encryptionHandler = encryptionHandler; - coreTokenScanner = new CoreTokenScanner(inputBytes); + coreTokenScanner = new CoreTokenScanner(inputBytes, true); } public void UpdateEncryptionHandler(IEncryptionHandler newHandler) @@ -797,7 +797,7 @@ // Read the N integers var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider, this)); - var scanner = new CoreTokenScanner(bytes); + var scanner = new CoreTokenScanner(bytes, true); var objects = new List>();