From 81ab414c56076a2c7561a9a86e0b136cb811ee45 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Tue, 8 Oct 2019 15:53:42 +0100 Subject: [PATCH 1/2] add is supported flag to filters and add missing doc comment --- src/UglyToad.PdfPig/Filters/Ascii85Filter.cs | 5 +++++ .../Filters/AsciiHexDecodeFilter.cs | 8 ++++++++ .../Filters/CcittFaxDecodeFilter.cs | 4 ++++ src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs | 4 ++++ src/UglyToad.PdfPig/Filters/FlateFilter.cs | 10 +++++++++- src/UglyToad.PdfPig/Filters/IFilter.cs | 16 ++++++++++++++++ src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs | 4 ++++ src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs | 4 ++++ src/UglyToad.PdfPig/Filters/LzwFilter.cs | 9 +++++++++ src/UglyToad.PdfPig/Filters/RunLengthFilter.cs | 9 +++++++++ src/UglyToad.PdfPig/Geometry/PdfPath.cs | 4 ++++ 11 files changed, 76 insertions(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs b/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs index f115de88..db0b7728 100644 --- a/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs +++ b/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs @@ -5,6 +5,7 @@ using System.IO; using Tokens; + /// /// /// ASCII 85 (Base85) is a binary to text encoding using 5 ASCII characters per 4 bytes of data. /// @@ -24,6 +25,10 @@ 85 * 85 * 85 *85 }; + /// + public bool IsSupported { get; } = true; + + /// public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) { var asciiBuffer = new byte[5]; diff --git a/src/UglyToad.PdfPig/Filters/AsciiHexDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/AsciiHexDecodeFilter.cs index 92b96004..1c879bc5 100644 --- a/src/UglyToad.PdfPig/Filters/AsciiHexDecodeFilter.cs +++ b/src/UglyToad.PdfPig/Filters/AsciiHexDecodeFilter.cs @@ -5,6 +5,10 @@ using System.IO; using Tokens; + /// + /// + /// Encodes/decodes data using the ASCII hexadecimal encoding where each byte is represented by two ASCII characters. 
+ /// internal class AsciiHexDecodeFilter : IFilter { private static readonly short[] ReverseHex = @@ -22,6 +26,10 @@ /* 100 */ 13, 14, 15 }; + /// + public bool IsSupported { get; } = true; + + /// public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) { var pair = new byte[2]; diff --git a/src/UglyToad.PdfPig/Filters/CcittFaxDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/CcittFaxDecodeFilter.cs index 98707a0e..4c061441 100644 --- a/src/UglyToad.PdfPig/Filters/CcittFaxDecodeFilter.cs +++ b/src/UglyToad.PdfPig/Filters/CcittFaxDecodeFilter.cs @@ -6,6 +6,10 @@ internal class CcittFaxDecodeFilter : IFilter { + /// + public bool IsSupported { get; } = false; + + /// public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) { throw new NotSupportedException("The CCITT Fax Filter for image data is not currently supported. " + diff --git a/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs index ced83f76..6f751621 100644 --- a/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs +++ b/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs @@ -6,6 +6,10 @@ internal class DctDecodeFilter : IFilter { + /// + public bool IsSupported { get; } = false; + + /// public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) { throw new NotSupportedException("The DST (Discrete Cosine Transform) Filter indicates data is encoded in JPEG format. " + diff --git a/src/UglyToad.PdfPig/Filters/FlateFilter.cs b/src/UglyToad.PdfPig/Filters/FlateFilter.cs index 8f0a9b37..9c230a40 100644 --- a/src/UglyToad.PdfPig/Filters/FlateFilter.cs +++ b/src/UglyToad.PdfPig/Filters/FlateFilter.cs @@ -9,8 +9,12 @@ using Tokens; using Util; + /// /// - /// + /// The Flate filter is based on the public-domain zlib/deflate compression method, a variable-length Lempel-Ziv + /// adaptive compression method cascaded with adaptive Huffman coding. 
+ /// It is fully defined in Internet RFCs 1950, ZLIB Compressed Data Format Specification, and + /// 1951, DEFLATE Compressed Data Format Specification /// /// /// See section 3.3.3 of the spec (version 1.7) for details on the FlateDecode filter. @@ -34,6 +38,10 @@ this.log = log; } + /// + public bool IsSupported { get; } = true; + + /// public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) { if (input == null) diff --git a/src/UglyToad.PdfPig/Filters/IFilter.cs b/src/UglyToad.PdfPig/Filters/IFilter.cs index 0d2368d2..654ec21a 100644 --- a/src/UglyToad.PdfPig/Filters/IFilter.cs +++ b/src/UglyToad.PdfPig/Filters/IFilter.cs @@ -3,8 +3,24 @@ using System.Collections.Generic; using Tokens; + /// + /// A filter is used in a PDF to encode/decode data either to compress it + /// or derive an ASCII representation of the data. + /// internal interface IFilter { + /// + /// Whether this library can decode information encoded using this filter. + /// + bool IsSupported { get; } + + /// + /// Decodes data encoded using this filter type. + /// + /// The encoded bytes which were encoded using this filter. + /// The dictionary of the (or other dictionary types, e.g. inline images) containing these bytes. + /// The position of this filter in the pipeline used to encode data. + /// The decoded bytes. 
byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex); } } diff --git a/src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs b/src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs index d51eb3fb..ee4c0441 100644 --- a/src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs +++ b/src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs @@ -6,6 +6,10 @@ internal class Jbig2DecodeFilter : IFilter { + /// + public bool IsSupported { get; } = false; + + /// public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) { throw new NotSupportedException("The JBIG2 Filter for monochrome image data is not currently supported. " + diff --git a/src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs index 9531b15e..6a4ded19 100644 --- a/src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs +++ b/src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs @@ -6,6 +6,10 @@ internal class JpxDecodeFilter : IFilter { + /// + public bool IsSupported { get; } = false; + + /// public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) { throw new NotSupportedException("The JPX Filter (JPEG2000) for image data is not currently supported. " + diff --git a/src/UglyToad.PdfPig/Filters/LzwFilter.cs b/src/UglyToad.PdfPig/Filters/LzwFilter.cs index 85f78ef8..ae28b9b3 100644 --- a/src/UglyToad.PdfPig/Filters/LzwFilter.cs +++ b/src/UglyToad.PdfPig/Filters/LzwFilter.cs @@ -5,6 +5,11 @@ using Tokens; using Util; + /// + /// + /// The LZW (Lempel-Ziv-Welch) filter is a variable-length, adaptive compression method + /// that has been adopted as one of the standard compression methods in the Tag Image File Format (TIFF) standard. + /// internal class LzwFilter : IFilter { private const int DefaultColors = 1; @@ -27,6 +32,10 @@ this.pngPredictor = pngPredictor ?? 
throw new ArgumentNullException(nameof(pngPredictor)); } + /// + public bool IsSupported { get; } = true; + + /// public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) { var parameters = decodeParameterResolver.GetFilterParameters(streamDictionary, filterIndex); diff --git a/src/UglyToad.PdfPig/Filters/RunLengthFilter.cs b/src/UglyToad.PdfPig/Filters/RunLengthFilter.cs index 1e8e9a32..d44c37aa 100644 --- a/src/UglyToad.PdfPig/Filters/RunLengthFilter.cs +++ b/src/UglyToad.PdfPig/Filters/RunLengthFilter.cs @@ -4,10 +4,19 @@ using System.IO; using Tokens; + /// + /// + /// The Run Length filter encodes data in a simple byte-oriented format based on run length. + /// The encoded data is a sequence of runs, where each run consists of a length byte followed by 1 to 128 bytes of data. + /// internal class RunLengthFilter : IFilter { private const byte EndOfDataLength = 128; + /// + public bool IsSupported { get; } = true; + + /// public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) { using (var memoryStream = new MemoryStream()) diff --git a/src/UglyToad.PdfPig/Geometry/PdfPath.cs b/src/UglyToad.PdfPig/Geometry/PdfPath.cs index a741d77b..a01a31f2 100644 --- a/src/UglyToad.PdfPig/Geometry/PdfPath.cs +++ b/src/UglyToad.PdfPig/Geometry/PdfPath.cs @@ -211,6 +211,10 @@ namespace UglyToad.PdfPig.Geometry commands.Add(new Close()); } + /// + /// Gets a which entirely contains the geometry of the defined path. + /// + /// For paths which don't define any geometry this returns . public PdfRectangle? GetBoundingRectangle() { if (commands.Count == 0) From 2ef45f71d5d3379074e9dc671752fb94866278aa Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Wed, 9 Oct 2019 14:28:37 +0100 Subject: [PATCH 2/2] make missing acroform types public and start improving data also changes pages to use a proper tree structure since this will be required for resource inheritance and for acroform widget dictionaries. 
--- .../Integration/AcroFormsBasicFieldsTests.cs | 16 +- .../Integration/PigProductionHandbookTests.cs | 23 +-- .../PublicApiScannerTests.cs | 8 + .../AcroForms/AcroFormFactory.cs | 13 +- .../AcroForms/Fields/AcroButtonFieldFlags.cs | 2 +- .../AcroForms/Fields/AcroCheckboxField.cs | 27 +++- .../AcroForms/Fields/AcroComboBoxField.cs | 7 +- .../AcroForms/Fields/AcroFieldBase.cs | 26 +++- .../Fields/AcroFieldCommonInformation.cs | 30 +++- .../AcroForms/Fields/AcroFieldType.cs | 41 +++++ .../AcroForms/Fields/AcroListBoxField.cs | 3 +- .../AcroForms/Fields/AcroNonTerminalField.cs | 29 ++++ .../AcroForms/Fields/AcroPushButtonField.cs | 15 +- .../AcroForms/Fields/AcroRadioButtonsField.cs | 15 +- .../AcroForms/Fields/AcroSignatureField.cs | 12 +- .../AcroForms/Fields/AcroTextField.cs | 4 +- .../AcroForms/Fields/NonTerminalAcroField.cs | 21 --- src/UglyToad.PdfPig/Content/Catalog.cs | 51 ++++++- src/UglyToad.PdfPig/Content/PageTreeNode.cs | 94 ++++++++++++ src/UglyToad.PdfPig/Content/Pages.cs | 140 +++++------------- src/UglyToad.PdfPig/Parser/CatalogFactory.cs | 119 ++++++++++++++- .../Parser/DocumentInformationFactory.cs | 10 +- .../Parser/PdfDocumentFactory.cs | 20 ++- src/UglyToad.PdfPig/PdfDocument.cs | 2 +- 24 files changed, 538 insertions(+), 190 deletions(-) create mode 100644 src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldType.cs create mode 100644 src/UglyToad.PdfPig/AcroForms/Fields/AcroNonTerminalField.cs delete mode 100644 src/UglyToad.PdfPig/AcroForms/Fields/NonTerminalAcroField.cs create mode 100644 src/UglyToad.PdfPig/Content/PageTreeNode.cs diff --git a/src/UglyToad.PdfPig.Tests/Integration/AcroFormsBasicFieldsTests.cs b/src/UglyToad.PdfPig.Tests/Integration/AcroFormsBasicFieldsTests.cs index 89d42984..e5039d49 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/AcroFormsBasicFieldsTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/AcroFormsBasicFieldsTests.cs @@ -1,6 +1,7 @@ namespace UglyToad.PdfPig.Tests.Integration { using System; + using System.Linq; 
using Xunit; public class AcroFormsBasicFieldsTests @@ -13,7 +14,7 @@ [Fact] public void GetFormNotNull() { - using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions { UseLenientParsing = false })) + using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff)) { var form = document.GetForm(); Assert.NotNull(form); @@ -35,11 +36,22 @@ [Fact] public void GetsAllFormFields() { - using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions { UseLenientParsing = false })) + using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff)) { var form = document.GetForm(); Assert.Equal(16, form.Fields.Count); } } + + [Fact] + public void GetsEmptyFormFields() + { + using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff)) + { + var form = document.GetForm(); + var annots = document.GetPage(1).ExperimentalAccess.GetAnnotations().ToList(); + Assert.Equal(16, form.Fields.Count); + } + } } } diff --git a/src/UglyToad.PdfPig.Tests/Integration/PigProductionHandbookTests.cs b/src/UglyToad.PdfPig.Tests/Integration/PigProductionHandbookTests.cs index a6d49d1e..b05e1f73 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/PigProductionHandbookTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/PigProductionHandbookTests.cs @@ -16,10 +16,7 @@ [Fact] public void CanReadContent() { - using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions - { - UseLenientParsing = false - })) + using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff)) { var page = document.GetPage(1); @@ -30,10 +27,7 @@ [Fact] public void LettersHaveCorrectColors() { - using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions - { - UseLenientParsing = false - })) + using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff)) { var page = document.GetPage(1); @@ -91,7 +85,7 @@ "Nations" }; - using (var document = 
PdfDocument.Open(GetFilename())) + using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff)) { var page = document.GetPage(1); @@ -105,7 +99,7 @@ public void Page4HasCorrectWords() { var expected = WordsPage4.Split(new[] {"\r", "\r\n", "\n", " "}, StringSplitOptions.RemoveEmptyEntries); - using (var document = PdfDocument.Open(GetFilename())) + using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff)) { var page = document.GetPage(4); @@ -118,10 +112,7 @@ [Fact] public void CanReadPage9() { - using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions - { - UseLenientParsing = false - })) + using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff)) { var page = document.GetPage(9); @@ -132,7 +123,7 @@ [Fact] public void HasCorrectNumberOfPages() { - using (var document = PdfDocument.Open(GetFilename())) + using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff)) { Assert.Equal(86, document.NumberOfPages); } @@ -141,7 +132,7 @@ [Fact] public void LettersHaveCorrectPosition() { - using (var document = PdfDocument.Open(GetFilename())) + using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff)) { var page = document.GetPage(1); var letters = page.Letters; diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index a8a28fe7..680540f9 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -39,12 +39,19 @@ var expected = new List { + "UglyToad.PdfPig.AcroForms.Fields.AcroButtonFieldFlags", + "UglyToad.PdfPig.AcroForms.Fields.AcroCheckboxField", "UglyToad.PdfPig.AcroForms.Fields.AcroChoiceFieldFlags", "UglyToad.PdfPig.AcroForms.Fields.AcroChoiceOption", "UglyToad.PdfPig.AcroForms.Fields.AcroComboBoxField", "UglyToad.PdfPig.AcroForms.Fields.AcroFieldBase", 
"UglyToad.PdfPig.AcroForms.Fields.AcroFieldCommonInformation", + "UglyToad.PdfPig.AcroForms.Fields.AcroFieldType", "UglyToad.PdfPig.AcroForms.Fields.AcroListBoxField", + "UglyToad.PdfPig.AcroForms.Fields.AcroNonTerminalField", + "UglyToad.PdfPig.AcroForms.Fields.AcroPushButtonField", + "UglyToad.PdfPig.AcroForms.Fields.AcroRadioButtonsField", + "UglyToad.PdfPig.AcroForms.Fields.AcroSignatureField", "UglyToad.PdfPig.AcroForms.Fields.AcroTextField", "UglyToad.PdfPig.AcroForms.Fields.AcroTextFieldFlags", "UglyToad.PdfPig.Annotations.Annotation", @@ -59,6 +66,7 @@ "UglyToad.PdfPig.Content.Page", "UglyToad.PdfPig.Content.PageRotationDegrees", "UglyToad.PdfPig.Content.PageSize", + "UglyToad.PdfPig.Content.PageTreeNode", "UglyToad.PdfPig.Content.Word", "UglyToad.PdfPig.Content.TextLine", "UglyToad.PdfPig.Content.TextBlock", diff --git a/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs b/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs index a88b5841..6386abf6 100644 --- a/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs +++ b/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs @@ -147,7 +147,7 @@ children.Add(kidField); } - result = new NonTerminalAcroField(fieldDictionary, "Non-Terminal Field", fieldFlags, information, children); + result = new AcroNonTerminalField(fieldDictionary, "Non-Terminal Field", fieldFlags, information, children); } else if (fieldType == NameToken.Btn) { @@ -165,12 +165,21 @@ } else { + var isChecked = false; if (!fieldDictionary.TryGetOptionalTokenDirect(NameToken.V, tokenScanner, out NameToken valueToken)) { valueToken = NameToken.Off; } + else + { + isChecked = !string.Equals(valueToken.Data, NameToken.Off, StringComparison.OrdinalIgnoreCase); + } + + var field = new AcroCheckboxField(fieldDictionary, fieldType, buttonFlags, + information, + valueToken, + isChecked); - var field = new AcroCheckboxField(fieldDictionary, fieldType, buttonFlags, information, valueToken); result = field; } } diff --git 
a/src/UglyToad.PdfPig/AcroForms/Fields/AcroButtonFieldFlags.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroButtonFieldFlags.cs index b45d0cb6..da6f5cf1 100644 --- a/src/UglyToad.PdfPig/AcroForms/Fields/AcroButtonFieldFlags.cs +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroButtonFieldFlags.cs @@ -6,7 +6,7 @@ /// Flags specifying various characteristics of a button type field in an . /// [Flags] - internal enum AcroButtonFieldFlags : uint + public enum AcroButtonFieldFlags : uint { /// /// The user may not change the value of the field. diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/AcroCheckboxField.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroCheckboxField.cs index 9ce2fefe..7ba79d4c 100644 --- a/src/UglyToad.PdfPig/AcroForms/Fields/AcroCheckboxField.cs +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroCheckboxField.cs @@ -2,18 +2,39 @@ { using Tokens; - internal class AcroCheckboxField : AcroFieldBase + /// + /// + /// A checkbox which may be toggled on or off. + /// + public class AcroCheckboxField : AcroFieldBase { + /// + /// The which define the behaviour of this button type. + /// public AcroButtonFieldFlags Flags { get; } + /// + /// The current value of this checkbox. + /// public NameToken CurrentValue { get; } + /// + /// Whether this checkbox is currently checked/on. + /// + public bool IsChecked { get; } + + /// + /// + /// Create a new . 
+ /// public AcroCheckboxField(DictionaryToken dictionary, string fieldType, AcroButtonFieldFlags fieldFlags, - AcroFieldCommonInformation information, NameToken currentValue) : - base(dictionary, fieldType, (uint)fieldFlags, information) + AcroFieldCommonInformation information, NameToken currentValue, + bool isChecked) : + base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.Checkbox, information) { Flags = fieldFlags; CurrentValue = currentValue; + IsChecked = isChecked; } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/AcroComboBoxField.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroComboBoxField.cs index 50c31ced..c1342bd8 100644 --- a/src/UglyToad.PdfPig/AcroForms/Fields/AcroComboBoxField.cs +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroComboBoxField.cs @@ -35,11 +35,12 @@ [CanBeNull] public IReadOnlyList SelectedOptionIndices { get; } + /// /// - /// Create a new . + /// Create a new . /// /// The dictionary for this field. - /// The type of this field, must be . + /// The type of this field, must be . /// The flags specifying behaviour for this field. /// Additional information for this field. /// The options in this field. @@ -49,7 +50,7 @@ AcroFieldCommonInformation information, IReadOnlyList options, IReadOnlyList selectedOptions, IReadOnlyList selectedOptionIndices) : - base(dictionary, fieldType, (uint)fieldFlags, information) + base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.ComboBox, information) { Flags = fieldFlags; Options = options ?? throw new ArgumentNullException(nameof(options)); diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldBase.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldBase.cs index 816bd1cf..49931603 100644 --- a/src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldBase.cs +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldBase.cs @@ -16,10 +16,15 @@ public DictionaryToken Dictionary { get; } /// - /// The representing the type of this field. 
+ /// The representing the type of this field in PDF format. /// [NotNull] - public string FieldType { get; } + public string RawFieldType { get; } + + /// + /// The actual represented by this field. + /// + public AcroFieldType FieldType { get; } /// /// Specifies various characteristics of the field. @@ -36,15 +41,26 @@ /// Create a new . /// /// The dictionary for this field. - /// The type of this field. + /// The PDF string type of this field. /// The flags specifying behaviour for this field. + /// The type of this field. /// Additional information for this field. - protected AcroFieldBase(DictionaryToken dictionary, string fieldType, uint fieldFlags, AcroFieldCommonInformation information) + protected AcroFieldBase(DictionaryToken dictionary, string rawFieldType, + uint fieldFlags, + AcroFieldType fieldType, + AcroFieldCommonInformation information) { Dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary)); - FieldType = fieldType ?? throw new ArgumentNullException(nameof(fieldType)); + RawFieldType = rawFieldType ?? throw new ArgumentNullException(nameof(rawFieldType)); FieldFlags = fieldFlags; + FieldType = fieldType; Information = information ?? new AcroFieldCommonInformation(null, null, null, null); } + + /// + public override string ToString() + { + return $"{FieldType}"; + } } } diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldCommonInformation.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldCommonInformation.cs index 62be2ddc..ba7224c9 100644 --- a/src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldCommonInformation.cs +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldCommonInformation.cs @@ -47,7 +47,35 @@ /// public override string ToString() { - return $"Parent: {Parent}. Partial: {PartialName}. Alternate: {AlternateName}. 
Mapping: {MappingName}."; + string AppendIfNotNull(string val, string label, string result) + { + if (val == null) + { + return result; + } + + if (result.Length > 0) + { + result += " "; + } + + result += $"{label}: {val}."; + + return result; + } + + var s = string.Empty; + + if (Parent != null) + { + s += $"Parent: {Parent}."; + } + + s = AppendIfNotNull(PartialName, "Partial Name", s); + s = AppendIfNotNull(AlternateName, "Alternate Name", s); + s = AppendIfNotNull(MappingName, "Mapping Name", s); + + return s; } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldType.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldType.cs new file mode 100644 index 00000000..4ddd5dd4 --- /dev/null +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroFieldType.cs @@ -0,0 +1,41 @@ +namespace UglyToad.PdfPig.AcroForms.Fields +{ + /// + /// Indicates the type of field for a . + /// + public enum AcroFieldType + { + /// + /// A button that responds immediately to user input without retaining state. + /// + PushButton, + /// + /// A checkbox which toggles between on and off states. + /// + Checkbox, + /// + /// A set of radio buttons. + /// + RadioButton, + /// + /// A textbox allowing user input through the keyboard. + /// + Text, + /// + /// A dropdown list of options with optional user-editable textbox. + /// + ComboBox, + /// + /// A list of options for the user to select from. + /// + ListBox, + /// + /// A field containing a digital signature. + /// + Signature, + /// + /// A field which acts as a container for other fields. 
+ /// + NonTerminal + } +} diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/AcroListBoxField.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroListBoxField.cs index caa9b52a..9f19fe75 100644 --- a/src/UglyToad.PdfPig/AcroForms/Fields/AcroListBoxField.cs +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroListBoxField.cs @@ -44,6 +44,7 @@ /// public bool SupportsMultiSelect => Flags.Equals(AcroChoiceFieldFlags.MultiSelect); + /// /// /// Create a new . /// @@ -60,7 +61,7 @@ IReadOnlyList selectedOptions, IReadOnlyList selectedOptionIndices, int? topIndex) : - base(dictionary, fieldType, (uint)fieldFlags, information) + base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.ListBox, information) { Flags = fieldFlags; Options = options ?? throw new ArgumentNullException(nameof(options)); diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/AcroNonTerminalField.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroNonTerminalField.cs new file mode 100644 index 00000000..c51a58de --- /dev/null +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroNonTerminalField.cs @@ -0,0 +1,29 @@ +namespace UglyToad.PdfPig.AcroForms.Fields +{ + using System; + using System.Collections.Generic; + using Tokens; + + /// + /// + /// A non-leaf field in the form's structure. + /// + public class AcroNonTerminalField : AcroFieldBase + { + /// + /// The child fields of this field. + /// + public IReadOnlyList Children { get; } + + /// + /// + /// Create a new . + /// + internal AcroNonTerminalField(DictionaryToken dictionary, string fieldType, uint fieldFlags, AcroFieldCommonInformation information, + IReadOnlyList children) : + base(dictionary, fieldType, fieldFlags, AcroFieldType.NonTerminal, information) + { + Children = children ?? 
throw new ArgumentNullException(nameof(children)); + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/AcroPushButtonField.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroPushButtonField.cs index a56c02bf..8669acee 100644 --- a/src/UglyToad.PdfPig/AcroForms/Fields/AcroPushButtonField.cs +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroPushButtonField.cs @@ -2,13 +2,24 @@ { using Tokens; - internal class AcroPushButtonField : AcroFieldBase + /// + /// + /// A push button responds immediately to user input without storing any state. + /// + public class AcroPushButtonField : AcroFieldBase { + /// + /// The which define the behaviour of this button type. + /// public AcroButtonFieldFlags Flags { get; } + /// + /// + /// Create a new . + /// public AcroPushButtonField(DictionaryToken dictionary, string fieldType, AcroButtonFieldFlags fieldFlags, AcroFieldCommonInformation information) : - base(dictionary, fieldType, (uint)fieldFlags, information) + base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.PushButton, information) { Flags = fieldFlags; } diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/AcroRadioButtonsField.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroRadioButtonsField.cs index 6e6d291f..637ec696 100644 --- a/src/UglyToad.PdfPig/AcroForms/Fields/AcroRadioButtonsField.cs +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroRadioButtonsField.cs @@ -2,13 +2,24 @@ { using Tokens; - internal class AcroRadioButtonsField : AcroFieldBase + /// + /// + /// A set of radio buttons. + /// + public class AcroRadioButtonsField : AcroFieldBase { + /// + /// The which define the behaviour of this button type. + /// public AcroButtonFieldFlags Flags { get; } + /// + /// + /// Create a new . 
+ /// public AcroRadioButtonsField(DictionaryToken dictionary, string fieldType, AcroButtonFieldFlags fieldFlags, AcroFieldCommonInformation information) : - base(dictionary, fieldType, (uint)fieldFlags, information) + base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.RadioButton, information) { Flags = fieldFlags; } diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/AcroSignatureField.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroSignatureField.cs index 3304bd5e..fa443b4f 100644 --- a/src/UglyToad.PdfPig/AcroForms/Fields/AcroSignatureField.cs +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroSignatureField.cs @@ -2,10 +2,18 @@ { using Tokens; - internal class AcroSignatureField : AcroFieldBase + /// + /// + /// A digital signature field. + /// + public class AcroSignatureField : AcroFieldBase { + /// + /// + /// Create a new . + /// public AcroSignatureField(DictionaryToken dictionary, string fieldType, uint fieldFlags, AcroFieldCommonInformation information) : - base(dictionary, fieldType, fieldFlags, information) + base(dictionary, fieldType, fieldFlags, AcroFieldType.Signature, information) { } } diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/AcroTextField.cs b/src/UglyToad.PdfPig/AcroForms/Fields/AcroTextField.cs index 29cbd7d8..d5c41b59 100644 --- a/src/UglyToad.PdfPig/AcroForms/Fields/AcroTextField.cs +++ b/src/UglyToad.PdfPig/AcroForms/Fields/AcroTextField.cs @@ -47,7 +47,7 @@ /// The maximum length. public AcroTextField(DictionaryToken dictionary, string fieldType, AcroTextFieldFlags fieldFlags, AcroFieldCommonInformation information, string value, int? maxLength) : - base(dictionary, fieldType, (uint)fieldFlags, information) + base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.Text, information) { Flags = fieldFlags; Value = value; @@ -59,7 +59,7 @@ /// public override string ToString() { - return Value ?? string.Empty; + return $"{FieldType}: {Value ?? 
string.Empty}"; } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/AcroForms/Fields/NonTerminalAcroField.cs b/src/UglyToad.PdfPig/AcroForms/Fields/NonTerminalAcroField.cs deleted file mode 100644 index ce1fd2c5..00000000 --- a/src/UglyToad.PdfPig/AcroForms/Fields/NonTerminalAcroField.cs +++ /dev/null @@ -1,21 +0,0 @@ -namespace UglyToad.PdfPig.AcroForms.Fields -{ - using System; - using System.Collections.Generic; - using Tokens; - - /// - /// A non-leaf field in the form's structure. - /// - internal class NonTerminalAcroField : AcroFieldBase - { - public IReadOnlyList Children { get; } - - public NonTerminalAcroField(DictionaryToken dictionary, string fieldType, uint fieldFlags, AcroFieldCommonInformation information, - IReadOnlyList children) : - base(dictionary, fieldType, fieldFlags, information) - { - Children = children ?? throw new ArgumentNullException(nameof(children)); - } - } -} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Content/Catalog.cs b/src/UglyToad.PdfPig/Content/Catalog.cs index 2c3ea509..d6a773c1 100644 --- a/src/UglyToad.PdfPig/Content/Catalog.cs +++ b/src/UglyToad.PdfPig/Content/Catalog.cs @@ -1,6 +1,7 @@ namespace UglyToad.PdfPig.Content { using System; + using System.Collections.Generic; using Tokens; using Util.JetBrains.Annotations; @@ -10,6 +11,8 @@ /// public class Catalog { + private readonly IReadOnlyDictionary pagesByNumber; + /// /// The catalog dictionary containing assorted information. /// @@ -22,14 +25,58 @@ [NotNull] public DictionaryToken PagesDictionary { get; } + /// + /// The page tree for this document containing all pages, page numbers and their dictionaries. + /// + public PageTreeNode PageTree { get; } + /// /// Create a new . /// - internal Catalog(DictionaryToken catalogDictionary, DictionaryToken pagesDictionary) + internal Catalog(DictionaryToken catalogDictionary, DictionaryToken pagesDictionary, + PageTreeNode pageTree) { CatalogDictionary = catalogDictionary ?? 
throw new ArgumentNullException(nameof(catalogDictionary)); - PagesDictionary = pagesDictionary ?? throw new ArgumentNullException(nameof(pagesDictionary)); + PageTree = pageTree ?? throw new ArgumentNullException(nameof(pageTree)); + + if (!pageTree.IsRoot) + { + throw new ArgumentException("Page tree must be the root page tree node.", nameof(pageTree)); + } + + var byNumber = new Dictionary(); + PopulatePageByNumberDictionary(pageTree, byNumber); + pagesByNumber = byNumber; + } + + private static void PopulatePageByNumberDictionary(PageTreeNode node, Dictionary result) + { + if (node.IsPage) + { + if (!node.PageNumber.HasValue) + { + throw new InvalidOperationException($"Node was page but did not have page number: {node}."); + } + + result[node.PageNumber.Value] = node; + return; + } + + foreach (var child in node.Children) + { + PopulatePageByNumberDictionary(child, result); + } + } + + internal PageTreeNode GetPageNode(int pageNumber) + { + if (!pagesByNumber.TryGetValue(pageNumber, out var node)) + { + throw new InvalidOperationException($"Could not find page node by number for: {pageNumber}."); + } + + return node; } } } diff --git a/src/UglyToad.PdfPig/Content/PageTreeNode.cs b/src/UglyToad.PdfPig/Content/PageTreeNode.cs new file mode 100644 index 00000000..47676c0b --- /dev/null +++ b/src/UglyToad.PdfPig/Content/PageTreeNode.cs @@ -0,0 +1,94 @@ +namespace UglyToad.PdfPig.Content +{ + using System; + using System.Collections.Generic; + using Tokens; + using Util.JetBrains.Annotations; + + /// + /// A node in the PDF document's page tree. + /// Nodes may either be of type 'Page' - a single page, or 'Pages' - a container for multiple child Page + /// or Pages nodes. + /// + public class PageTreeNode + { + /// + /// The dictionary for this node in the page tree. + /// + [NotNull] + public DictionaryToken NodeDictionary { get; } + + /// + /// The indirect reference for this node in the page tree. 
+ /// + public IndirectReference Reference { get; } + + /// + /// Whether this node is a page or not. If not it must be a /Pages container. + /// + public bool IsPage { get; } + + /// + /// The number of this page if is . + /// + public int? PageNumber { get; } + + /// + /// The child nodes of this node if is + /// + [NotNull] + public IReadOnlyList Children { get; } + + /// + /// The parent node of this node, unless it is the root node. + /// + [CanBeNull] + public PageTreeNode Parent { get; private set; } + + /// + /// Whether this node is the root node. + /// + public bool IsRoot => Parent == null; + + /// + /// Create a new . + /// + internal PageTreeNode(DictionaryToken nodeDictionary, IndirectReference reference, + bool isPage, + int? pageNumber, + IReadOnlyList children) + { + NodeDictionary = nodeDictionary ?? throw new ArgumentNullException(nameof(nodeDictionary)); + Reference = reference; + IsPage = isPage; + PageNumber = pageNumber; + Children = children ?? throw new ArgumentNullException(nameof(children)); + + if (IsPage && Children.Count > 0) + { + throw new ArgumentException("Cannot define children on a page node.", nameof(children)); + } + + if (!IsPage && pageNumber.HasValue) + { + throw new ArgumentException("Cannot define page number for a pages node.", nameof(pageNumber)); + } + + foreach (var child in Children) + { + child.Parent = this; + } + } + + /// + public override string ToString() + { + if (IsPage) + { + return $"Page #{PageNumber}: {NodeDictionary}."; + } + + return $"Pages ({Children.Count} children): {NodeDictionary}"; + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Content/Pages.cs b/src/UglyToad.PdfPig/Content/Pages.cs index 92eb08d9..7304802e 100644 --- a/src/UglyToad.PdfPig/Content/Pages.cs +++ b/src/UglyToad.PdfPig/Content/Pages.cs @@ -2,144 +2,70 @@ { using System; using System.Collections.Generic; - using Logging; - using Parser.Parts; using Tokenization.Scanner; using Tokens; using Util; internal class 
Pages { - private readonly ILog log; private readonly Catalog catalog; private readonly IPageFactory pageFactory; private readonly bool isLenientParsing; private readonly IPdfTokenScanner pdfScanner; - private readonly DictionaryToken rootPageDictionary; - private readonly Dictionary locatedPages = new Dictionary(); public int Count { get; } - internal Pages(ILog log, Catalog catalog, IPageFactory pageFactory, bool isLenientParsing, IPdfTokenScanner pdfScanner) + internal Pages(Catalog catalog, IPageFactory pageFactory, bool isLenientParsing, + IPdfTokenScanner pdfScanner) { - if (catalog == null) - { - throw new ArgumentNullException(nameof(catalog)); - } - - rootPageDictionary = catalog.PagesDictionary; - - Count = rootPageDictionary.GetIntOrDefault(NameToken.Count); - - this.log = log; - this.catalog = catalog; - this.pageFactory = pageFactory; + this.catalog = catalog ?? throw new ArgumentNullException(nameof(catalog)); + this.pageFactory = pageFactory ?? throw new ArgumentNullException(nameof(pageFactory)); this.isLenientParsing = isLenientParsing; - this.pdfScanner = pdfScanner; + this.pdfScanner = pdfScanner ?? 
throw new ArgumentNullException(nameof(pdfScanner)); + + Count = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count); } public Page GetPage(int pageNumber) { - if (locatedPages.TryGetValue(pageNumber, out DictionaryToken targetPageDictionary)) + if (pageNumber <= 0 || pageNumber > Count) { - // TODO: cache the page - return pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), - isLenientParsing); + throw new ArgumentOutOfRangeException(nameof(pageNumber), + $"Page number {pageNumber} invalid, must be between 1 and {Count}."); } - var observed = new List(); + var pageNode = catalog.GetPageNode(pageNumber); + var pageStack = new Stack(); + + var currentNode = pageNode; + while (currentNode != null) + { + pageStack.Push(currentNode); + currentNode = currentNode.Parent; + } var pageTreeMembers = new PageTreeMembers(); - // todo: running a search for a different, unloaded, page number, results in a bug. - var isFound = FindPage(rootPageDictionary, pageNumber, observed, pageTreeMembers); - - if (!isFound || !locatedPages.TryGetValue(pageNumber, out targetPageDictionary)) + while (pageStack.Count > 0) { - throw new ArgumentOutOfRangeException("Could not find the page with number: " + pageNumber); - } + currentNode = pageStack.Pop(); - var page = pageFactory.Create(pageNumber, targetPageDictionary, pageTreeMembers, isLenientParsing); - - locatedPages[pageNumber] = targetPageDictionary; - - return page; - } - - private static int GetNextPageNumber(IReadOnlyList pages) - { - if (pages.Count == 0) - { - return 1; - } - - return pages[pages.Count - 1] + 1; - } - - public bool FindPage(DictionaryToken currentPageDictionary, int soughtPageNumber, List pageNumbersObserved, PageTreeMembers pageTreeMembers) - { - var type = currentPageDictionary.GetNameOrDefault(NameToken.Type); - - if (type?.Equals(NameToken.Page) == true) - { - var pageNumber = GetNextPageNumber(pageNumbersObserved); - - bool found = pageNumber == soughtPageNumber; - - 
locatedPages[pageNumber] = currentPageDictionary; - pageNumbersObserved.Add(pageNumber); - - return found; - } - - if (type?.Equals(NameToken.Pages) != true) - { - log.Warn("Did not find the expected type (Page or Pages) in dictionary: " + currentPageDictionary); - - return false; - } - - if (currentPageDictionary.TryGet(NameToken.MediaBox, out var token)) - { - var mediaBox = DirectObjectFinder.Get(token, pdfScanner); - - pageTreeMembers.MediaBox = new MediaBox(mediaBox.ToRectangle()); - } - - if (currentPageDictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken)) - { - pageTreeMembers.Rotation = rotateToken.Int; - } - - if (!currentPageDictionary.TryGet(NameToken.Kids, out var kids) - || !(kids is ArrayToken kidsArray)) - { - return false; - } - - pageFactory.LoadResources(currentPageDictionary, isLenientParsing); - - bool childFound = false; - foreach (var kid in kidsArray.Data) - { - // todo: exit early - var child = DirectObjectFinder.Get(kid, pdfScanner); + pageFactory.LoadResources(currentNode.NodeDictionary, isLenientParsing); - var thisPageMatches = FindPage(child, soughtPageNumber, pageNumbersObserved, pageTreeMembers); - - if (thisPageMatches) + if (currentNode.NodeDictionary.TryGet(NameToken.MediaBox, pdfScanner, out ArrayToken mediaBox)) { - childFound = true; - break; + pageTreeMembers.MediaBox = new MediaBox(mediaBox.ToRectangle()); + } + + if (currentNode.NodeDictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken)) + { + pageTreeMembers.Rotation = rotateToken.Int; } } - - return childFound; - } - - public IReadOnlyList GetAllPages() - { - return new Page[0]; + + var page = pageFactory.Create(pageNumber, pageNode.NodeDictionary, pageTreeMembers, isLenientParsing); + + return page; } } } diff --git a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs index b3aa0485..733e6f76 100644 --- a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs +++ 
b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs @@ -1,15 +1,19 @@ namespace UglyToad.PdfPig.Parser { using System; + using System.Collections.Generic; using Content; using Exceptions; using Parts; using Tokenization.Scanner; using Tokens; + using Util; - internal class CatalogFactory + internal static class CatalogFactory { - public Catalog Create(IPdfTokenScanner scanner, DictionaryToken dictionary) + public static Catalog Create(IndirectReference rootReference, DictionaryToken dictionary, + IPdfTokenScanner scanner, + bool isLenientParsing) { if (dictionary == null) { @@ -26,9 +30,116 @@ throw new PdfDocumentFormatException($"No pages entry was found in the catalog dictionary: {dictionary}."); } - var pages = DirectObjectFinder.Get(value, scanner); + DictionaryToken pages; + var pagesReference = rootReference; + + if (value is IndirectReferenceToken pagesRef) + { + pagesReference = pagesRef.Data; + pages = DirectObjectFinder.Get(pagesRef, scanner); + } + else if (value is DictionaryToken pagesDict) + { + pages = pagesDict; + } + else + { + pages = DirectObjectFinder.Get(value, scanner); + } + + var pageNumber = 0; + + var pageTree = ProcessPagesNode(pagesReference, pages, new IndirectReference(1, 0), true, + scanner, isLenientParsing, ref pageNumber); + + return new Catalog(dictionary, pages, pageTree); + } + + private static PageTreeNode ProcessPagesNode(IndirectReference reference, DictionaryToken nodeDictionary, + IndirectReference parentReference, + bool isRoot, + IPdfTokenScanner pdfTokenScanner, + bool isLenientParsing, + ref int pageNumber) + { + var isPage = false; + + if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type)) + { + if (!isLenientParsing) + { + throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}."); + } + + if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _)) + { + isPage = true; + } + } + else + { + isPage = 
type.Equals(NameToken.Page); + + if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing) + { + throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}."); + } + } + + if (!isLenientParsing && !isRoot) + { + if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken)) + { + throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}."); + } + + if (!parentReferenceToken.Data.Equals(parentReference)) + { + throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}."); + } + } + + if (isPage) + { + pageNumber++; + + var thisNode = new PageTreeNode(nodeDictionary, reference, true, + pageNumber, + EmptyArray.Instance); + + return thisNode; + } - return new Catalog(dictionary, pages); + if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids)) + { + if (!isLenientParsing) + { + throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {nodeDictionary}."); + } + + kids = new ArrayToken(EmptyArray.Instance); + } + + var nodeChildren = new List(); + + foreach (var kid in kids.Data) + { + if (!(kid is IndirectReferenceToken kidRef)) + { + throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}."); + } + + if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken)) + { + throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}."); + } + + var kidNode = ProcessPagesNode(kidRef.Data, kidDictionaryToken, reference, false, pdfTokenScanner, isLenientParsing, ref pageNumber); + + nodeChildren.Add(kidNode); + } + + return new PageTreeNode(nodeDictionary, reference, false, null, nodeChildren); } } } 
diff --git a/src/UglyToad.PdfPig/Parser/DocumentInformationFactory.cs b/src/UglyToad.PdfPig/Parser/DocumentInformationFactory.cs index c2ace0fd..5b997c54 100644 --- a/src/UglyToad.PdfPig/Parser/DocumentInformationFactory.cs +++ b/src/UglyToad.PdfPig/Parser/DocumentInformationFactory.cs @@ -6,9 +6,15 @@ using Tokenization.Scanner; using Tokens; - internal class DocumentInformationFactory + /// + /// Parse the dictionary from a PDF file trailer. + /// + internal static class DocumentInformationFactory { - public DocumentInformation Create(IPdfTokenScanner pdfTokenScanner, TrailerDictionary trailer) + /// + /// Convert the file trailer dictionary into a instance. + /// + public static DocumentInformation Create(IPdfTokenScanner pdfTokenScanner, TrailerDictionary trailer) { if (!trailer.Info.HasValue) { diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs index 79d8ed0c..6cedbb45 100644 --- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs @@ -70,7 +70,6 @@ { var log = container.Get(); var filterProvider = container.Get(); - var catalogFactory = new CatalogFactory(); var cMapCache = new CMapCache(new CMapParser()); CrossReferenceTable crossReferenceTable = null; @@ -104,7 +103,9 @@ var compactFontFormatParser = new CompactFontFormatParser(new CompactFontFormatIndividualFontParser(compactFontFormatIndexReader, new CompactFontFormatTopLevelDictionaryReader(), new CompactFontFormatPrivateDictionaryReader()), compactFontFormatIndexReader); - var rootDictionary = ParseTrailer(crossReferenceTable, isLenientParsing, pdfScanner, out var encryptionDictionary); + var (rootReference, rootDictionary) = ParseTrailer(crossReferenceTable, isLenientParsing, + pdfScanner, + out var encryptionDictionary); var encryptionHandler = encryptionDictionary != null ? (IEncryptionHandler)new EncryptionHandler(encryptionDictionary, crossReferenceTable.Trailer, password ?? 
string.Empty) : NoOpEncryptionHandler.Instance; @@ -124,14 +125,13 @@ var resourceContainer = new ResourceStore(pdfScanner, fontFactory); + var information = DocumentInformationFactory.Create(pdfScanner, crossReferenceTable.Trailer); + + var catalog = CatalogFactory.Create(rootReference, rootDictionary, pdfScanner, isLenientParsing); + var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()), log); - var informationFactory = new DocumentInformationFactory(); - - var information = informationFactory.Create(pdfScanner, crossReferenceTable.Trailer); - - var catalog = catalogFactory.Create(pdfScanner, rootDictionary); var caching = new ParsingCachingProviders(bruteForceSearcher, resourceContainer); @@ -144,7 +144,7 @@ acroFormFactory); } - private static DictionaryToken ParseTrailer(CrossReferenceTable crossReferenceTable, bool isLenientParsing, IPdfTokenScanner pdfTokenScanner, + private static (IndirectReference, DictionaryToken) ParseTrailer(CrossReferenceTable crossReferenceTable, bool isLenientParsing, IPdfTokenScanner pdfTokenScanner, out EncryptionDictionary encryptionDictionary) { encryptionDictionary = null; @@ -157,8 +157,6 @@ } encryptionDictionary = EncryptionDictionaryFactory.Read(encryptionDictionaryToken, pdfTokenScanner); - - //throw new NotSupportedException("Cannot currently parse a document using encryption: " + crossReferenceTable.Trailer.EncryptionToken); } var rootDictionary = DirectObjectFinder.Get(crossReferenceTable.Trailer.Root, pdfTokenScanner); @@ -168,7 +166,7 @@ rootDictionary = rootDictionary.With(NameToken.Type, NameToken.Catalog); } - return rootDictionary; + return (crossReferenceTable.Trailer.Root, rootDictionary); } } } diff --git a/src/UglyToad.PdfPig/PdfDocument.cs b/src/UglyToad.PdfPig/PdfDocument.cs index bc6b9ea6..baabf627 100644 --- a/src/UglyToad.PdfPig/PdfDocument.cs +++ b/src/UglyToad.PdfPig/PdfDocument.cs @@ -98,7 +98,7 @@ 
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner)); this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider)); Information = information ?? throw new ArgumentNullException(nameof(information)); - pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner); + pages = new Pages(catalog, pageFactory, isLenientParsing, pdfScanner); Structure = new Structure(catalog, crossReferenceTable, pdfScanner); documentForm = new Lazy(() => acroFormFactory.GetAcroForm(catalog)); }