updated from comments

- still need to look at XmlWriter
This commit is contained in:
BobLd
2019-10-10 12:29:28 +01:00
35 changed files with 612 additions and 193 deletions

View File

@@ -1,6 +1,7 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using System;
using System.Linq;
using Xunit;
public class AcroFormsBasicFieldsTests
@@ -13,7 +14,7 @@
[Fact]
public void GetFormNotNull()
{
using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions { UseLenientParsing = false }))
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var form = document.GetForm();
Assert.NotNull(form);
@@ -35,11 +36,22 @@
[Fact]
public void GetsAllFormFields()
{
using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions { UseLenientParsing = false }))
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var form = document.GetForm();
Assert.Equal(16, form.Fields.Count);
}
}
[Fact]
public void GetsEmptyFormFields()
{
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var form = document.GetForm();
var annots = document.GetPage(1).ExperimentalAccess.GetAnnotations().ToList();
Assert.Equal(16, form.Fields.Count);
}
}
}
}

View File

@@ -16,10 +16,7 @@
[Fact]
public void CanReadContent()
{
using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions
{
UseLenientParsing = false
}))
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var page = document.GetPage(1);
@@ -30,10 +27,7 @@
[Fact]
public void LettersHaveCorrectColors()
{
using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions
{
UseLenientParsing = false
}))
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var page = document.GetPage(1);
@@ -91,7 +85,7 @@
"Nations"
};
using (var document = PdfDocument.Open(GetFilename()))
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var page = document.GetPage(1);
@@ -105,7 +99,7 @@
public void Page4HasCorrectWords()
{
var expected = WordsPage4.Split(new[] {"\r", "\r\n", "\n", " "}, StringSplitOptions.RemoveEmptyEntries);
using (var document = PdfDocument.Open(GetFilename()))
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var page = document.GetPage(4);
@@ -118,10 +112,7 @@
[Fact]
public void CanReadPage9()
{
using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions
{
UseLenientParsing = false
}))
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var page = document.GetPage(9);
@@ -132,7 +123,7 @@
[Fact]
public void HasCorrectNumberOfPages()
{
using (var document = PdfDocument.Open(GetFilename()))
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
Assert.Equal(86, document.NumberOfPages);
}
@@ -141,7 +132,7 @@
[Fact]
public void LettersHaveCorrectPosition()
{
using (var document = PdfDocument.Open(GetFilename()))
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var page = document.GetPage(1);
var letters = page.Letters;

View File

@@ -39,12 +39,19 @@
var expected = new List<string>
{
"UglyToad.PdfPig.AcroForms.Fields.AcroButtonFieldFlags",
"UglyToad.PdfPig.AcroForms.Fields.AcroCheckboxField",
"UglyToad.PdfPig.AcroForms.Fields.AcroChoiceFieldFlags",
"UglyToad.PdfPig.AcroForms.Fields.AcroChoiceOption",
"UglyToad.PdfPig.AcroForms.Fields.AcroComboBoxField",
"UglyToad.PdfPig.AcroForms.Fields.AcroFieldBase",
"UglyToad.PdfPig.AcroForms.Fields.AcroFieldCommonInformation",
"UglyToad.PdfPig.AcroForms.Fields.AcroFieldType",
"UglyToad.PdfPig.AcroForms.Fields.AcroListBoxField",
"UglyToad.PdfPig.AcroForms.Fields.AcroNonTerminalField",
"UglyToad.PdfPig.AcroForms.Fields.AcroPushButtonField",
"UglyToad.PdfPig.AcroForms.Fields.AcroRadioButtonsField",
"UglyToad.PdfPig.AcroForms.Fields.AcroSignatureField",
"UglyToad.PdfPig.AcroForms.Fields.AcroTextField",
"UglyToad.PdfPig.AcroForms.Fields.AcroTextFieldFlags",
"UglyToad.PdfPig.Annotations.Annotation",
@@ -59,6 +66,7 @@
"UglyToad.PdfPig.Content.Page",
"UglyToad.PdfPig.Content.PageRotationDegrees",
"UglyToad.PdfPig.Content.PageSize",
"UglyToad.PdfPig.Content.PageTreeNode",
"UglyToad.PdfPig.Content.Word",
"UglyToad.PdfPig.Content.TextLine",
"UglyToad.PdfPig.Content.TextBlock",

View File

@@ -147,7 +147,7 @@
children.Add(kidField);
}
result = new NonTerminalAcroField(fieldDictionary, "Non-Terminal Field", fieldFlags, information, children);
result = new AcroNonTerminalField(fieldDictionary, "Non-Terminal Field", fieldFlags, information, children);
}
else if (fieldType == NameToken.Btn)
{
@@ -165,12 +165,21 @@
}
else
{
var isChecked = false;
if (!fieldDictionary.TryGetOptionalTokenDirect(NameToken.V, tokenScanner, out NameToken valueToken))
{
valueToken = NameToken.Off;
}
else
{
isChecked = !string.Equals(valueToken.Data, NameToken.Off, StringComparison.OrdinalIgnoreCase);
}
var field = new AcroCheckboxField(fieldDictionary, fieldType, buttonFlags,
information,
valueToken,
isChecked);
var field = new AcroCheckboxField(fieldDictionary, fieldType, buttonFlags, information, valueToken);
result = field;
}
}

View File

@@ -6,7 +6,7 @@
/// Flags specifying various characteristics of a button type field in an <see cref="AcroFieldBase"/>.
/// </summary>
[Flags]
internal enum AcroButtonFieldFlags : uint
public enum AcroButtonFieldFlags : uint
{
/// <summary>
/// The user may not change the value of the field.

View File

@@ -2,18 +2,39 @@
{
using Tokens;
internal class AcroCheckboxField : AcroFieldBase
/// <inheritdoc />
/// <summary>
/// A checkbox which may be toggled on or off.
/// </summary>
public class AcroCheckboxField : AcroFieldBase
{
/// <summary>
/// The <see cref="AcroButtonFieldFlags"/> which define the behaviour of this button type.
/// </summary>
public AcroButtonFieldFlags Flags { get; }
/// <summary>
/// The current value of this checkbox.
/// </summary>
public NameToken CurrentValue { get; }
/// <summary>
/// Whether this checkbox is currently checked/on.
/// </summary>
public bool IsChecked { get; }
/// <inheritdoc />
/// <summary>
/// Create a new <see cref="AcroCheckboxField"/>.
/// </summary>
public AcroCheckboxField(DictionaryToken dictionary, string fieldType, AcroButtonFieldFlags fieldFlags,
AcroFieldCommonInformation information, NameToken currentValue) :
base(dictionary, fieldType, (uint)fieldFlags, information)
AcroFieldCommonInformation information, NameToken currentValue,
bool isChecked) :
base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.Checkbox, information)
{
Flags = fieldFlags;
CurrentValue = currentValue;
IsChecked = isChecked;
}
}
}

View File

@@ -35,11 +35,12 @@
[CanBeNull]
public IReadOnlyList<int> SelectedOptionIndices { get; }
/// <inheritdoc />
/// <summary>
/// Create a new <see cref="AcroComboBoxField"/>.
/// Create a new <see cref="T:UglyToad.PdfPig.AcroForms.Fields.AcroComboBoxField" />.
/// </summary>
/// <param name="dictionary">The dictionary for this field.</param>
/// <param name="fieldType">The type of this field, must be <see cref="NameToken.Ch"/>.</param>
/// <param name="fieldType">The type of this field, must be <see cref="F:UglyToad.PdfPig.Tokens.NameToken.Ch" />.</param>
/// <param name="fieldFlags">The flags specifying behaviour for this field.</param>
/// <param name="information">Additional information for this field.</param>
/// <param name="options">The options in this field.</param>
@@ -49,7 +50,7 @@
AcroFieldCommonInformation information, IReadOnlyList<AcroChoiceOption> options,
IReadOnlyList<string> selectedOptions,
IReadOnlyList<int> selectedOptionIndices) :
base(dictionary, fieldType, (uint)fieldFlags, information)
base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.ComboBox, information)
{
Flags = fieldFlags;
Options = options ?? throw new ArgumentNullException(nameof(options));

View File

@@ -16,10 +16,15 @@
public DictionaryToken Dictionary { get; }
/// <summary>
/// The <see cref="string"/> representing the type of this field.
/// The <see cref="string"/> representing the type of this field in PDF format.
/// </summary>
[NotNull]
public string FieldType { get; }
public string RawFieldType { get; }
/// <summary>
/// The actual <see cref="AcroFieldType"/> represented by this field.
/// </summary>
public AcroFieldType FieldType { get; }
/// <summary>
/// Specifies various characteristics of the field.
@@ -36,15 +41,26 @@
/// Create a new <see cref="AcroFieldBase"/>.
/// </summary>
/// <param name="dictionary">The dictionary for this field.</param>
/// <param name="fieldType">The type of this field.</param>
/// <param name="rawFieldType">The PDF string type of this field.</param>
/// <param name="fieldFlags">The flags specifying behaviour for this field.</param>
/// <param name="fieldType">The type of this field.</param>
/// <param name="information">Additional information for this field.</param>
protected AcroFieldBase(DictionaryToken dictionary, string fieldType, uint fieldFlags, AcroFieldCommonInformation information)
protected AcroFieldBase(DictionaryToken dictionary, string rawFieldType,
uint fieldFlags,
AcroFieldType fieldType,
AcroFieldCommonInformation information)
{
Dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary));
FieldType = fieldType ?? throw new ArgumentNullException(nameof(fieldType));
RawFieldType = rawFieldType ?? throw new ArgumentNullException(nameof(rawFieldType));
FieldFlags = fieldFlags;
FieldType = fieldType;
Information = information ?? new AcroFieldCommonInformation(null, null, null, null);
}
/// <inheritdoc />
public override string ToString() => FieldType.ToString();
}
}

View File

@@ -47,7 +47,35 @@
/// <inheritdoc />
public override string ToString()
{
return $"Parent: {Parent}. Partial: {PartialName}. Alternate: {AlternateName}. Mapping: {MappingName}.";
string AppendIfNotNull(string val, string label, string result)
{
if (val == null)
{
return result;
}
if (result.Length > 0)
{
result += " ";
}
result += $"{label}: {val}.";
return result;
}
var s = string.Empty;
if (Parent != null)
{
s += $"Parent: {Parent}.";
}
s = AppendIfNotNull(PartialName, "Partial Name", s);
s = AppendIfNotNull(AlternateName, "Alternate Name", s);
s = AppendIfNotNull(MappingName, "Mapping Name", s);
return s;
}
}
}

View File

@@ -0,0 +1,41 @@
namespace UglyToad.PdfPig.AcroForms.Fields
{
/// <summary>
/// Indicates the type of field for an <see cref="AcroFieldBase"/>.
/// </summary>
public enum AcroFieldType
{
/// <summary>
/// A button that responds immediately to user input without retaining state.
/// </summary>
PushButton,
/// <summary>
/// A checkbox which toggles between on and off states.
/// </summary>
Checkbox,
/// <summary>
/// A set of radio buttons.
/// </summary>
RadioButton,
/// <summary>
/// A textbox allowing user input through the keyboard.
/// </summary>
Text,
/// <summary>
/// A dropdown list of options with an optional user-editable textbox.
/// </summary>
ComboBox,
/// <summary>
/// A list of options for the user to select from.
/// </summary>
ListBox,
/// <summary>
/// A field containing a digital signature.
/// </summary>
Signature,
/// <summary>
/// A field which acts as a container for other fields.
/// </summary>
NonTerminal
}
}

View File

@@ -44,6 +44,7 @@
/// </summary>
public bool SupportsMultiSelect => Flags.Equals(AcroChoiceFieldFlags.MultiSelect);
/// <inheritdoc />
/// <summary>
/// Create a new <see cref="AcroListBoxField"/>.
/// </summary>
@@ -60,7 +61,7 @@
IReadOnlyList<string> selectedOptions,
IReadOnlyList<int> selectedOptionIndices,
int? topIndex) :
base(dictionary, fieldType, (uint)fieldFlags, information)
base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.ListBox, information)
{
Flags = fieldFlags;
Options = options ?? throw new ArgumentNullException(nameof(options));

View File

@@ -0,0 +1,29 @@
namespace UglyToad.PdfPig.AcroForms.Fields
{
using System;
using System.Collections.Generic;
using Tokens;
/// <inheritdoc />
/// <summary>
/// A field which is not a leaf of the form's structure: it holds other fields as its children.
/// </summary>
public class AcroNonTerminalField : AcroFieldBase
{
    /// <summary>
    /// The fields directly beneath this field. Never <see langword="null"/>.
    /// </summary>
    public IReadOnlyList<AcroFieldBase> Children { get; }

    /// <inheritdoc />
    /// <summary>
    /// Create a new <see cref="AcroNonTerminalField"/>.
    /// </summary>
    /// <param name="dictionary">The dictionary for this field.</param>
    /// <param name="fieldType">The PDF string type of this field.</param>
    /// <param name="fieldFlags">The flags specifying behaviour for this field.</param>
    /// <param name="information">Additional information for this field.</param>
    /// <param name="children">The child fields of this field, must not be <see langword="null"/>.</param>
    internal AcroNonTerminalField(DictionaryToken dictionary, string fieldType, uint fieldFlags, AcroFieldCommonInformation information,
        IReadOnlyList<AcroFieldBase> children) :
        base(dictionary, fieldType, fieldFlags, AcroFieldType.NonTerminal, information)
    {
        if (children == null)
        {
            throw new ArgumentNullException(nameof(children));
        }

        Children = children;
    }
}
}

View File

@@ -2,13 +2,24 @@
{
using Tokens;
internal class AcroPushButtonField : AcroFieldBase
/// <inheritdoc />
/// <summary>
/// A push button responds immediately to user input without storing any state.
/// </summary>
public class AcroPushButtonField : AcroFieldBase
{
/// <summary>
/// The <see cref="AcroButtonFieldFlags"/> which define the behaviour of this button type.
/// </summary>
public AcroButtonFieldFlags Flags { get; }
/// <inheritdoc />
/// <summary>
/// Create a new <see cref="AcroPushButtonField"/>.
/// </summary>
public AcroPushButtonField(DictionaryToken dictionary, string fieldType, AcroButtonFieldFlags fieldFlags,
AcroFieldCommonInformation information) :
base(dictionary, fieldType, (uint)fieldFlags, information)
base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.PushButton, information)
{
Flags = fieldFlags;
}

View File

@@ -2,13 +2,24 @@
{
using Tokens;
internal class AcroRadioButtonsField : AcroFieldBase
/// <inheritdoc />
/// <summary>
/// A set of radio buttons.
/// </summary>
public class AcroRadioButtonsField : AcroFieldBase
{
/// <summary>
/// The <see cref="AcroButtonFieldFlags"/> which define the behaviour of this button type.
/// </summary>
public AcroButtonFieldFlags Flags { get; }
/// <inheritdoc />
/// <summary>
/// Create a new <see cref="AcroRadioButtonsField"/>.
/// </summary>
public AcroRadioButtonsField(DictionaryToken dictionary, string fieldType, AcroButtonFieldFlags fieldFlags,
AcroFieldCommonInformation information) :
base(dictionary, fieldType, (uint)fieldFlags, information)
base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.RadioButton, information)
{
Flags = fieldFlags;
}

View File

@@ -2,10 +2,18 @@
{
using Tokens;
internal class AcroSignatureField : AcroFieldBase
/// <inheritdoc />
/// <summary>
/// A digital signature field.
/// </summary>
public class AcroSignatureField : AcroFieldBase
{
/// <inheritdoc />
/// <summary>
/// Create a new <see cref="T:UglyToad.PdfPig.AcroForms.Fields.AcroSignatureField" />.
/// </summary>
public AcroSignatureField(DictionaryToken dictionary, string fieldType, uint fieldFlags, AcroFieldCommonInformation information) :
base(dictionary, fieldType, fieldFlags, information)
base(dictionary, fieldType, fieldFlags, AcroFieldType.Signature, information)
{
}
}

View File

@@ -47,7 +47,7 @@
/// <param name="maxLength">The maximum length.</param>
public AcroTextField(DictionaryToken dictionary, string fieldType, AcroTextFieldFlags fieldFlags,
AcroFieldCommonInformation information, string value, int? maxLength) :
base(dictionary, fieldType, (uint)fieldFlags, information)
base(dictionary, fieldType, (uint)fieldFlags, AcroFieldType.Text, information)
{
Flags = fieldFlags;
Value = value;
@@ -59,7 +59,7 @@
/// <inheritdoc />
public override string ToString()
{
return Value ?? string.Empty;
return $"{FieldType}: {Value ?? string.Empty}";
}
}
}

View File

@@ -1,21 +0,0 @@
namespace UglyToad.PdfPig.AcroForms.Fields
{
using System;
using System.Collections.Generic;
using Tokens;
/// <summary>
/// A non-leaf field in the form's structure.
/// </summary>
internal class NonTerminalAcroField : AcroFieldBase
{
public IReadOnlyList<AcroFieldBase> Children { get; }
public NonTerminalAcroField(DictionaryToken dictionary, string fieldType, uint fieldFlags, AcroFieldCommonInformation information,
IReadOnlyList<AcroFieldBase> children) :
base(dictionary, fieldType, fieldFlags, information)
{
Children = children ?? throw new ArgumentNullException(nameof(children));
}
}
}

View File

@@ -1,6 +1,7 @@
namespace UglyToad.PdfPig.Content
{
using System;
using System.Collections.Generic;
using Tokens;
using Util.JetBrains.Annotations;
@@ -10,6 +11,8 @@
/// </summary>
public class Catalog
{
private readonly IReadOnlyDictionary<int, PageTreeNode> pagesByNumber;
/// <summary>
/// The catalog dictionary containing assorted information.
/// </summary>
@@ -22,14 +25,58 @@
[NotNull]
public DictionaryToken PagesDictionary { get; }
/// <summary>
/// The page tree for this document containing all pages, page numbers and their dictionaries.
/// </summary>
public PageTreeNode PageTree { get; }
/// <summary>
/// Create a new <see cref="Catalog"/>.
/// </summary>
internal Catalog(DictionaryToken catalogDictionary, DictionaryToken pagesDictionary)
internal Catalog(DictionaryToken catalogDictionary, DictionaryToken pagesDictionary,
PageTreeNode pageTree)
{
CatalogDictionary = catalogDictionary ?? throw new ArgumentNullException(nameof(catalogDictionary));
PagesDictionary = pagesDictionary ?? throw new ArgumentNullException(nameof(pagesDictionary));
PageTree = pageTree ?? throw new ArgumentNullException(nameof(pageTree));
if (!pageTree.IsRoot)
{
throw new ArgumentException("Page tree must be the root page tree node.", nameof(pageTree));
}
var byNumber = new Dictionary<int, PageTreeNode>();
PopulatePageByNumberDictionary(pageTree, byNumber);
pagesByNumber = byNumber;
}
/// <summary>
/// Walks the page tree beneath <paramref name="node"/> depth-first and records every page node
/// in <paramref name="result"/>, keyed by its page number.
/// </summary>
/// <param name="node">The node to start the walk from.</param>
/// <param name="result">The dictionary which receives each page node found.</param>
/// <exception cref="InvalidOperationException">A page node was found which has no page number.</exception>
private static void PopulatePageByNumberDictionary(PageTreeNode node, Dictionary<int, PageTreeNode> result)
{
    var pending = new Stack<PageTreeNode>();
    pending.Push(node);

    while (pending.Count > 0)
    {
        var current = pending.Pop();

        if (!current.IsPage)
        {
            // Push the children back-to-front so they are popped, and therefore visited, front-to-back.
            var children = current.Children;
            for (var i = children.Count - 1; i >= 0; i--)
            {
                pending.Push(children[i]);
            }

            continue;
        }

        var number = current.PageNumber;
        if (number == null)
        {
            throw new InvalidOperationException($"Node was page but did not have page number: {current}.");
        }

        result[number.Value] = current;
    }
}
/// <summary>
/// Gets the page tree node recorded for the given page number.
/// </summary>
/// <param name="pageNumber">The number of the page to look up.</param>
/// <returns>The <see cref="PageTreeNode"/> stored for <paramref name="pageNumber"/>.</returns>
/// <exception cref="InvalidOperationException">No node is recorded for the page number.</exception>
internal PageTreeNode GetPageNode(int pageNumber)
{
    if (pagesByNumber.TryGetValue(pageNumber, out var found))
    {
        return found;
    }

    throw new InvalidOperationException($"Could not find page node by number for: {pageNumber}.");
}
}
}

View File

@@ -0,0 +1,94 @@
namespace UglyToad.PdfPig.Content
{
using System;
using System.Collections.Generic;
using Tokens;
using Util.JetBrains.Annotations;
/// <summary>
/// A single node of the PDF document's page tree.
/// A node is either a 'Page' (one page, which has no children) or a 'Pages' container
/// holding further Page or Pages nodes.
/// </summary>
public class PageTreeNode
{
    /// <summary>
    /// The dictionary for this node in the page tree.
    /// </summary>
    [NotNull]
    public DictionaryToken NodeDictionary { get; }

    /// <summary>
    /// The indirect reference for this node in the page tree.
    /// </summary>
    public IndirectReference Reference { get; }

    /// <summary>
    /// <see langword="true"/> when this node is a single page, <see langword="false"/> when it is a /Pages container.
    /// </summary>
    public bool IsPage { get; }

    /// <summary>
    /// The page number when <see cref="IsPage"/> is <see langword="true"/>; a container node never has one.
    /// </summary>
    public int? PageNumber { get; }

    /// <summary>
    /// The nodes contained by this node when <see cref="IsPage"/> is <see langword="false" />; empty for a page.
    /// </summary>
    [NotNull]
    public IReadOnlyList<PageTreeNode> Children { get; }

    /// <summary>
    /// The node which contains this node. This is <see langword="null"/> until a parent node is
    /// constructed with this node among its children, so it stays <see langword="null"/> for the root.
    /// </summary>
    [CanBeNull]
    public PageTreeNode Parent { get; private set; }

    /// <summary>
    /// Whether this node is the root node, i.e. it has no <see cref="Parent"/>.
    /// </summary>
    public bool IsRoot
    {
        get { return Parent == null; }
    }

    /// <summary>
    /// Create a new <see cref="PageTreeNode"/> and register it as the parent of each of its children.
    /// </summary>
    /// <param name="nodeDictionary">The dictionary for this node, must not be <see langword="null"/>.</param>
    /// <param name="reference">The indirect reference for this node.</param>
    /// <param name="isPage">Whether this node is a page rather than a container.</param>
    /// <param name="pageNumber">The page number, only permitted when <paramref name="isPage"/> is <see langword="true"/>.</param>
    /// <param name="children">The child nodes, must not be <see langword="null"/> and must be empty for a page.</param>
    internal PageTreeNode(DictionaryToken nodeDictionary, IndirectReference reference,
        bool isPage,
        int? pageNumber,
        IReadOnlyList<PageTreeNode> children)
    {
        if (nodeDictionary == null)
        {
            throw new ArgumentNullException(nameof(nodeDictionary));
        }

        if (children == null)
        {
            throw new ArgumentNullException(nameof(children));
        }

        if (isPage)
        {
            if (children.Count > 0)
            {
                throw new ArgumentException("Cannot define children on a page node.", nameof(children));
            }
        }
        else if (pageNumber.HasValue)
        {
            throw new ArgumentException("Cannot define page number for a pages node.", nameof(pageNumber));
        }

        NodeDictionary = nodeDictionary;
        Reference = reference;
        IsPage = isPage;
        PageNumber = pageNumber;
        Children = children;

        // Children are built before their container, so the back-link is filled in here.
        foreach (var childNode in children)
        {
            childNode.Parent = this;
        }
    }

    /// <inheritdoc />
    public override string ToString()
    {
        return IsPage
            ? $"Page #{PageNumber}: {NodeDictionary}."
            : $"Pages ({Children.Count} children): {NodeDictionary}";
    }
}
}

View File

@@ -2,144 +2,70 @@
{
using System;
using System.Collections.Generic;
using Logging;
using Parser.Parts;
using Tokenization.Scanner;
using Tokens;
using Util;
internal class Pages
{
private readonly ILog log;
private readonly Catalog catalog;
private readonly IPageFactory pageFactory;
private readonly bool isLenientParsing;
private readonly IPdfTokenScanner pdfScanner;
private readonly DictionaryToken rootPageDictionary;
private readonly Dictionary<int, DictionaryToken> locatedPages = new Dictionary<int, DictionaryToken>();
public int Count { get; }
internal Pages(ILog log, Catalog catalog, IPageFactory pageFactory, bool isLenientParsing, IPdfTokenScanner pdfScanner)
internal Pages(Catalog catalog, IPageFactory pageFactory, bool isLenientParsing,
IPdfTokenScanner pdfScanner)
{
if (catalog == null)
{
throw new ArgumentNullException(nameof(catalog));
}
rootPageDictionary = catalog.PagesDictionary;
Count = rootPageDictionary.GetIntOrDefault(NameToken.Count);
this.log = log;
this.catalog = catalog;
this.pageFactory = pageFactory;
this.catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
this.pageFactory = pageFactory ?? throw new ArgumentNullException(nameof(pageFactory));
this.isLenientParsing = isLenientParsing;
this.pdfScanner = pdfScanner;
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
Count = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
}
public Page GetPage(int pageNumber)
{
if (locatedPages.TryGetValue(pageNumber, out DictionaryToken targetPageDictionary))
if (pageNumber <= 0 || pageNumber > Count)
{
// TODO: cache the page
return pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(),
isLenientParsing);
throw new ArgumentOutOfRangeException(nameof(pageNumber),
$"Page number {pageNumber} invalid, must be between 1 and {Count}.");
}
var observed = new List<int>();
var pageNode = catalog.GetPageNode(pageNumber);
var pageStack = new Stack<PageTreeNode>();
var currentNode = pageNode;
while (currentNode != null)
{
pageStack.Push(currentNode);
currentNode = currentNode.Parent;
}
var pageTreeMembers = new PageTreeMembers();
// todo: running a search for a different, unloaded, page number, results in a bug.
var isFound = FindPage(rootPageDictionary, pageNumber, observed, pageTreeMembers);
if (!isFound || !locatedPages.TryGetValue(pageNumber, out targetPageDictionary))
while (pageStack.Count > 0)
{
throw new ArgumentOutOfRangeException("Could not find the page with number: " + pageNumber);
}
currentNode = pageStack.Pop();
var page = pageFactory.Create(pageNumber, targetPageDictionary, pageTreeMembers, isLenientParsing);
locatedPages[pageNumber] = targetPageDictionary;
return page;
}
private static int GetNextPageNumber(IReadOnlyList<int> pages)
{
if (pages.Count == 0)
{
return 1;
}
return pages[pages.Count - 1] + 1;
}
public bool FindPage(DictionaryToken currentPageDictionary, int soughtPageNumber, List<int> pageNumbersObserved, PageTreeMembers pageTreeMembers)
{
var type = currentPageDictionary.GetNameOrDefault(NameToken.Type);
if (type?.Equals(NameToken.Page) == true)
{
var pageNumber = GetNextPageNumber(pageNumbersObserved);
bool found = pageNumber == soughtPageNumber;
locatedPages[pageNumber] = currentPageDictionary;
pageNumbersObserved.Add(pageNumber);
return found;
}
if (type?.Equals(NameToken.Pages) != true)
{
log.Warn("Did not find the expected type (Page or Pages) in dictionary: " + currentPageDictionary);
return false;
}
if (currentPageDictionary.TryGet(NameToken.MediaBox, out var token))
{
var mediaBox = DirectObjectFinder.Get<ArrayToken>(token, pdfScanner);
pageTreeMembers.MediaBox = new MediaBox(mediaBox.ToRectangle());
}
if (currentPageDictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken))
{
pageTreeMembers.Rotation = rotateToken.Int;
}
if (!currentPageDictionary.TryGet(NameToken.Kids, out var kids)
|| !(kids is ArrayToken kidsArray))
{
return false;
}
pageFactory.LoadResources(currentPageDictionary, isLenientParsing);
bool childFound = false;
foreach (var kid in kidsArray.Data)
{
// todo: exit early
var child = DirectObjectFinder.Get<DictionaryToken>(kid, pdfScanner);
pageFactory.LoadResources(currentNode.NodeDictionary, isLenientParsing);
var thisPageMatches = FindPage(child, soughtPageNumber, pageNumbersObserved, pageTreeMembers);
if (thisPageMatches)
if (currentNode.NodeDictionary.TryGet(NameToken.MediaBox, pdfScanner, out ArrayToken mediaBox))
{
childFound = true;
break;
pageTreeMembers.MediaBox = new MediaBox(mediaBox.ToRectangle());
}
if (currentNode.NodeDictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken))
{
pageTreeMembers.Rotation = rotateToken.Int;
}
}
return childFound;
}
public IReadOnlyList<Page> GetAllPages()
{
return new Page[0];
var page = pageFactory.Create(pageNumber, pageNode.NodeDictionary, pageTreeMembers, isLenientParsing);
return page;
}
}
}

View File

@@ -5,6 +5,7 @@
using System.IO;
using Tokens;
/// <inheritdoc />
/// <summary>
/// ASCII 85 (Base85) is a binary to text encoding using 5 ASCII characters per 4 bytes of data.
/// </summary>
@@ -24,6 +25,10 @@
85 * 85 * 85 *85
};
/// <inheritdoc />
public bool IsSupported { get; } = true;
/// <inheritdoc />
public byte[] Decode(IReadOnlyList<byte> input, DictionaryToken streamDictionary, int filterIndex)
{
var asciiBuffer = new byte[5];

View File

@@ -5,6 +5,10 @@
using System.IO;
using Tokens;
/// <inheritdoc />
/// <summary>
/// Encodes/decodes data using the ASCII hexadecimal encoding where each byte is represented by two ASCII characters.
/// </summary>
internal class AsciiHexDecodeFilter : IFilter
{
private static readonly short[] ReverseHex =
@@ -22,6 +26,10 @@
/* 100 */ 13, 14, 15
};
/// <inheritdoc />
public bool IsSupported { get; } = true;
/// <inheritdoc />
public byte[] Decode(IReadOnlyList<byte> input, DictionaryToken streamDictionary, int filterIndex)
{
var pair = new byte[2];

View File

@@ -6,6 +6,10 @@
internal class CcittFaxDecodeFilter : IFilter
{
/// <inheritdoc />
public bool IsSupported { get; } = false;
/// <inheritdoc />
public byte[] Decode(IReadOnlyList<byte> input, DictionaryToken streamDictionary, int filterIndex)
{
throw new NotSupportedException("The CCITT Fax Filter for image data is not currently supported. " +

View File

@@ -6,6 +6,10 @@
internal class DctDecodeFilter : IFilter
{
/// <inheritdoc />
public bool IsSupported { get; } = false;
/// <inheritdoc />
public byte[] Decode(IReadOnlyList<byte> input, DictionaryToken streamDictionary, int filterIndex)
{
throw new NotSupportedException("The DST (Discrete Cosine Transform) Filter indicates data is encoded in JPEG format. " +

View File

@@ -9,8 +9,12 @@
using Tokens;
using Util;
/// <inheritdoc />
/// <summary>
///
/// The Flate filter is based on the public-domain zlib/deflate compression method, a variable-length Lempel-Ziv
/// adaptive compression method cascaded with adaptive Huffman coding.
/// It is fully defined in Internet RFCs 1950, ZLIB Compressed Data Format Specification, and
/// 1951, DEFLATE Compressed Data Format Specification
/// </summary>
/// <remarks>
/// See section 3.3.3 of the spec (version 1.7) for details on the FlateDecode filter.
@@ -34,6 +38,10 @@
this.log = log;
}
/// <inheritdoc />
public bool IsSupported { get; } = true;
/// <inheritdoc />
public byte[] Decode(IReadOnlyList<byte> input, DictionaryToken streamDictionary, int filterIndex)
{
if (input == null)

View File

@@ -3,8 +3,24 @@
using System.Collections.Generic;
using Tokens;
/// <summary>
/// A filter is used in a PDF to encode/decode data either to compress it
/// or derive an ASCII representation of the data.
/// </summary>
internal interface IFilter
{
/// <summary>
/// Whether this library can decode information encoded using this filter.
/// </summary>
bool IsSupported { get; }
/// <summary>
/// Decodes data encoded using this filter type.
/// </summary>
/// <param name="input">The encoded bytes which were encoded using this filter.</param>
/// <param name="streamDictionary">The dictionary of the <see cref="StreamToken"/> (or other dictionary types, e.g. inline images) containing these bytes.</param>
/// <param name="filterIndex">The position of this filter in the pipeline used to encode data.</param>
/// <returns>The decoded bytes.</returns>
byte[] Decode(IReadOnlyList<byte> input, DictionaryToken streamDictionary, int filterIndex);
}
}

View File

@@ -6,6 +6,10 @@
internal class Jbig2DecodeFilter : IFilter
{
/// <inheritdoc />
public bool IsSupported { get; } = false;
/// <inheritdoc />
public byte[] Decode(IReadOnlyList<byte> input, DictionaryToken streamDictionary, int filterIndex)
{
throw new NotSupportedException("The JBIG2 Filter for monochrome image data is not currently supported. " +

View File

@@ -6,6 +6,10 @@
internal class JpxDecodeFilter : IFilter
{
/// <inheritdoc />
public bool IsSupported { get; } = false;
/// <inheritdoc />
public byte[] Decode(IReadOnlyList<byte> input, DictionaryToken streamDictionary, int filterIndex)
{
throw new NotSupportedException("The JPX Filter (JPEG2000) for image data is not currently supported. " +

View File

@@ -5,6 +5,11 @@
using Tokens;
using Util;
/// <inheritdoc />
/// <summary>
/// The LZW (Lempel-Ziv-Welch) filter is a variable-length, adaptive compression method
/// that has been adopted as one of the standard compression methods in the Tag Image File Format (TIFF) standard.
/// </summary>
internal class LzwFilter : IFilter
{
private const int DefaultColors = 1;
@@ -27,6 +32,10 @@
this.pngPredictor = pngPredictor ?? throw new ArgumentNullException(nameof(pngPredictor));
}
/// <inheritdoc />
public bool IsSupported { get; } = true;
/// <inheritdoc />
public byte[] Decode(IReadOnlyList<byte> input, DictionaryToken streamDictionary, int filterIndex)
{
var parameters = decodeParameterResolver.GetFilterParameters(streamDictionary, filterIndex);

View File

@@ -4,10 +4,19 @@
using System.IO;
using Tokens;
/// <inheritdoc />
/// <summary>
/// The Run Length filter encodes data in a simple byte-oriented format based on run length.
/// The encoded data is a sequence of runs, where each run consists of a length byte followed by 1 to 128 bytes of data.
/// </summary>
internal class RunLengthFilter : IFilter
{
private const byte EndOfDataLength = 128;
/// <inheritdoc />
public bool IsSupported { get; } = true;
/// <inheritdoc />
public byte[] Decode(IReadOnlyList<byte> input, DictionaryToken streamDictionary, int filterIndex)
{
using (var memoryStream = new MemoryStream())

View File

@@ -212,9 +212,9 @@ namespace UglyToad.PdfPig.Geometry
}
/// <summary>
/// The rectangle completely containing the path.
/// Gets a <see cref="PdfRectangle"/> which entirely contains the geometry of the defined path.
/// </summary>
/// <returns></returns>
/// <returns>For paths which don't define any geometry this returns <see langword="null"/>.</returns>
public PdfRectangle? GetBoundingRectangle()
{
if (commands.Count == 0)

View File

@@ -1,15 +1,19 @@
namespace UglyToad.PdfPig.Parser
{
using System;
using System.Collections.Generic;
using Content;
using Exceptions;
using Parts;
using Tokenization.Scanner;
using Tokens;
using Util;
internal class CatalogFactory
internal static class CatalogFactory
{
public Catalog Create(IPdfTokenScanner scanner, DictionaryToken dictionary)
public static Catalog Create(IndirectReference rootReference, DictionaryToken dictionary,
IPdfTokenScanner scanner,
bool isLenientParsing)
{
if (dictionary == null)
{
@@ -26,9 +30,116 @@
throw new PdfDocumentFormatException($"No pages entry was found in the catalog dictionary: {dictionary}.");
}
var pages = DirectObjectFinder.Get<DictionaryToken>(value, scanner);
DictionaryToken pages;
var pagesReference = rootReference;
if (value is IndirectReferenceToken pagesRef)
{
pagesReference = pagesRef.Data;
pages = DirectObjectFinder.Get<DictionaryToken>(pagesRef, scanner);
}
else if (value is DictionaryToken pagesDict)
{
pages = pagesDict;
}
else
{
pages = DirectObjectFinder.Get<DictionaryToken>(value, scanner);
}
var pageNumber = 0;
var pageTree = ProcessPagesNode(pagesReference, pages, new IndirectReference(1, 0), true,
scanner, isLenientParsing, ref pageNumber);
return new Catalog(dictionary, pages, pageTree);
}
/// <summary>
/// Recursively converts a node of the document's pages tree into a <see cref="PageTreeNode"/>.
/// A node classified as a page becomes a leaf carrying the next page number; any other node is
/// treated as an intermediate pages node and each entry of its kids array is processed in turn.
/// </summary>
/// <param name="reference">The indirect reference identifying this node; passed as the parent reference when recursing into kids.</param>
/// <param name="nodeDictionary">The dictionary of the node being processed.</param>
/// <param name="parentReference">The reference the node's parent entry is compared against (only checked when not lenient and not root).</param>
/// <param name="isRoot">When <see langword="true"/> the parent entry validation is skipped.</param>
/// <param name="pdfTokenScanner">The scanner used to resolve dictionary entries and kid references.</param>
/// <param name="isLenientParsing">When <see langword="true"/> a missing/invalid type, a missing parent and a missing kids array are tolerated instead of throwing.</param>
/// <param name="pageNumber">Running page counter shared across the recursion; incremented once for every page node encountered.</param>
/// <returns>The tree node for this dictionary, including all descendant nodes.</returns>
/// <exception cref="PdfDocumentFormatException">
/// Thrown for structural problems in the tree: always for a kids entry which is not an indirect reference or
/// which cannot be resolved to a dictionary, and in non-lenient mode also for a missing or invalid type,
/// a missing or mismatched parent reference, or a missing kids array.
/// </exception>
private static PageTreeNode ProcessPagesNode(IndirectReference reference, DictionaryToken nodeDictionary,
IndirectReference parentReference,
bool isRoot,
IPdfTokenScanner pdfTokenScanner,
bool isLenientParsing,
ref int pageNumber)
{
var isPage = false;
// Step 1: classify the node as a page (leaf) or a pages (intermediate) node.
if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type))
{
if (!isLenientParsing)
{
throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}.");
}
// Lenient fallback: with no type entry, a node without a kids array is assumed to be a page.
if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _))
{
isPage = true;
}
}
else
{
isPage = type.Equals(NameToken.Page);
// Any type other than Page or Pages is rejected unless parsing leniently,
// in which case the node falls through and is handled as an intermediate node.
if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing)
{
throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}.");
}
}
// Step 2: in strict mode every non-root node must point back at the node it was reached from.
if (!isLenientParsing && !isRoot)
{
if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken))
{
throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}.");
}
if (!parentReferenceToken.Data.Equals(parentReference))
{
throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}.");
}
}
// Step 3: a page is a leaf; the shared counter is incremented before it is assigned to the node.
if (isPage)
{
pageNumber++;
var thisNode = new PageTreeNode(nodeDictionary, reference, true,
pageNumber,
EmptyArray<PageTreeNode>.Instance);
return thisNode;
}
// NOTE(review): the return below uses `dictionary` and `pages`, neither of which is declared in this
// method; it looks like a removed line carried over from the diff rendering - confirm before relying on it.
return new Catalog(dictionary, pages);
// Step 4: intermediate node - resolve the kids array (an empty array is substituted when lenient).
if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
{
if (!isLenientParsing)
{
throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {nodeDictionary}.");
}
kids = new ArrayToken(EmptyArray<IToken>.Instance);
}
var nodeChildren = new List<PageTreeNode>();
foreach (var kid in kids.Data)
{
// These two checks throw regardless of the lenient parsing setting.
if (!(kid is IndirectReferenceToken kidRef))
{
throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}.");
}
if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken))
{
throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}.");
}
// Recurse with this node's reference as the expected parent; kids are never treated as root.
var kidNode = ProcessPagesNode(kidRef.Data, kidDictionaryToken, reference, false, pdfTokenScanner, isLenientParsing, ref pageNumber);
nodeChildren.Add(kidNode);
}
// Intermediate nodes carry no page number of their own.
return new PageTreeNode(nodeDictionary, reference, false, null, nodeChildren);
}
}
}

View File

@@ -6,9 +6,15 @@
using Tokenization.Scanner;
using Tokens;
internal class DocumentInformationFactory
/// <summary>
/// Parse the dictionary from a PDF file trailer.
/// </summary>
internal static class DocumentInformationFactory
{
public DocumentInformation Create(IPdfTokenScanner pdfTokenScanner, TrailerDictionary trailer)
/// <summary>
/// Convert the file trailer dictionary into a <see cref="DocumentInformation"/> instance.
/// </summary>
public static DocumentInformation Create(IPdfTokenScanner pdfTokenScanner, TrailerDictionary trailer)
{
if (!trailer.Info.HasValue)
{

View File

@@ -70,7 +70,6 @@
{
var log = container.Get<ILog>();
var filterProvider = container.Get<IFilterProvider>();
var catalogFactory = new CatalogFactory();
var cMapCache = new CMapCache(new CMapParser());
CrossReferenceTable crossReferenceTable = null;
@@ -104,7 +103,9 @@
var compactFontFormatParser = new CompactFontFormatParser(new CompactFontFormatIndividualFontParser(compactFontFormatIndexReader, new CompactFontFormatTopLevelDictionaryReader(),
new CompactFontFormatPrivateDictionaryReader()), compactFontFormatIndexReader);
var rootDictionary = ParseTrailer(crossReferenceTable, isLenientParsing, pdfScanner, out var encryptionDictionary);
var (rootReference, rootDictionary) = ParseTrailer(crossReferenceTable, isLenientParsing,
pdfScanner,
out var encryptionDictionary);
var encryptionHandler = encryptionDictionary != null ? (IEncryptionHandler)new EncryptionHandler(encryptionDictionary, crossReferenceTable.Trailer, password ?? string.Empty)
: NoOpEncryptionHandler.Instance;
@@ -124,14 +125,13 @@
var resourceContainer = new ResourceStore(pdfScanner, fontFactory);
var information = DocumentInformationFactory.Create(pdfScanner, crossReferenceTable.Trailer);
var catalog = CatalogFactory.Create(rootReference, rootDictionary, pdfScanner, isLenientParsing);
var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider,
new PageContentParser(new ReflectionGraphicsStateOperationFactory()),
log);
var informationFactory = new DocumentInformationFactory();
var information = informationFactory.Create(pdfScanner, crossReferenceTable.Trailer);
var catalog = catalogFactory.Create(pdfScanner, rootDictionary);
var caching = new ParsingCachingProviders(bruteForceSearcher, resourceContainer);
@@ -144,7 +144,7 @@
acroFormFactory);
}
private static DictionaryToken ParseTrailer(CrossReferenceTable crossReferenceTable, bool isLenientParsing, IPdfTokenScanner pdfTokenScanner,
private static (IndirectReference, DictionaryToken) ParseTrailer(CrossReferenceTable crossReferenceTable, bool isLenientParsing, IPdfTokenScanner pdfTokenScanner,
out EncryptionDictionary encryptionDictionary)
{
encryptionDictionary = null;
@@ -157,8 +157,6 @@
}
encryptionDictionary = EncryptionDictionaryFactory.Read(encryptionDictionaryToken, pdfTokenScanner);
//throw new NotSupportedException("Cannot currently parse a document using encryption: " + crossReferenceTable.Trailer.EncryptionToken);
}
var rootDictionary = DirectObjectFinder.Get<DictionaryToken>(crossReferenceTable.Trailer.Root, pdfTokenScanner);
@@ -168,7 +166,7 @@
rootDictionary = rootDictionary.With(NameToken.Type, NameToken.Catalog);
}
return rootDictionary;
return (crossReferenceTable.Trailer.Root, rootDictionary);
}
}
}

View File

@@ -98,7 +98,7 @@
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
Information = information ?? throw new ArgumentNullException(nameof(information));
pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner);
pages = new Pages(catalog, pageFactory, isLenientParsing, pdfScanner);
Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
documentForm = new Lazy<AcroForm>(() => acroFormFactory.GetAcroForm(catalog));
}