Merge pull request #435 from theolivenbaum/master

Remove recursive algorithm on netstandard2.0 and above
This commit is contained in:
Eliot Jones
2022-04-10 11:50:02 -04:00
committed by GitHub
2 changed files with 148 additions and 58 deletions

View File

@@ -38,7 +38,7 @@
/// The child nodes of this node if <see cref="IsPage"/> is <see langword="false" /> /// The child nodes of this node if <see cref="IsPage"/> is <see langword="false" />
/// </summary> /// </summary>
[NotNull] [NotNull]
public IReadOnlyList<PageTreeNode> Children { get; } public IReadOnlyList<PageTreeNode> Children { get; private set; }
/// <summary> /// <summary>
/// The parent node of this node, unless it is the root node. /// The parent node of this node, unless it is the root node.
@@ -56,13 +56,21 @@
/// </summary> /// </summary>
internal PageTreeNode(DictionaryToken nodeDictionary, IndirectReference reference, internal PageTreeNode(DictionaryToken nodeDictionary, IndirectReference reference,
bool isPage, bool isPage,
int? pageNumber, int? pageNumber)
IReadOnlyList<PageTreeNode> children)
{ {
NodeDictionary = nodeDictionary ?? throw new ArgumentNullException(nameof(nodeDictionary)); NodeDictionary = nodeDictionary ?? throw new ArgumentNullException(nameof(nodeDictionary));
Reference = reference; Reference = reference;
IsPage = isPage; IsPage = isPage;
PageNumber = pageNumber; PageNumber = pageNumber;
if (!IsPage && PageNumber.HasValue)
{
throw new ArgumentException("Cannot define page number for a pages node.", nameof(pageNumber));
}
}
internal PageTreeNode WithChildren(IReadOnlyList<PageTreeNode> children)
{
Children = children ?? throw new ArgumentNullException(nameof(children)); Children = children ?? throw new ArgumentNullException(nameof(children));
if (IsPage && Children.Count > 0) if (IsPage && Children.Count > 0)
@@ -70,15 +78,11 @@
throw new ArgumentException("Cannot define children on a page node.", nameof(children)); throw new ArgumentException("Cannot define children on a page node.", nameof(children));
} }
if (!IsPage && pageNumber.HasValue)
{
throw new ArgumentException("Cannot define page number for a pages node.", nameof(pageNumber));
}
foreach (var child in Children) foreach (var child in Children)
{ {
child.Parent = this; child.Parent = this;
} }
return this;
} }
/// <inheritdoc /> /// <inheritdoc />

View File

@@ -10,6 +10,16 @@
internal static class CatalogFactory internal static class CatalogFactory
{ {
/// <summary>
/// Reference-type running total of the pages numbered so far. Passing one instance through the
/// page tree traversal lets every call share the same count without a <c>ref</c> parameter.
/// </summary>
private class PageCounter
{
    /// <summary>
    /// How many times <see cref="Increment"/> has been called on this instance.
    /// </summary>
    public int PageCount { get; private set; }

    /// <summary>
    /// Advances <see cref="PageCount"/> by one.
    /// </summary>
    public void Increment() => PageCount += 1;
}
public static Catalog Create(IndirectReference rootReference, DictionaryToken dictionary, public static Catalog Create(IndirectReference rootReference, DictionaryToken dictionary,
IPdfTokenScanner scanner, IPdfTokenScanner scanner,
bool isLenientParsing) bool isLenientParsing)
@@ -46,75 +56,157 @@
pages = DirectObjectFinder.Get<DictionaryToken>(value, scanner); pages = DirectObjectFinder.Get<DictionaryToken>(value, scanner);
} }
var pageNumber = 0; var pageNumber = new PageCounter();
var pageTree = ProcessPagesNode(pagesReference, pages, new IndirectReference(1, 0), true, var pageTree = ProcessPagesNode(pagesReference, pages, new IndirectReference(1, 0), true,
scanner, isLenientParsing, ref pageNumber); scanner, isLenientParsing, pageNumber);
return new Catalog(dictionary, pages, pageTree); return new Catalog(dictionary, pages, pageTree);
} }
#if NETSTANDARD2_0_OR_GREATER
/// <summary>
/// Book-keeping for a pages (non-leaf) node whose kids are still being visited by the
/// iterative page tree traversal.
/// </summary>
private sealed class PendingPagesNode
{
    public PendingPagesNode(PageTreeNode node, IndirectReference reference, IReadOnlyList<IToken> kids)
    {
        Node = node;
        Reference = reference;
        Kids = kids;
        Children = new List<PageTreeNode>(kids.Count);
    }

    /// <summary>The pages node being populated.</summary>
    public PageTreeNode Node { get; }

    /// <summary>The indirect reference of <see cref="Node"/>; its kids must name it as their parent.</summary>
    public IndirectReference Reference { get; }

    /// <summary>The entries of the node's kids array, in document order.</summary>
    public IReadOnlyList<IToken> Kids { get; }

    /// <summary>The child nodes built so far, in the same order as <see cref="Kids"/>.</summary>
    public List<PageTreeNode> Children { get; }

    /// <summary>Index into <see cref="Kids"/> of the next entry to visit.</summary>
    public int NextKidIndex { get; set; }
}

/// <summary>
/// Builds the <see cref="PageTreeNode"/> for the given node and everything below it without recursion.
/// </summary>
/// <remarks>
/// The tree is walked depth-first, visiting kids in the order they appear in each kids array, so
/// pages are numbered in the same order as the recursive implementation numbers them. A breadth-first
/// walk would number pages level by level and give the wrong numbers for nested pages nodes.
/// </remarks>
/// <param name="referenceInput">The indirect reference of the node to process.</param>
/// <param name="nodeDictionaryInput">The dictionary of the node to process.</param>
/// <param name="parentReferenceInput">The reference the node must name as its parent (checked only when not lenient and not the root).</param>
/// <param name="isRoot">Whether the node is the root of the page tree; the root is exempt from the parent check.</param>
/// <param name="pdfTokenScanner">Scanner used to resolve indirect references.</param>
/// <param name="isLenientParsing">When <see langword="true"/>, missing types, invalid types and missing kids arrays are tolerated.</param>
/// <param name="pageNumber">Shared counter incremented once for every page encountered.</param>
/// <returns>The node for <paramref name="referenceInput"/> with its children (and their parents) populated.</returns>
/// <exception cref="PdfDocumentFormatException">
/// The tree is malformed: an invalid kids entry, an unresolvable kid, a cycle, or (when not lenient)
/// a missing/invalid type, missing kids array or mismatched parent reference.
/// </exception>
private static PageTreeNode ProcessPagesNode(IndirectReference referenceInput, DictionaryToken nodeDictionaryInput, IndirectReference parentReferenceInput, bool isRoot, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing, PageCounter pageNumber)
{
    if (CheckIfIsPage(nodeDictionaryInput, parentReferenceInput, isRoot, pdfTokenScanner, isLenientParsing))
    {
        pageNumber.Increment();

        return new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray<PageTreeNode>.Instance);
    }

    var rootNode = new PageTreeNode(nodeDictionaryInput, referenceInput, false, null);

    // References of the pages nodes currently on the stack, i.e. the ancestors of whatever is visited next.
    // Meeting one of them again means the kids arrays form a cycle, which would otherwise never terminate.
    var ancestors = new HashSet<IndirectReference> { referenceInput };

    // Explicit stack instead of recursion: the top entry is the pages node whose kids are being visited.
    var pending = new Stack<PendingPagesNode>();
    pending.Push(new PendingPagesNode(rootNode, referenceInput, GetKids(nodeDictionaryInput, pdfTokenScanner, isLenientParsing)));

    while (pending.Count > 0)
    {
        var current = pending.Peek();

        if (current.NextKidIndex >= current.Kids.Count)
        {
            // Every kid has been visited, so the child list is complete and can be attached.
            pending.Pop();
            ancestors.Remove(current.Reference);
            current.Node.WithChildren(current.Children);
            continue;
        }

        var kid = current.Kids[current.NextKidIndex];
        current.NextKidIndex++;

        if (!(kid is IndirectReferenceToken kidRef))
        {
            throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}.");
        }

        if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken))
        {
            throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}.");
        }

        if (CheckIfIsPage(kidDictionaryToken, current.Reference, false, pdfTokenScanner, isLenientParsing))
        {
            pageNumber.Increment();
            current.Children.Add(new PageTreeNode(kidDictionaryToken, kidRef.Data, true, pageNumber.PageCount).WithChildren(EmptyArray<PageTreeNode>.Instance));
            continue;
        }

        if (!ancestors.Add(kidRef.Data))
        {
            throw new PdfDocumentFormatException($"Pages tree contains a cycle: pages node {kidRef.Data} is its own ancestor.");
        }

        // Descend into the nested pages node before visiting the remaining siblings so that
        // the pages beneath it are numbered ahead of any later sibling.
        var kidNode = new PageTreeNode(kidDictionaryToken, kidRef.Data, false, null);
        current.Children.Add(kidNode);
        pending.Push(new PendingPagesNode(kidNode, kidRef.Data, GetKids(kidDictionaryToken, pdfTokenScanner, isLenientParsing)));
    }

    return rootNode;

    // Returns the entries of a pages node's kids array; a missing array is an error unless parsing is lenient.
    static IReadOnlyList<IToken> GetKids(DictionaryToken nodeDictionary, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing)
    {
        if (nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
        {
            return kids.Data;
        }

        if (!isLenientParsing)
        {
            throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {nodeDictionary}.");
        }

        return EmptyArray<IToken>.Instance;
    }

    // Decides whether a node is a page (leaf) or a pages (intermediate) node and, when parsing is not
    // lenient, validates its type and its parent reference.
    static bool CheckIfIsPage(DictionaryToken nodeDictionary, IndirectReference parentReference, bool isRoot, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing)
    {
        var isPage = false;

        if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type))
        {
            if (!isLenientParsing)
            {
                throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}.");
            }

            // Without a type, a node that has no kids array is treated as a page.
            if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _))
            {
                isPage = true;
            }
        }
        else
        {
            isPage = type.Equals(NameToken.Page);

            if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing)
            {
                throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}.");
            }
        }

        if (!isLenientParsing && !isRoot)
        {
            if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken))
            {
                throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}.");
            }

            if (!parentReferenceToken.Data.Equals(parentReference))
            {
                throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}.");
            }
        }

        return isPage;
    }
}
#endif
// Cap the recursion depth so the recursive algorithm below cannot throw a StackOverflowException.
// It should probably be refactored to not be recursive.
private const ushort MAX_TREE_DEPTH = 1024;
private static PageTreeNode ProcessPagesNode(IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference, bool isRoot, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing, PageCounter pageNumber, int depth = 0)
{
depth++;
if (depth > MAX_TREE_DEPTH) { throw new PdfDocumentFormatException($"Tree exceeded maximum depth of {MAX_TREE_DEPTH}, aborting."); }
var isPage = false; var isPage = false;
if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type)) if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type))
{ {
if (!isLenientParsing) if (!isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}."); }
{
throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}.");
}
if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _)) if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _)) { isPage = true; }
{
isPage = true;
}
} }
else else
{ {
isPage = type.Equals(NameToken.Page); isPage = type.Equals(NameToken.Page);
if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing) if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}."); }
{
throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}.");
}
} }
if (!isLenientParsing && !isRoot) if (!isLenientParsing && !isRoot)
{ {
if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken)) if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken)) { throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}."); }
{
throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}.");
}
if (!parentReferenceToken.Data.Equals(parentReference)) if (!parentReferenceToken.Data.Equals(parentReference)) { throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}."); }
{
throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}.");
}
} }
if (isPage) if (isPage)
{ {
pageNumber++; pageNumber.Increment();
var newPage = new PageTreeNode(nodeDictionary, reference, true, pageNumber.PageCount).WithChildren(EmptyArray<PageTreeNode>.Instance);
var thisNode = new PageTreeNode(nodeDictionary, reference, true, return newPage;
pageNumber,
EmptyArray<PageTreeNode>.Instance);
return thisNode;
} }
if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids)) if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
{ {
if (!isLenientParsing) if (!isLenientParsing) { throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {nodeDictionary}."); }
{
throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {nodeDictionary}.");
}
kids = new ArrayToken(EmptyArray<IToken>.Instance); kids = new ArrayToken(EmptyArray<IToken>.Instance);
} }
@@ -123,22 +215,16 @@
foreach (var kid in kids.Data) foreach (var kid in kids.Data)
{ {
if (!(kid is IndirectReferenceToken kidRef)) if (!(kid is IndirectReferenceToken kidRef)) { throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}."); }
{
throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}.");
}
if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken)) if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken)) { throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}."); }
{
throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}.");
}
var kidNode = ProcessPagesNode(kidRef.Data, kidDictionaryToken, reference, false, pdfTokenScanner, isLenientParsing, ref pageNumber); var kidNode = ProcessPagesNode(kidRef.Data, kidDictionaryToken, reference, false, pdfTokenScanner, isLenientParsing, pageNumber, depth);
nodeChildren.Add(kidNode); nodeChildren.Add(kidNode);
} }
return new PageTreeNode(nodeDictionary, reference, false, null, nodeChildren); return new PageTreeNode(nodeDictionary, reference, false, null).WithChildren(nodeChildren);
} }
} }
} }