refactor ProcessPagesNode not to use a recursive algorithm

This commit is contained in:
rafael-aero
2022-03-28 22:49:48 +02:00
parent c14238676c
commit 75a9665ff6
2 changed files with 143 additions and 64 deletions

View File

@@ -38,7 +38,7 @@
/// The child nodes of this node if <see cref="IsPage"/> is <see langword="false" /> /// The child nodes of this node if <see cref="IsPage"/> is <see langword="false" />
/// </summary> /// </summary>
[NotNull] [NotNull]
public IReadOnlyList<PageTreeNode> Children { get; } public IReadOnlyList<PageTreeNode> Children { get; private set; }
/// <summary> /// <summary>
/// The parent node of this node, unless it is the root node. /// The parent node of this node, unless it is the root node.
@@ -56,13 +56,21 @@
/// </summary> /// </summary>
internal PageTreeNode(DictionaryToken nodeDictionary, IndirectReference reference, internal PageTreeNode(DictionaryToken nodeDictionary, IndirectReference reference,
bool isPage, bool isPage,
int? pageNumber, int? pageNumber)
IReadOnlyList<PageTreeNode> children)
{ {
NodeDictionary = nodeDictionary ?? throw new ArgumentNullException(nameof(nodeDictionary)); NodeDictionary = nodeDictionary ?? throw new ArgumentNullException(nameof(nodeDictionary));
Reference = reference; Reference = reference;
IsPage = isPage; IsPage = isPage;
PageNumber = pageNumber; PageNumber = pageNumber;
if (!IsPage && PageNumber.HasValue)
{
throw new ArgumentException("Cannot define page number for a pages node.", nameof(pageNumber));
}
}
internal PageTreeNode WithChildren(IReadOnlyList<PageTreeNode> children)
{
Children = children ?? throw new ArgumentNullException(nameof(children)); Children = children ?? throw new ArgumentNullException(nameof(children));
if (IsPage && Children.Count > 0) if (IsPage && Children.Count > 0)
@@ -70,15 +78,11 @@
throw new ArgumentException("Cannot define children on a page node.", nameof(children)); throw new ArgumentException("Cannot define children on a page node.", nameof(children));
} }
if (!IsPage && pageNumber.HasValue)
{
throw new ArgumentException("Cannot define page number for a pages node.", nameof(pageNumber));
}
foreach (var child in Children) foreach (var child in Children)
{ {
child.Parent = this; child.Parent = this;
} }
return this;
} }
/// <inheritdoc /> /// <inheritdoc />

View File

@@ -11,9 +11,14 @@
internal static class CatalogFactory internal static class CatalogFactory
{ {
// Keep the algorithm below from throwing a StackOverflow exception. private class PageCounter
// It probably should be refactored to not be recursive {
private const ushort MAX_TREE_DEPTH = 1024; public int PageCount { get; private set; }
public void Increment()
{
PageCount++;
}
}
public static Catalog Create(IndirectReference rootReference, DictionaryToken dictionary, public static Catalog Create(IndirectReference rootReference, DictionaryToken dictionary,
IPdfTokenScanner scanner, IPdfTokenScanner scanner,
@@ -51,81 +56,157 @@
pages = DirectObjectFinder.Get<DictionaryToken>(value, scanner); pages = DirectObjectFinder.Get<DictionaryToken>(value, scanner);
} }
var pageNumber = 0; var pageNumber = new PageCounter();
var pageTree = ProcessPagesNode(pagesReference, pages, new IndirectReference(1, 0), true, var pageTree = ProcessPagesNode(pagesReference, pages, new IndirectReference(1, 0), true,
scanner, isLenientParsing, ref pageNumber, 0); scanner, isLenientParsing, pageNumber);
return new Catalog(dictionary, pages, pageTree); return new Catalog(dictionary, pages, pageTree);
} }
private static PageTreeNode ProcessPagesNode(IndirectReference reference, DictionaryToken nodeDictionary, #if NETSTANDARD2_0_OR_GREATER
IndirectReference parentReference,
/// <summary>
/// Builds the page tree for the given node without recursion, so that deeply nested trees cannot overflow the call stack.
/// Kids are visited depth-first in the order they appear in each Kids array, so pages are numbered in the same
/// order as the recursive overload numbers them.
/// </summary>
/// <param name="referenceInput">The indirect reference of the node to process.</param>
/// <param name="nodeDictionaryInput">The dictionary of the node to process.</param>
/// <param name="parentReferenceInput">The reference the node's Parent entry is compared against when parsing is not lenient and the node is not the root.</param>
/// <param name="isRoot">Whether the node is the root of the pages tree; the Parent check is skipped for the root.</param>
/// <param name="pdfTokenScanner">The scanner used to resolve indirect references.</param>
/// <param name="isLenientParsing">When <see langword="true" />, a missing Type, an invalid Type and a missing Kids array are tolerated instead of throwing.</param>
/// <param name="pageNumber">The shared counter, incremented once for every page node created.</param>
/// <returns>The node for <paramref name="referenceInput"/> with its children (and their parents) assigned.</returns>
/// <exception cref="PdfDocumentFormatException">The tree is malformed, or a Pages node is its own ancestor.</exception>
private static PageTreeNode ProcessPagesNode(IndirectReference referenceInput, DictionaryToken nodeDictionaryInput, IndirectReference parentReferenceInput, bool isRoot, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing, PageCounter pageNumber)
{
    if (CheckIfIsPage(nodeDictionaryInput, parentReferenceInput, isRoot, pdfTokenScanner, isLenientParsing))
    {
        pageNumber.Increment();
        return new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray<PageTreeNode>.Instance);
    }

    var rootNode = new PageTreeNode(nodeDictionaryInput, referenceInput, false, null);

    // Every Pages node together with the list collecting its children; WithChildren is applied once the lists are complete.
    var pendingNodes = new List<(PageTreeNode node, List<PageTreeNode> children)>();

    // References of the Pages nodes on the path from the root to the node currently being expanded.
    // Membership uses IndirectReference equality, the same Equals the parent check relies on.
    var ancestors = new HashSet<IndirectReference>();

    // LIFO work list. Kids are pushed in reverse so they pop in Kids-array order, which yields a pre-order walk.
    // An entry with leavesOwner == true marks the point where the whole subtree of 'owner' has been handled.
    var work = new Stack<(IToken kid, IndirectReference owner, List<PageTreeNode> siblings, bool leavesOwner)>();

    Expand(rootNode, referenceInput, nodeDictionaryInput);

    while (work.Count > 0)
    {
        var current = work.Pop();

        if (current.leavesOwner)
        {
            ancestors.Remove(current.owner);
            continue;
        }

        var kid = current.kid;

        if (!(kid is IndirectReferenceToken kidRef)) { throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}."); }

        if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken)) { throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}."); }

        if (CheckIfIsPage(kidDictionaryToken, current.owner, false, pdfTokenScanner, isLenientParsing))
        {
            pageNumber.Increment();
            current.siblings.Add(new PageTreeNode(kidDictionaryToken, kidRef.Data, true, pageNumber.PageCount).WithChildren(EmptyArray<PageTreeNode>.Instance));
            continue;
        }

        var kidNode = new PageTreeNode(kidDictionaryToken, kidRef.Data, false, null);
        current.siblings.Add(kidNode);
        Expand(kidNode, kidRef.Data, kidDictionaryToken);
    }

    foreach (var pending in pendingNodes)
    {
        pending.node.WithChildren(pending.children);
    }

    return rootNode;

    // Schedules the kids of a Pages node for processing and registers the list that will receive them.
    void Expand(PageTreeNode node, IndirectReference reference, DictionaryToken nodeDictionary)
    {
        // Without this guard a Kids cycle would keep the loop running (and allocating) forever.
        if (!ancestors.Add(reference)) { throw new PdfDocumentFormatException($"Pages tree contained a circular reference: pages node {reference} is its own ancestor."); }

        if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
        {
            if (!isLenientParsing) { throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {nodeDictionary}."); }

            kids = new ArrayToken(EmptyArray<IToken>.Instance);
        }

        var children = new List<PageTreeNode>();
        pendingNodes.Add((node, children));

        // Pushed first so it pops last, after every kid (and the kids' own subtrees) has been handled.
        work.Push(((IToken)null, reference, children, true));

        for (var i = kids.Data.Count - 1; i >= 0; i--)
        {
            work.Push((kids.Data[i], reference, children, false));
        }
    }

    // Decides whether a node is a page (as opposed to a Pages node) and, unless parsing is lenient,
    // validates its Type entry and (for non-root nodes) that its Parent entry equals parentReference.
    static bool CheckIfIsPage(DictionaryToken nodeDictionary, IndirectReference parentReference, bool isRoot, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing)
    {
        var isPage = false;

        if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type))
        {
            if (!isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}."); }

            // Lenient fallback: a node with no Type and no Kids array is treated as a page.
            if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _)) { isPage = true; }
        }
        else
        {
            isPage = type.Equals(NameToken.Page);

            if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}."); }
        }

        if (!isLenientParsing && !isRoot)
        {
            if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken)) { throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}."); }

            if (!parentReferenceToken.Data.Equals(parentReference)) { throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}."); }
        }

        return isPage;
    }
}
#endif
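// Recursive fallback: calls that omit depth bind to the iterative overload whenever NETSTANDARD2_0_OR_GREATER is defined, so this version only runs on other targets.
// The depth limit below keeps the recursion from throwing a StackOverflowException; this version should eventually be retired in favour of the iterative one.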
private const ushort MAX_TREE_DEPTH = 1024;
private static PageTreeNode ProcessPagesNode(IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference, bool isRoot, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing, PageCounter pageNumber, int depth = 0)
{
depth++;
if (depth > MAX_TREE_DEPTH) { throw new PdfDocumentFormatException($"Tree exceeded maximum depth of {MAX_TREE_DEPTH}, aborting."); }
var isPage = false; var isPage = false;
if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type)) if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type))
{ {
if (!isLenientParsing) if (!isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}."); }
{
throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}.");
}
if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _)) if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _)) { isPage = true; }
{
isPage = true;
}
} }
else else
{ {
isPage = type.Equals(NameToken.Page); isPage = type.Equals(NameToken.Page);
if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing) if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}."); }
{
throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}.");
}
} }
if (!isLenientParsing && !isRoot) if (!isLenientParsing && !isRoot)
{ {
if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken)) if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken)) { throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}."); }
{
throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}.");
}
if (!parentReferenceToken.Data.Equals(parentReference)) if (!parentReferenceToken.Data.Equals(parentReference)) { throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}."); }
{
throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}.");
}
} }
if (isPage) if (isPage)
{ {
pageNumber++; pageNumber.Increment();
var newPage = new PageTreeNode(nodeDictionary, reference, true, pageNumber.PageCount).WithChildren(EmptyArray<PageTreeNode>.Instance);
var thisNode = new PageTreeNode(nodeDictionary, reference, true, return newPage;
pageNumber,
EmptyArray<PageTreeNode>.Instance);
return thisNode;
} }
if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids)) if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
{ {
if (!isLenientParsing) if (!isLenientParsing) { throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {nodeDictionary}."); }
{
throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {nodeDictionary}.");
}
kids = new ArrayToken(EmptyArray<IToken>.Instance); kids = new ArrayToken(EmptyArray<IToken>.Instance);
} }
@@ -134,22 +215,16 @@
foreach (var kid in kids.Data) foreach (var kid in kids.Data)
{ {
if (!(kid is IndirectReferenceToken kidRef)) if (!(kid is IndirectReferenceToken kidRef)) { throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}."); }
{
throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}.");
}
if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken)) if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken)) { throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}."); }
{
throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}.");
}
var kidNode = ProcessPagesNode(kidRef.Data, kidDictionaryToken, reference, false, pdfTokenScanner, isLenientParsing, ref pageNumber, depth + 1); var kidNode = ProcessPagesNode(kidRef.Data, kidDictionaryToken, reference, false, pdfTokenScanner, isLenientParsing, pageNumber, depth);
nodeChildren.Add(kidNode); nodeChildren.Add(kidNode);
} }
return new PageTreeNode(nodeDictionary, reference, false, null, nodeChildren); return new PageTreeNode(nodeDictionary, reference, false, null).WithChildren(nodeChildren);
} }
} }
} }