diff --git a/src/UglyToad.PdfPig/Content/PageTreeNode.cs b/src/UglyToad.PdfPig/Content/PageTreeNode.cs index f9cc505d..2b48d303 100644 --- a/src/UglyToad.PdfPig/Content/PageTreeNode.cs +++ b/src/UglyToad.PdfPig/Content/PageTreeNode.cs @@ -38,7 +38,7 @@ /// The child nodes of this node if is /// [NotNull] - public IReadOnlyList Children { get; } + public IReadOnlyList Children { get; private set; } /// /// The parent node of this node, unless it is the root node. @@ -56,29 +56,33 @@ /// internal PageTreeNode(DictionaryToken nodeDictionary, IndirectReference reference, bool isPage, - int? pageNumber, - IReadOnlyList children) + int? pageNumber) { NodeDictionary = nodeDictionary ?? throw new ArgumentNullException(nameof(nodeDictionary)); Reference = reference; IsPage = isPage; PageNumber = pageNumber; + + if (!IsPage && PageNumber.HasValue) + { + throw new ArgumentException("Cannot define page number for a pages node.", nameof(pageNumber)); + } + } + + internal PageTreeNode WithChildren(IReadOnlyList children) + { Children = children ?? throw new ArgumentNullException(nameof(children)); if (IsPage && Children.Count > 0) { - throw new ArgumentException("Cannot define children on a page node.", nameof(children)); - } - - if (!IsPage && pageNumber.HasValue) - { - throw new ArgumentException("Cannot define page number for a pages node.", nameof(pageNumber)); + throw new ArgumentException("Cannot define children on a page node.", nameof(children)); } foreach (var child in Children) { child.Parent = this; } + return this; } /// diff --git a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs index 71e6f79a..9f272f88 100644 --- a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs +++ b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs @@ -11,9 +11,14 @@ internal static class CatalogFactory { - // Keep the algorithm below from throwing a StackOverflow exception. - // It probably should be refactored to not be recursive - private const ushort MAX_TREE_DEPTH = 1024; + private class PageCounter + { + public int PageCount { get; private set; } + public void Increment() + { + PageCount++; + } + } public static Catalog Create(IndirectReference rootReference, DictionaryToken dictionary, IPdfTokenScanner scanner, @@ -51,81 +56,157 @@ pages = DirectObjectFinder.Get(value, scanner); } - var pageNumber = 0; + var pageNumber = new PageCounter(); var pageTree = ProcessPagesNode(pagesReference, pages, new IndirectReference(1, 0), true, - scanner, isLenientParsing, ref pageNumber, 0); + scanner, isLenientParsing, pageNumber); return new Catalog(dictionary, pages, pageTree); } - private static PageTreeNode ProcessPagesNode(IndirectReference reference, DictionaryToken nodeDictionary, - IndirectReference parentReference, - bool isRoot, - IPdfTokenScanner pdfTokenScanner, - bool isLenientParsing, - ref int pageNumber, - ushort depth) +#if NETSTANDARD2_0_OR_GREATER + + private static PageTreeNode ProcessPagesNode(IndirectReference referenceInput, DictionaryToken nodeDictionaryInput, IndirectReference parentReferenceInput, bool isRoot, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing, PageCounter pageNumber) { - if(depth > MAX_TREE_DEPTH) + bool isPage = CheckIfIsPage(nodeDictionaryInput, parentReferenceInput, isRoot, pdfTokenScanner, isLenientParsing); + + if (isPage) { - throw new PdfDocumentFormatException($"Tree exceeded maximum depth of {MAX_TREE_DEPTH}, aborting."); + pageNumber.Increment(); + + return new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray.Instance); } + //If we got here, we have to iterate till we manage to exit + + var toProcess = new Queue<(PageTreeNode thisPage, IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference, List nodeChildren)>(); + var firstPage = new PageTreeNode(nodeDictionaryInput, referenceInput, false, null); + var setChildren = new List(); + var firstPageChildren = new List(); + + setChildren.Add(() => firstPage.WithChildren(firstPageChildren)); + + toProcess.Enqueue((thisPage: firstPage, reference: referenceInput, nodeDictionary: nodeDictionaryInput, parentReference: parentReferenceInput, nodeChildren: firstPageChildren)); + + do + { + var current = toProcess.Dequeue(); + + if (!current.nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids)) + { + if (!isLenientParsing) { throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {current.nodeDictionary}."); } + + kids = new ArrayToken(EmptyArray.Instance); + } + + foreach (var kid in kids.Data) + { + if (!(kid is IndirectReferenceToken kidRef)) { throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}."); } + + if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken)) { throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}."); } + + bool isChildPage = CheckIfIsPage(kidDictionaryToken, current.reference, false, pdfTokenScanner, isLenientParsing); + + if (isChildPage) + { + pageNumber.Increment(); + + var kidPageNode = new PageTreeNode(kidDictionaryToken, kidRef.Data, true, pageNumber.PageCount).WithChildren(EmptyArray.Instance); + current.nodeChildren.Add(kidPageNode); + } + else + { + var kidChildNode = new PageTreeNode(kidDictionaryToken, kidRef.Data, false, null); + var kidChildren = new List(); + toProcess.Enqueue((thisPage: kidChildNode, reference: kidRef.Data, nodeDictionary: kidDictionaryToken, parentReference: current.reference, nodeChildren: kidChildren)); + + setChildren.Add(() => kidChildNode.WithChildren(kidChildren)); + + current.nodeChildren.Add(kidChildNode); + } + } + } while (toProcess.Count > 0); + + foreach (var action in setChildren) + { + action(); + } + + return firstPage; + + + static bool CheckIfIsPage(DictionaryToken nodeDictionary, IndirectReference parentReference, bool isRoot, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing) + { + var isPage = false; + + if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type)) + { + if (!isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}."); } + + if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _)) { isPage = true; } + } + else + { + isPage = type.Equals(NameToken.Page); + + if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}."); } + } + + if (!isLenientParsing && !isRoot) + { + if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken)) { throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}."); } + + if (!parentReferenceToken.Data.Equals(parentReference)) { throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}."); } + } + + return isPage; + } + } + +#endif + + // Keep the algorithm below from throwing a StackOverflow exception. + // It probably should be refactored to not be recursive + private const ushort MAX_TREE_DEPTH = 1024; + + private static PageTreeNode ProcessPagesNode(IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference, bool isRoot, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing, PageCounter pageNumber, int depth = 0) + { + depth++; + + if (depth > MAX_TREE_DEPTH) { throw new PdfDocumentFormatException($"Tree exceeded maximum depth of {MAX_TREE_DEPTH}, aborting."); } + var isPage = false; if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type)) { - if (!isLenientParsing) - { - throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}."); - } + if (!isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}."); } - if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _)) - { - isPage = true; - } + if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _)) { isPage = true; } } else { isPage = type.Equals(NameToken.Page); - if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing) - { - throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}."); - } + if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}."); } } if (!isLenientParsing && !isRoot) { - if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken)) - { - throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}."); - } + if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken)) { throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}."); } - if (!parentReferenceToken.Data.Equals(parentReference)) - { - throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}."); - } + if (!parentReferenceToken.Data.Equals(parentReference)) { throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}."); } } if (isPage) { - pageNumber++; - - var thisNode = new PageTreeNode(nodeDictionary, reference, true, - pageNumber, - EmptyArray.Instance); - - return thisNode; + pageNumber.Increment(); + var newPage = new PageTreeNode(nodeDictionary, reference, true, pageNumber.PageCount).WithChildren(EmptyArray.Instance); + return newPage; } if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids)) { - if (!isLenientParsing) - { - throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {nodeDictionary}."); - } + if (!isLenientParsing) { throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {nodeDictionary}."); } kids = new ArrayToken(EmptyArray.Instance); } @@ -134,22 +215,16 @@ foreach (var kid in kids.Data) { - if (!(kid is IndirectReferenceToken kidRef)) - { - throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}."); - } + if (!(kid is IndirectReferenceToken kidRef)) { throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}."); } - if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken)) - { - throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}."); - } + if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken)) { throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}."); } - var kidNode = ProcessPagesNode(kidRef.Data, kidDictionaryToken, reference, false, pdfTokenScanner, isLenientParsing, ref pageNumber, depth + 1); + var kidNode = ProcessPagesNode(kidRef.Data, kidDictionaryToken, reference, false, pdfTokenScanner, isLenientParsing, pageNumber, depth); nodeChildren.Add(kidNode); } - return new PageTreeNode(nodeDictionary, reference, false, null, nodeChildren); + return new PageTreeNode(nodeDictionary, reference, false, null).WithChildren(nodeChildren); } } }