From 19ac38bf8b167ef431a73e537a1bee07a790b7d4 Mon Sep 17 00:00:00 2001 From: Wesley Moret Date: Mon, 4 Jan 2021 19:31:37 -0500 Subject: [PATCH] [Improved] PdfMerger: Allow to select the pages when merging documents (#248) * PdfMerger: Allow to select the pages when merging documents Supersede #245 * Impose artificial limit in the amount of page, that a node can contain Plus, Use IReadOnlyList instead of IReadOnlyCollection --- .../Writer/PdfMergerTests.cs | 39 +++ src/UglyToad.PdfPig.Tokens/DictionaryToken.cs | 9 + src/UglyToad.PdfPig/Writer/PdfMerger.cs | 321 +++++++++++------- 3 files changed, 247 insertions(+), 122 deletions(-) diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs index f3c7d528..ccbabccc 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs @@ -115,6 +115,45 @@ } } + [Fact] + public void CanMergeWithSelection() + { + var first = IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf"); + var result = PdfMerger.Merge(new [] { File.ReadAllBytes(first) }, new [] { new[] {2, 1, 4, 3, 6, 5} }); + + WriteFile(nameof(CanMergeWithSelection), result); + + using (var document = PdfDocument.Open(result, ParsingOptions.LenientParsingOff)) + { + Assert.Equal(6, document.NumberOfPages); + + foreach (var page in document.GetPages()) + { + Assert.NotNull(page.Text); + } + } + } + + [Fact] + public void CanMergeMultipleWithSelection() + { + var first = IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf"); + var second = IntegrationHelpers.GetDocumentPath("Old Gutnish Internet Explorer.pdf"); + var result = PdfMerger.Merge(new[] { File.ReadAllBytes(first), File.ReadAllBytes(second) }, new[] { new[] { 2, 1, 4, 3, 6, 5 }, new []{ 3, 2, 1 } }); + + WriteFile(nameof(CanMergeMultipleWithSelection), result); + + using (var document = PdfDocument.Open(result, ParsingOptions.LenientParsingOff)) + { + Assert.Equal(9, document.NumberOfPages); + + foreach (var page in document.GetPages()) + { + Assert.NotNull(page.Text); + } + } + } + private static void WriteFile(string name, byte[] bytes) { try diff --git a/src/UglyToad.PdfPig.Tokens/DictionaryToken.cs b/src/UglyToad.PdfPig.Tokens/DictionaryToken.cs index fe9f9428..f770dd14 100644 --- a/src/UglyToad.PdfPig.Tokens/DictionaryToken.cs +++ b/src/UglyToad.PdfPig.Tokens/DictionaryToken.cs @@ -114,6 +114,15 @@ return new DictionaryToken(result); } + /// + /// Create a new . + /// + /// The data this dictionary will contain. + public static DictionaryToken With(IReadOnlyDictionary data) + { + return new DictionaryToken(data ?? throw new ArgumentNullException(nameof(data))); + } + /// public bool Equals(IToken obj) diff --git a/src/UglyToad.PdfPig/Writer/PdfMerger.cs b/src/UglyToad.PdfPig/Writer/PdfMerger.cs index 1d9a25a9..e8e87bcf 100644 --- a/src/UglyToad.PdfPig/Writer/PdfMerger.cs +++ b/src/UglyToad.PdfPig/Writer/PdfMerger.cs @@ -16,6 +16,8 @@ using Tokenization.Scanner; using Tokens; using Exceptions; + using System.Linq; + using Util; /// /// Merges PDF documents into each other. @@ -29,7 +31,7 @@ /// /// Merge two PDF documents together with the pages from followed by . /// - public static byte[] Merge(string file1, string file2) + public static byte[] Merge(string file1, string file2, IReadOnlyList file1Selection = null, IReadOnlyList file2Selection = null) { if (file1 == null) { @@ -45,7 +47,7 @@ { File.ReadAllBytes(file1), File.ReadAllBytes(file2) - }); + }, new [] { file1Selection, file2Selection }); } /// @@ -67,13 +69,13 @@ bytes.Add(File.ReadAllBytes(filePath)); } - return Merge(bytes); + return Merge(bytes, null); } /// /// Merge the set of PDF documents. /// - public static byte[] Merge(IReadOnlyList files) + public static byte[] Merge(IReadOnlyList files, IReadOnlyList> pagesBundle = null) { if (files == null) { @@ -84,8 +86,16 @@ var documentBuilder = new DocumentMerger(); - foreach (var file in files) + foreach (var fileIndex in Enumerable.Range(0, files.Count)) { + var file = files[fileIndex]; + + IReadOnlyList pages = null; + if (pagesBundle != null && fileIndex < pagesBundle.Count) + { + pages = pagesBundle[fileIndex]; + } + var inputBytes = new ByteArrayInputBytes(file); var coreScanner = new CoreTokenScanner(inputBytes); @@ -112,7 +122,7 @@ var documentCatalog = CatalogFactory.Create(crossReference.Trailer.Root, catalogDictionaryToken, pdfScanner, isLenientParsing); - documentBuilder.AppendDocument(documentCatalog, version.Version, pdfScanner); + documentBuilder.AppendDocument(documentCatalog, version.Version, pdfScanner, pages); } return documentBuilder.Build(); @@ -149,31 +159,153 @@ private class DocumentMerger { private const decimal DefaultVersion = 1.2m; - + + private const int ARTIFICIAL_NODE_LIMIT = 100; + private readonly PdfStreamWriter context = new PdfStreamWriter(); private readonly List pagesTokenReferences = new List(); private readonly IndirectReferenceToken rootPagesReference; private decimal currentVersion = DefaultVersion; private int pageCount = 0; - - private readonly Dictionary referencesFromDocument = - new Dictionary(); public DocumentMerger() { rootPagesReference = context.ReserveNumberToken(); } - - public void AppendDocument(Catalog documentCatalog, decimal version, IPdfTokenScanner tokenScanner) + + public void AppendDocument(Catalog catalog, decimal version, IPdfTokenScanner tokenScanner, IReadOnlyList pages) { + IEnumerable pageIndices; + if (pages == null) + { + var pagesCount = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count); + if (pagesCount < 1) + { + return; + } + + pageIndices = Enumerable.Range(1, pagesCount); + } + else if (pages.Count < 1) + { + return; + } + else + { + pageIndices = pages; + } + currentVersion = Math.Max(version, currentVersion); - var (pagesReference, count) = CopyPagesTree(documentCatalog.PageTree, rootPagesReference, tokenScanner); - pageCount += count; - pagesTokenReferences.Add(pagesReference); + var referencesFromDocument = new Dictionary(); - referencesFromDocument.Clear(); + var currentNodeReference = context.ReserveNumberToken(); + var pagesReferences = new List(); + var resources = new Dictionary(); + + bool DoesAEntryCollide(PageTreeNode node) + { + while (node != null) + { + var dictionary = node.NodeDictionary; + if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary)) + { + var nonCollidingResources = resourcesDictionary.Data.Keys.Except(resources.Keys); + if (nonCollidingResources.Count() != resourcesDictionary.Data.Count) + { + // This means that at least one of the resources collided + return true; + } + } + + /* TODO: How to handle? + * `Rotate` + * `CropBox` + * `MediaBox` + */ + + // No colliding entry was found, in this node + // Keep walking up into the tree + node = node.Parent; + } + + return false; + } + + + void CopyEntries(PageTreeNode node) + { + while (node != null) + { + var dictionary = node.NodeDictionary; + if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary)) + { + foreach (var pair in resourcesDictionary.Data) + { + resources.Add(pair.Key, CopyToken(pair.Value, tokenScanner, referencesFromDocument)); + } + } + + /* TODO: How to handle? + * `Rotate` + * `CropBox` + * `MediaBox` + */ + + // Keep walking up into the tree + node = node.Parent; + } + } + + void CreateTree() + { + if (pagesReferences.Count < 1) + { + throw new InvalidOperationException("Pages reference should always be more than 1 when executing this function"); + } + + var newPagesNode = new Dictionary + { + { NameToken.Type, NameToken.Pages }, + { NameToken.Kids, new ArrayToken(pagesReferences) }, + { NameToken.Count, new NumericToken(pagesReferences.Count) }, + { NameToken.Parent, rootPagesReference } + }; + + if (resources.Count > 0) + { + newPagesNode.Add(NameToken.Resources, DictionaryToken.With(resources)); + } + + var pagesDictionary = new DictionaryToken(newPagesNode); + pagesTokenReferences.Add(context.WriteToken(pagesDictionary, (int)currentNodeReference.Data.ObjectNumber)); + + pageCount += pagesReferences.Count; + }; + + foreach (var pageIndex in pageIndices) + { + var pageNode = catalog.GetPageNode(pageIndex); + if (pagesReferences.Count >= ARTIFICIAL_NODE_LIMIT || DoesAEntryCollide(pageNode)) + { + CreateTree(); + + currentNodeReference = context.ReserveNumberToken(); + pagesReferences = new List(); + resources = new Dictionary(); + } + + CopyEntries(pageNode.Parent); + pagesReferences.Add(CopyPageNode(pageNode, currentNodeReference, tokenScanner, referencesFromDocument)); + } + + if (pagesReferences.Count < 1) + { + throw new InvalidOperationException("Pages reference couldn't be less than 1 because we have reserved a indirect reference token"); + } + + CreateTree(); } public byte[] Build() @@ -190,7 +322,7 @@ { NameToken.Count, new NumericToken(pageCount) } }); - var pagesRef = context.WriteToken( pagesDictionary, (int)rootPagesReference.Data.ObjectNumber); + var pagesRef = context.WriteToken(pagesDictionary, (int)rootPagesReference.Data.ObjectNumber); var catalog = new DictionaryToken(new Dictionary { @@ -199,9 +331,9 @@ }); var catalogRef = context.WriteToken(catalog); - + context.Flush(currentVersion, catalogRef); - + var bytes = context.ToArray(); Close(); @@ -214,56 +346,8 @@ context.Dispose(); } - private (IndirectReferenceToken, int) CopyPagesTree(PageTreeNode treeNode, IndirectReferenceToken treeParentReference, IPdfTokenScanner tokenScanner) - { - Debug.Assert(!treeNode.IsPage); - - var currentNodeReference = context.ReserveNumberToken(); - - var pageReferences = new List(); - var nodeCount = 0; - foreach (var pageNode in treeNode.Children) - { - IndirectReferenceToken newEntry; - if (!pageNode.IsPage) - { - var count = 0; - (newEntry, count) = CopyPagesTree(pageNode, currentNodeReference, tokenScanner); - nodeCount += count; - } - else - { - newEntry = CopyPageNode(pageNode, currentNodeReference, tokenScanner); - ++nodeCount; - } - - pageReferences.Add(newEntry); - } - - var newPagesNode = new Dictionary - { - { NameToken.Type, NameToken.Pages }, - { NameToken.Kids, new ArrayToken(pageReferences) }, - { NameToken.Count, new NumericToken(nodeCount) }, - { NameToken.Parent, treeParentReference } - }; - - foreach (var pair in treeNode.NodeDictionary.Data) - { - if (IgnoreKeyForPagesNode(pair)) - { - continue; - } - - newPagesNode[NameToken.Create(pair.Key)] = CopyToken(pair.Value, tokenScanner); - } - - var pagesDictionary = new DictionaryToken(newPagesNode); - - return (context.WriteToken(pagesDictionary, (int)currentNodeReference.Data.ObjectNumber), nodeCount); - } - - private IndirectReferenceToken CopyPageNode(PageTreeNode pageNode, IndirectReferenceToken parentPagesObject, IPdfTokenScanner tokenScanner) + private IndirectReferenceToken CopyPageNode(PageTreeNode pageNode, IndirectReferenceToken parentPagesObject, IPdfTokenScanner tokenScanner, + IDictionary referencesFromDocument) { Debug.Assert(pageNode.IsPage); @@ -283,91 +367,84 @@ continue; } - pageDictionary.Add(NameToken.Create(name), CopyToken(token, tokenScanner)); + pageDictionary.Add(NameToken.Create(name), CopyToken(token, tokenScanner, referencesFromDocument)); } return context.WriteToken(new DictionaryToken(pageDictionary)); } - + /// /// The purpose of this method is to resolve indirect reference. That mean copy the reference's content to the new document's stream /// and replace the indirect reference with the correct/new one /// /// Token to inspect for reference /// scanner get the content from the original document + /// Map of previously copied /// A reference of the token that was copied. With all the reference updated - private IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner) + private IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner, IDictionary referencesFromDocument) { // This token need to be deep copied, because they could contain reference. So we have to update them. switch (tokenToCopy) { case DictionaryToken dictionaryToken: - { - var newContent = new Dictionary(); - foreach (var setPair in dictionaryToken.Data) { - var name = setPair.Key; - var token = setPair.Value; - newContent.Add(NameToken.Create(name), CopyToken(token, tokenScanner)); - } + var newContent = new Dictionary(); + foreach (var setPair in dictionaryToken.Data) + { + var name = setPair.Key; + var token = setPair.Value; + newContent.Add(NameToken.Create(name), CopyToken(token, tokenScanner, referencesFromDocument)); + } - return new DictionaryToken(newContent); - } + return new DictionaryToken(newContent); + } case ArrayToken arrayToken: - { - var newArray = new List(arrayToken.Length); - foreach (var token in arrayToken.Data) { - newArray.Add(CopyToken(token, tokenScanner)); - } + var newArray = new List(arrayToken.Length); + foreach (var token in arrayToken.Data) + { + newArray.Add(CopyToken(token, tokenScanner, referencesFromDocument)); + } - return new ArrayToken(newArray); - } + return new ArrayToken(newArray); + } case IndirectReferenceToken referenceToken: - { - if (referencesFromDocument.TryGetValue(referenceToken, out var newReferenceToken)) { + if (referencesFromDocument.TryGetValue(referenceToken, out var newReferenceToken)) + { + return newReferenceToken; + } + + var tokenObject = DirectObjectFinder.Get(referenceToken.Data, tokenScanner); + + Debug.Assert(!(tokenObject is IndirectReferenceToken)); + + var newToken = CopyToken(tokenObject, tokenScanner, referencesFromDocument); + newReferenceToken = context.WriteToken(newToken); + + referencesFromDocument.Add(referenceToken, newReferenceToken); + return newReferenceToken; } - - var tokenObject = DirectObjectFinder.Get(referenceToken.Data, tokenScanner); - - Debug.Assert(!(tokenObject is IndirectReferenceToken)); - - var newToken = CopyToken(tokenObject, tokenScanner); - newReferenceToken = context.WriteToken(newToken); - - referencesFromDocument.Add(referenceToken, newReferenceToken); - - return newReferenceToken; - } case StreamToken streamToken: - { - var properties = CopyToken(streamToken.StreamDictionary, tokenScanner) as DictionaryToken; - Debug.Assert(properties != null); + { + var properties = CopyToken(streamToken.StreamDictionary, tokenScanner, referencesFromDocument) as DictionaryToken; + Debug.Assert(properties != null); + + var bytes = streamToken.Data; + return new StreamToken(properties, bytes); + } - var bytes = streamToken.Data; - return new StreamToken(properties, bytes); - } - case ObjectToken _: - { - // Since we don't write token directly to the stream. - // We can't know the offset. Therefore the token would be invalid - throw new NotSupportedException("Copying a Object token is not supported"); - } + { + // Since we don't write token directly to the stream. + // We can't know the offset. Therefore the token would be invalid + throw new NotSupportedException("Copying a Object token is not supported"); + } } return tokenToCopy; } - - private static bool IgnoreKeyForPagesNode(KeyValuePair token) - { - return string.Equals(token.Key, NameToken.Type.Data, StringComparison.OrdinalIgnoreCase) - || string.Equals(token.Key, NameToken.Kids.Data, StringComparison.OrdinalIgnoreCase) - || string.Equals(token.Key, NameToken.Count.Data, StringComparison.OrdinalIgnoreCase) - || string.Equals(token.Key, NameToken.Parent.Data, StringComparison.OrdinalIgnoreCase); - } } } } \ No newline at end of file