diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs
index f3c7d528..ccbabccc 100644
--- a/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs
@@ -115,6 +115,45 @@
}
}
+ [Fact]
+ public void CanMergeWithSelection()
+ {
+ var first = IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf");
+ var result = PdfMerger.Merge(new [] { File.ReadAllBytes(first) }, new [] { new[] {2, 1, 4, 3, 6, 5} });
+
+ WriteFile(nameof(CanMergeWithSelection), result);
+
+ using (var document = PdfDocument.Open(result, ParsingOptions.LenientParsingOff))
+ {
+ Assert.Equal(6, document.NumberOfPages);
+
+ foreach (var page in document.GetPages())
+ {
+ Assert.NotNull(page.Text);
+ }
+ }
+ }
+
+ [Fact]
+ public void CanMergeMultipleWithSelection()
+ {
+ var first = IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf");
+ var second = IntegrationHelpers.GetDocumentPath("Old Gutnish Internet Explorer.pdf");
+ var result = PdfMerger.Merge(new[] { File.ReadAllBytes(first), File.ReadAllBytes(second) }, new[] { new[] { 2, 1, 4, 3, 6, 5 }, new []{ 3, 2, 1 } });
+
+ WriteFile(nameof(CanMergeMultipleWithSelection), result);
+
+ using (var document = PdfDocument.Open(result, ParsingOptions.LenientParsingOff))
+ {
+ Assert.Equal(9, document.NumberOfPages);
+
+ foreach (var page in document.GetPages())
+ {
+ Assert.NotNull(page.Text);
+ }
+ }
+ }
+
private static void WriteFile(string name, byte[] bytes)
{
try
diff --git a/src/UglyToad.PdfPig.Tokens/DictionaryToken.cs b/src/UglyToad.PdfPig.Tokens/DictionaryToken.cs
index fe9f9428..f770dd14 100644
--- a/src/UglyToad.PdfPig.Tokens/DictionaryToken.cs
+++ b/src/UglyToad.PdfPig.Tokens/DictionaryToken.cs
@@ -114,6 +114,15 @@
return new DictionaryToken(result);
}
+ /// <summary>
+ /// Create a new <see cref="DictionaryToken"/>.
+ /// </summary>
+ /// <param name="data">The data this dictionary will contain.</param>
+ public static DictionaryToken With(IReadOnlyDictionary data)
+ {
+ return new DictionaryToken(data ?? throw new ArgumentNullException(nameof(data)));
+ }
+
///
public bool Equals(IToken obj)
diff --git a/src/UglyToad.PdfPig/Writer/PdfMerger.cs b/src/UglyToad.PdfPig/Writer/PdfMerger.cs
index 1d9a25a9..e8e87bcf 100644
--- a/src/UglyToad.PdfPig/Writer/PdfMerger.cs
+++ b/src/UglyToad.PdfPig/Writer/PdfMerger.cs
@@ -16,6 +16,8 @@
using Tokenization.Scanner;
using Tokens;
using Exceptions;
+ using System.Linq;
+ using Util;
///
/// Merges PDF documents into each other.
@@ -29,7 +31,7 @@
 /// <summary>
 /// Merge two PDF documents together with the pages from <paramref name="file1"/> followed by <paramref name="file2"/>.
 /// </summary>
- public static byte[] Merge(string file1, string file2)
+ public static byte[] Merge(string file1, string file2, IReadOnlyList file1Selection = null, IReadOnlyList file2Selection = null)
{
if (file1 == null)
{
@@ -45,7 +47,7 @@
{
File.ReadAllBytes(file1),
File.ReadAllBytes(file2)
- });
+ }, new [] { file1Selection, file2Selection });
}
///
@@ -67,13 +69,13 @@
bytes.Add(File.ReadAllBytes(filePath));
}
- return Merge(bytes);
+ return Merge(bytes, null);
}
///
/// Merge the set of PDF documents.
///
- public static byte[] Merge(IReadOnlyList files)
+ public static byte[] Merge(IReadOnlyList files, IReadOnlyList> pagesBundle = null)
{
if (files == null)
{
@@ -84,8 +86,16 @@
var documentBuilder = new DocumentMerger();
- foreach (var file in files)
+ foreach (var fileIndex in Enumerable.Range(0, files.Count))
{
+ var file = files[fileIndex];
+
+ IReadOnlyList pages = null;
+ if (pagesBundle != null && fileIndex < pagesBundle.Count)
+ {
+ pages = pagesBundle[fileIndex];
+ }
+
var inputBytes = new ByteArrayInputBytes(file);
var coreScanner = new CoreTokenScanner(inputBytes);
@@ -112,7 +122,7 @@
var documentCatalog = CatalogFactory.Create(crossReference.Trailer.Root, catalogDictionaryToken, pdfScanner, isLenientParsing);
- documentBuilder.AppendDocument(documentCatalog, version.Version, pdfScanner);
+ documentBuilder.AppendDocument(documentCatalog, version.Version, pdfScanner, pages);
}
return documentBuilder.Build();
@@ -149,31 +159,153 @@
private class DocumentMerger
{
private const decimal DefaultVersion = 1.2m;
-
+
+ private const int ARTIFICIAL_NODE_LIMIT = 100;
+
private readonly PdfStreamWriter context = new PdfStreamWriter();
private readonly List pagesTokenReferences = new List();
private readonly IndirectReferenceToken rootPagesReference;
private decimal currentVersion = DefaultVersion;
private int pageCount = 0;
-
- private readonly Dictionary referencesFromDocument =
- new Dictionary();
public DocumentMerger()
{
rootPagesReference = context.ReserveNumberToken();
}
-
- public void AppendDocument(Catalog documentCatalog, decimal version, IPdfTokenScanner tokenScanner)
+
+ public void AppendDocument(Catalog catalog, decimal version, IPdfTokenScanner tokenScanner, IReadOnlyList pages)
{
+ IEnumerable pageIndices;
+ if (pages == null)
+ {
+ var pagesCount = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
+ if (pagesCount < 1)
+ {
+ return;
+ }
+
+ pageIndices = Enumerable.Range(1, pagesCount);
+ }
+ else if (pages.Count < 1)
+ {
+ return;
+ }
+ else
+ {
+ pageIndices = pages;
+ }
+
currentVersion = Math.Max(version, currentVersion);
- var (pagesReference, count) = CopyPagesTree(documentCatalog.PageTree, rootPagesReference, tokenScanner);
- pageCount += count;
- pagesTokenReferences.Add(pagesReference);
+ var referencesFromDocument = new Dictionary();
- referencesFromDocument.Clear();
+ var currentNodeReference = context.ReserveNumberToken();
+ var pagesReferences = new List();
+ var resources = new Dictionary();
+
+ bool DoesAEntryCollide(PageTreeNode node)
+ {
+ while (node != null)
+ {
+ var dictionary = node.NodeDictionary;
+ if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary))
+ {
+ var nonCollidingResources = resourcesDictionary.Data.Keys.Except(resources.Keys);
+ if (nonCollidingResources.Count() != resourcesDictionary.Data.Count)
+ {
+ // This means that at least one of the resources collided
+ return true;
+ }
+ }
+
+ /* TODO: How to handle?
+ * `Rotate`
+ * `CropBox`
+ * `MediaBox`
+ */
+
+ // No colliding entry was found in this node;
+ // keep walking up the tree.
+ node = node.Parent;
+ }
+
+ return false;
+ }
+
+
+ void CopyEntries(PageTreeNode node)
+ {
+ while (node != null)
+ {
+ var dictionary = node.NodeDictionary;
+ if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary))
+ {
+ foreach (var pair in resourcesDictionary.Data)
+ {
+ resources.Add(pair.Key, CopyToken(pair.Value, tokenScanner, referencesFromDocument));
+ }
+ }
+
+ /* TODO: How to handle?
+ * `Rotate`
+ * `CropBox`
+ * `MediaBox`
+ */
+
+ // Keep walking up the tree.
+ node = node.Parent;
+ }
+ }
+
+ void CreateTree()
+ {
+ if (pagesReferences.Count < 1)
+ {
+ throw new InvalidOperationException("Pages reference should always be more than 1 when executing this function");
+ }
+
+ var newPagesNode = new Dictionary
+ {
+ { NameToken.Type, NameToken.Pages },
+ { NameToken.Kids, new ArrayToken(pagesReferences) },
+ { NameToken.Count, new NumericToken(pagesReferences.Count) },
+ { NameToken.Parent, rootPagesReference }
+ };
+
+ if (resources.Count > 0)
+ {
+ newPagesNode.Add(NameToken.Resources, DictionaryToken.With(resources));
+ }
+
+ var pagesDictionary = new DictionaryToken(newPagesNode);
+ pagesTokenReferences.Add(context.WriteToken(pagesDictionary, (int)currentNodeReference.Data.ObjectNumber));
+
+ pageCount += pagesReferences.Count;
+ };
+
+ foreach (var pageIndex in pageIndices)
+ {
+ var pageNode = catalog.GetPageNode(pageIndex);
+ if (pagesReferences.Count >= ARTIFICIAL_NODE_LIMIT || DoesAEntryCollide(pageNode))
+ {
+ CreateTree();
+
+ currentNodeReference = context.ReserveNumberToken();
+ pagesReferences = new List();
+ resources = new Dictionary();
+ }
+
+ CopyEntries(pageNode.Parent);
+ pagesReferences.Add(CopyPageNode(pageNode, currentNodeReference, tokenScanner, referencesFromDocument));
+ }
+
+ if (pagesReferences.Count < 1)
+ {
+ throw new InvalidOperationException("Pages reference couldn't be less than 1 because we have reserved a indirect reference token");
+ }
+
+ CreateTree();
}
public byte[] Build()
@@ -190,7 +322,7 @@
{ NameToken.Count, new NumericToken(pageCount) }
});
- var pagesRef = context.WriteToken( pagesDictionary, (int)rootPagesReference.Data.ObjectNumber);
+ var pagesRef = context.WriteToken(pagesDictionary, (int)rootPagesReference.Data.ObjectNumber);
var catalog = new DictionaryToken(new Dictionary
{
@@ -199,9 +331,9 @@
});
var catalogRef = context.WriteToken(catalog);
-
+
context.Flush(currentVersion, catalogRef);
-
+
var bytes = context.ToArray();
Close();
@@ -214,56 +346,8 @@
context.Dispose();
}
- private (IndirectReferenceToken, int) CopyPagesTree(PageTreeNode treeNode, IndirectReferenceToken treeParentReference, IPdfTokenScanner tokenScanner)
- {
- Debug.Assert(!treeNode.IsPage);
-
- var currentNodeReference = context.ReserveNumberToken();
-
- var pageReferences = new List();
- var nodeCount = 0;
- foreach (var pageNode in treeNode.Children)
- {
- IndirectReferenceToken newEntry;
- if (!pageNode.IsPage)
- {
- var count = 0;
- (newEntry, count) = CopyPagesTree(pageNode, currentNodeReference, tokenScanner);
- nodeCount += count;
- }
- else
- {
- newEntry = CopyPageNode(pageNode, currentNodeReference, tokenScanner);
- ++nodeCount;
- }
-
- pageReferences.Add(newEntry);
- }
-
- var newPagesNode = new Dictionary
- {
- { NameToken.Type, NameToken.Pages },
- { NameToken.Kids, new ArrayToken(pageReferences) },
- { NameToken.Count, new NumericToken(nodeCount) },
- { NameToken.Parent, treeParentReference }
- };
-
- foreach (var pair in treeNode.NodeDictionary.Data)
- {
- if (IgnoreKeyForPagesNode(pair))
- {
- continue;
- }
-
- newPagesNode[NameToken.Create(pair.Key)] = CopyToken(pair.Value, tokenScanner);
- }
-
- var pagesDictionary = new DictionaryToken(newPagesNode);
-
- return (context.WriteToken(pagesDictionary, (int)currentNodeReference.Data.ObjectNumber), nodeCount);
- }
-
- private IndirectReferenceToken CopyPageNode(PageTreeNode pageNode, IndirectReferenceToken parentPagesObject, IPdfTokenScanner tokenScanner)
+ private IndirectReferenceToken CopyPageNode(PageTreeNode pageNode, IndirectReferenceToken parentPagesObject, IPdfTokenScanner tokenScanner,
+ IDictionary referencesFromDocument)
{
Debug.Assert(pageNode.IsPage);
@@ -283,91 +367,84 @@
continue;
}
- pageDictionary.Add(NameToken.Create(name), CopyToken(token, tokenScanner));
+ pageDictionary.Add(NameToken.Create(name), CopyToken(token, tokenScanner, referencesFromDocument));
}
return context.WriteToken(new DictionaryToken(pageDictionary));
}
-
+
///
/// The purpose of this method is to resolve indirect reference. That mean copy the reference's content to the new document's stream
/// and replace the indirect reference with the correct/new one
///
/// Token to inspect for reference
/// scanner get the content from the original document
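+ /// <param name="referencesFromDocument">Map from each indirect reference in the source document to the reference already written for it in the merged document.</param>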
/// A reference of the token that was copied. With all the reference updated
- private IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner)
+ private IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner, IDictionary referencesFromDocument)
{
// This token need to be deep copied, because they could contain reference. So we have to update them.
switch (tokenToCopy)
{
case DictionaryToken dictionaryToken:
- {
- var newContent = new Dictionary();
- foreach (var setPair in dictionaryToken.Data)
{
- var name = setPair.Key;
- var token = setPair.Value;
- newContent.Add(NameToken.Create(name), CopyToken(token, tokenScanner));
- }
+ var newContent = new Dictionary();
+ foreach (var setPair in dictionaryToken.Data)
+ {
+ var name = setPair.Key;
+ var token = setPair.Value;
+ newContent.Add(NameToken.Create(name), CopyToken(token, tokenScanner, referencesFromDocument));
+ }
- return new DictionaryToken(newContent);
- }
+ return new DictionaryToken(newContent);
+ }
case ArrayToken arrayToken:
- {
- var newArray = new List(arrayToken.Length);
- foreach (var token in arrayToken.Data)
{
- newArray.Add(CopyToken(token, tokenScanner));
- }
+ var newArray = new List(arrayToken.Length);
+ foreach (var token in arrayToken.Data)
+ {
+ newArray.Add(CopyToken(token, tokenScanner, referencesFromDocument));
+ }
- return new ArrayToken(newArray);
- }
+ return new ArrayToken(newArray);
+ }
case IndirectReferenceToken referenceToken:
- {
- if (referencesFromDocument.TryGetValue(referenceToken, out var newReferenceToken))
{
+ if (referencesFromDocument.TryGetValue(referenceToken, out var newReferenceToken))
+ {
+ return newReferenceToken;
+ }
+
+ var tokenObject = DirectObjectFinder.Get(referenceToken.Data, tokenScanner);
+
+ Debug.Assert(!(tokenObject is IndirectReferenceToken));
+
+ var newToken = CopyToken(tokenObject, tokenScanner, referencesFromDocument);
+ newReferenceToken = context.WriteToken(newToken);
+
+ referencesFromDocument.Add(referenceToken, newReferenceToken);
+
return newReferenceToken;
}
-
- var tokenObject = DirectObjectFinder.Get(referenceToken.Data, tokenScanner);
-
- Debug.Assert(!(tokenObject is IndirectReferenceToken));
-
- var newToken = CopyToken(tokenObject, tokenScanner);
- newReferenceToken = context.WriteToken(newToken);
-
- referencesFromDocument.Add(referenceToken, newReferenceToken);
-
- return newReferenceToken;
- }
case StreamToken streamToken:
- {
- var properties = CopyToken(streamToken.StreamDictionary, tokenScanner) as DictionaryToken;
- Debug.Assert(properties != null);
+ {
+ var properties = CopyToken(streamToken.StreamDictionary, tokenScanner, referencesFromDocument) as DictionaryToken;
+ Debug.Assert(properties != null);
+
+ var bytes = streamToken.Data;
+ return new StreamToken(properties, bytes);
+ }
- var bytes = streamToken.Data;
- return new StreamToken(properties, bytes);
- }
-
case ObjectToken _:
- {
- // Since we don't write token directly to the stream.
- // We can't know the offset. Therefore the token would be invalid
- throw new NotSupportedException("Copying a Object token is not supported");
- }
+ {
+ // We do not write tokens directly to the stream here, so the offset
+ // cannot be known and a copied object token would be invalid.
+ throw new NotSupportedException("Copying a Object token is not supported");
+ }
}
return tokenToCopy;
}
-
- private static bool IgnoreKeyForPagesNode(KeyValuePair token)
- {
- return string.Equals(token.Key, NameToken.Type.Data, StringComparison.OrdinalIgnoreCase)
- || string.Equals(token.Key, NameToken.Kids.Data, StringComparison.OrdinalIgnoreCase)
- || string.Equals(token.Key, NameToken.Count.Data, StringComparison.OrdinalIgnoreCase)
- || string.Equals(token.Key, NameToken.Parent.Data, StringComparison.OrdinalIgnoreCase);
- }
}
}
}
\ No newline at end of file