[Improved] PdfMerger: Allow to select the pages when merging documents (#248)

* PdfMerger: Allow to select the pages when merging documents

Supersede #245

* Impose an artificial limit on the number of pages that a node can contain

Also, use IReadOnlyList instead of IReadOnlyCollection
This commit is contained in:
Wesley Moret
2021-01-04 19:31:37 -05:00
committed by GitHub
parent eabe6b4662
commit 19ac38bf8b
3 changed files with 247 additions and 122 deletions

View File

@@ -115,6 +115,45 @@
}
}
// Merges a single document while passing an explicit, re-ordered page selection
// and checks that the output has one page per selected entry, each with readable text.
[Fact]
public void CanMergeWithSelection()
{
    var sourcePath = IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf");

    byte[][] documents = { File.ReadAllBytes(sourcePath) };
    int[][] selections = { new[] { 2, 1, 4, 3, 6, 5 } };

    var merged = PdfMerger.Merge(documents, selections);

    // Persist the output so it can be inspected manually.
    WriteFile(nameof(CanMergeWithSelection), merged);

    using (var mergedDocument = PdfDocument.Open(merged, ParsingOptions.LenientParsingOff))
    {
        Assert.Equal(6, mergedDocument.NumberOfPages);

        foreach (var mergedPage in mergedDocument.GetPages())
        {
            Assert.NotNull(mergedPage.Text);
        }
    }
}
// Merges two documents, each with its own re-ordered page selection, and checks
// that the output holds the sum of the selected pages (6 + 3), each with readable text.
[Fact]
public void CanMergeMultipleWithSelection()
{
    var firstPath = IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf");
    var secondPath = IntegrationHelpers.GetDocumentPath("Old Gutnish Internet Explorer.pdf");

    byte[][] documents = { File.ReadAllBytes(firstPath), File.ReadAllBytes(secondPath) };
    int[][] selections =
    {
        new[] { 2, 1, 4, 3, 6, 5 },
        new[] { 3, 2, 1 }
    };

    var merged = PdfMerger.Merge(documents, selections);

    // Persist the output so it can be inspected manually.
    WriteFile(nameof(CanMergeMultipleWithSelection), merged);

    using (var mergedDocument = PdfDocument.Open(merged, ParsingOptions.LenientParsingOff))
    {
        Assert.Equal(9, mergedDocument.NumberOfPages);

        foreach (var mergedPage in mergedDocument.GetPages())
        {
            Assert.NotNull(mergedPage.Text);
        }
    }
}
private static void WriteFile(string name, byte[] bytes)
{
try

View File

@@ -114,6 +114,15 @@
return new DictionaryToken(result);
}
/// <summary>
/// Build a <see cref="DictionaryToken"/> wrapping the supplied entries.
/// </summary>
/// <param name="data">The entries the new dictionary token will hold. Must not be <c>null</c>.</param>
/// <returns>A new <see cref="DictionaryToken"/> constructed from <paramref name="data"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is <c>null</c>.</exception>
public static DictionaryToken With(IReadOnlyDictionary<string, IToken> data)
{
    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    return new DictionaryToken(data);
}
/// <inheritdoc />
public bool Equals(IToken obj)

View File

@@ -16,6 +16,8 @@
using Tokenization.Scanner;
using Tokens;
using Exceptions;
using System.Linq;
using Util;
/// <summary>
/// Merges PDF documents into each other.
@@ -29,7 +31,7 @@
/// <summary>
/// Merge two PDF documents together with the pages from <paramref name="file1"/> followed by <paramref name="file2"/>.
/// </summary>
public static byte[] Merge(string file1, string file2)
public static byte[] Merge(string file1, string file2, IReadOnlyList<int> file1Selection = null, IReadOnlyList<int> file2Selection = null)
{
if (file1 == null)
{
@@ -45,7 +47,7 @@
{
File.ReadAllBytes(file1),
File.ReadAllBytes(file2)
});
}, new [] { file1Selection, file2Selection });
}
/// <summary>
@@ -67,13 +69,13 @@
bytes.Add(File.ReadAllBytes(filePath));
}
return Merge(bytes);
return Merge(bytes, null);
}
/// <summary>
/// Merge the set of PDF documents.
/// </summary>
public static byte[] Merge(IReadOnlyList<byte[]> files)
public static byte[] Merge(IReadOnlyList<byte[]> files, IReadOnlyList<IReadOnlyList<int>> pagesBundle = null)
{
if (files == null)
{
@@ -84,8 +86,16 @@
var documentBuilder = new DocumentMerger();
foreach (var file in files)
foreach (var fileIndex in Enumerable.Range(0, files.Count))
{
var file = files[fileIndex];
IReadOnlyList<int> pages = null;
if (pagesBundle != null && fileIndex < pagesBundle.Count)
{
pages = pagesBundle[fileIndex];
}
var inputBytes = new ByteArrayInputBytes(file);
var coreScanner = new CoreTokenScanner(inputBytes);
@@ -112,7 +122,7 @@
var documentCatalog = CatalogFactory.Create(crossReference.Trailer.Root, catalogDictionaryToken, pdfScanner, isLenientParsing);
documentBuilder.AppendDocument(documentCatalog, version.Version, pdfScanner);
documentBuilder.AppendDocument(documentCatalog, version.Version, pdfScanner, pages);
}
return documentBuilder.Build();
@@ -149,31 +159,153 @@
private class DocumentMerger
{
private const decimal DefaultVersion = 1.2m;
private const int ARTIFICIAL_NODE_LIMIT = 100;
private readonly PdfStreamWriter context = new PdfStreamWriter();
private readonly List<IndirectReferenceToken> pagesTokenReferences = new List<IndirectReferenceToken>();
private readonly IndirectReferenceToken rootPagesReference;
private decimal currentVersion = DefaultVersion;
private int pageCount = 0;
private readonly Dictionary<IndirectReferenceToken, IndirectReferenceToken> referencesFromDocument =
new Dictionary<IndirectReferenceToken, IndirectReferenceToken>();
public DocumentMerger()
{
rootPagesReference = context.ReserveNumberToken();
}
/// <summary>
/// Copy the requested pages of one source document into the merged output.
/// The pages are grouped under freshly created intermediate page-tree nodes whose
/// parent is the merged document's root pages node.
/// </summary>
/// <param name="catalog">Catalog of the source document; used to look up each page node.</param>
/// <param name="version">PDF version of the source document. The output keeps the highest version seen.</param>
/// <param name="tokenScanner">Scanner used to resolve objects of the source document.</param>
/// <param name="pages">
/// Page numbers to copy, in output order. <c>null</c> copies every page (1 up to the
/// count stored in the pages dictionary); an empty list copies nothing.
/// </param>
public void AppendDocument(Catalog catalog, decimal version, IPdfTokenScanner tokenScanner, IReadOnlyList<int> pages)
{
    IEnumerable<int> pageIndices;
    if (pages == null)
    {
        var pagesCount = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
        if (pagesCount < 1)
        {
            return;
        }

        pageIndices = Enumerable.Range(1, pagesCount);
    }
    else if (pages.Count < 1)
    {
        return;
    }
    else
    {
        pageIndices = pages;
    }

    currentVersion = Math.Max(version, currentVersion);

    // Maps references of the source document to the references already written to the
    // output, so that an object shared between pages is only copied once per document.
    var referencesFromDocument = new Dictionary<IndirectReferenceToken, IndirectReferenceToken>();

    // The node's object number is reserved up front: the copied pages need it for their Parent entry
    // before the node itself can be written (its Kids are only known afterwards).
    var currentNodeReference = context.ReserveNumberToken();
    var pagesReferences = new List<IndirectReferenceToken>();
    var resources = new Dictionary<string, IToken>();

    // True when the node, or any of its ancestors, declares a resource key that is
    // already stored in the resources collected for the node currently being built.
    bool DoesAEntryCollide(PageTreeNode node)
    {
        while (node != null)
        {
            var dictionary = node.NodeDictionary;
            if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary)
                && resourcesDictionary.Data.Keys.Any(resources.ContainsKey))
            {
                // This means that at least one of the resources collided
                return true;
            }

            /* TODO: How to handle?
             *  `Rotate`
             *  `CropBox`
             *  `MediaBox`
             */

            // No colliding entry was found, in this node
            // Keep walking up into the tree
            node = node.Parent;
        }

        return false;
    }

    // Collects the resources declared by the node and its ancestors into the node being built.
    void CopyEntries(PageTreeNode node)
    {
        while (node != null)
        {
            var dictionary = node.NodeDictionary;
            if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary))
            {
                foreach (var pair in resourcesDictionary.Data)
                {
                    // The walk goes from the nearest ancestor upwards, so a key that is already
                    // present came from a closer node and takes precedence. Adding it again
                    // would make Dictionary.Add throw on the duplicate key.
                    if (!resources.ContainsKey(pair.Key))
                    {
                        resources.Add(pair.Key, CopyToken(pair.Value, tokenScanner, referencesFromDocument));
                    }
                }
            }

            /* TODO: How to handle?
             *  `Rotate`
             *  `CropBox`
             *  `MediaBox`
             */

            // Keep walking up into the tree
            node = node.Parent;
        }
    }

    // Writes the node being built (under the reserved object number) and registers it
    // as a child of the root pages node.
    void CreateTree()
    {
        if (pagesReferences.Count < 1)
        {
            throw new InvalidOperationException("Pages reference should always contain at least 1 page when executing this function");
        }

        var newPagesNode = new Dictionary<NameToken, IToken>
        {
            { NameToken.Type, NameToken.Pages },
            { NameToken.Kids, new ArrayToken(pagesReferences) },
            { NameToken.Count, new NumericToken(pagesReferences.Count) },
            { NameToken.Parent, rootPagesReference }
        };

        if (resources.Count > 0)
        {
            newPagesNode.Add(NameToken.Resources, DictionaryToken.With(resources));
        }

        var pagesDictionary = new DictionaryToken(newPagesNode);
        pagesTokenReferences.Add(context.WriteToken(pagesDictionary, (int)currentNodeReference.Data.ObjectNumber));

        pageCount += pagesReferences.Count;
    }

    foreach (var pageIndex in pageIndices)
    {
        var pageNode = catalog.GetPageNode(pageIndex);

        // Start a new intermediate node when the current one is full, or when this page
        // could not share the resources already collected for the current node.
        if (pagesReferences.Count >= ARTIFICIAL_NODE_LIMIT || DoesAEntryCollide(pageNode))
        {
            CreateTree();

            currentNodeReference = context.ReserveNumberToken();
            pagesReferences = new List<IndirectReferenceToken>();
            resources = new Dictionary<string, IToken>();
        }

        CopyEntries(pageNode.Parent);
        pagesReferences.Add(CopyPageNode(pageNode, currentNodeReference, tokenScanner, referencesFromDocument));
    }

    // Flush the last (possibly partially filled) node. CreateTree guards against an empty node.
    CreateTree();
}
public byte[] Build()
@@ -190,7 +322,7 @@
{ NameToken.Count, new NumericToken(pageCount) }
});
var pagesRef = context.WriteToken( pagesDictionary, (int)rootPagesReference.Data.ObjectNumber);
var pagesRef = context.WriteToken(pagesDictionary, (int)rootPagesReference.Data.ObjectNumber);
var catalog = new DictionaryToken(new Dictionary<NameToken, IToken>
{
@@ -199,9 +331,9 @@
});
var catalogRef = context.WriteToken(catalog);
context.Flush(currentVersion, catalogRef);
var bytes = context.ToArray();
Close();
@@ -214,56 +346,8 @@
context.Dispose();
}
private (IndirectReferenceToken, int) CopyPagesTree(PageTreeNode treeNode, IndirectReferenceToken treeParentReference, IPdfTokenScanner tokenScanner)
{
Debug.Assert(!treeNode.IsPage);
var currentNodeReference = context.ReserveNumberToken();
var pageReferences = new List<IndirectReferenceToken>();
var nodeCount = 0;
foreach (var pageNode in treeNode.Children)
{
IndirectReferenceToken newEntry;
if (!pageNode.IsPage)
{
var count = 0;
(newEntry, count) = CopyPagesTree(pageNode, currentNodeReference, tokenScanner);
nodeCount += count;
}
else
{
newEntry = CopyPageNode(pageNode, currentNodeReference, tokenScanner);
++nodeCount;
}
pageReferences.Add(newEntry);
}
var newPagesNode = new Dictionary<NameToken, IToken>
{
{ NameToken.Type, NameToken.Pages },
{ NameToken.Kids, new ArrayToken(pageReferences) },
{ NameToken.Count, new NumericToken(nodeCount) },
{ NameToken.Parent, treeParentReference }
};
foreach (var pair in treeNode.NodeDictionary.Data)
{
if (IgnoreKeyForPagesNode(pair))
{
continue;
}
newPagesNode[NameToken.Create(pair.Key)] = CopyToken(pair.Value, tokenScanner);
}
var pagesDictionary = new DictionaryToken(newPagesNode);
return (context.WriteToken(pagesDictionary, (int)currentNodeReference.Data.ObjectNumber), nodeCount);
}
private IndirectReferenceToken CopyPageNode(PageTreeNode pageNode, IndirectReferenceToken parentPagesObject, IPdfTokenScanner tokenScanner)
private IndirectReferenceToken CopyPageNode(PageTreeNode pageNode, IndirectReferenceToken parentPagesObject, IPdfTokenScanner tokenScanner,
IDictionary<IndirectReferenceToken, IndirectReferenceToken> referencesFromDocument)
{
Debug.Assert(pageNode.IsPage);
@@ -283,91 +367,84 @@
continue;
}
pageDictionary.Add(NameToken.Create(name), CopyToken(token, tokenScanner));
pageDictionary.Add(NameToken.Create(name), CopyToken(token, tokenScanner, referencesFromDocument));
}
return context.WriteToken(new DictionaryToken(pageDictionary));
}
/// <summary>
/// Deep-copy a token into the new document, resolving indirect references on the way:
/// the referenced content is copied to the new document's stream and the reference is
/// replaced by the matching reference in the new document.
/// </summary>
/// <param name="tokenToCopy">Token to inspect for references.</param>
/// <param name="tokenScanner">Scanner used to read the referenced content from the original document.</param>
/// <param name="referencesFromDocument">
/// Map from references of the original document to the references already assigned in the
/// new document. It is read to reuse earlier copies and updated when a new object is copied.
/// </param>
/// <returns>A copy of the token in which every indirect reference points into the new document.</returns>
/// <exception cref="NotSupportedException">The token is, or contains, an <see cref="ObjectToken"/>.</exception>
private IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner, IDictionary<IndirectReferenceToken, IndirectReferenceToken> referencesFromDocument)
{
    // These tokens need to be deep copied because they could contain references, which have to be updated.
    switch (tokenToCopy)
    {
        case DictionaryToken dictionaryToken:
        {
            var newContent = new Dictionary<NameToken, IToken>();
            foreach (var setPair in dictionaryToken.Data)
            {
                var name = setPair.Key;
                var token = setPair.Value;
                newContent.Add(NameToken.Create(name), CopyToken(token, tokenScanner, referencesFromDocument));
            }

            return new DictionaryToken(newContent);
        }
        case ArrayToken arrayToken:
        {
            var newArray = new List<IToken>(arrayToken.Length);
            foreach (var token in arrayToken.Data)
            {
                newArray.Add(CopyToken(token, tokenScanner, referencesFromDocument));
            }

            return new ArrayToken(newArray);
        }
        case IndirectReferenceToken referenceToken:
        {
            if (referencesFromDocument.TryGetValue(referenceToken, out var newReferenceToken))
            {
                return newReferenceToken;
            }

            // Reserve the new object number and register the mapping BEFORE descending into
            // the referenced object. If that object refers back to itself, directly or through
            // other objects, the recursive call finds the mapping above and stops instead of
            // recursing without end.
            newReferenceToken = context.ReserveNumberToken();
            referencesFromDocument.Add(referenceToken, newReferenceToken);

            var tokenObject = DirectObjectFinder.Get<IToken>(referenceToken.Data, tokenScanner);
            Debug.Assert(!(tokenObject is IndirectReferenceToken));

            var newToken = CopyToken(tokenObject, tokenScanner, referencesFromDocument);
            context.WriteToken(newToken, (int)newReferenceToken.Data.ObjectNumber);

            return newReferenceToken;
        }
        case StreamToken streamToken:
        {
            var properties = CopyToken(streamToken.StreamDictionary, tokenScanner, referencesFromDocument) as DictionaryToken;
            Debug.Assert(properties != null);

            // Only the stream dictionary can hold references; the data is reused as is.
            var bytes = streamToken.Data;
            return new StreamToken(properties, bytes);
        }
        case ObjectToken _:
        {
            // Since we don't write token directly to the stream.
            // We can't know the offset. Therefore the token would be invalid
            throw new NotSupportedException("Copying a Object token is not supported");
        }
    }

    // Any other token kind is returned unchanged.
    return tokenToCopy;
}
private static bool IgnoreKeyForPagesNode(KeyValuePair<string, IToken> token)
{
return string.Equals(token.Key, NameToken.Type.Data, StringComparison.OrdinalIgnoreCase)
|| string.Equals(token.Key, NameToken.Kids.Data, StringComparison.OrdinalIgnoreCase)
|| string.Equals(token.Key, NameToken.Count.Data, StringComparison.OrdinalIgnoreCase)
|| string.Equals(token.Key, NameToken.Parent.Data, StringComparison.OrdinalIgnoreCase);
}
}
}
}