mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
[Improved] PdfMerger: Allow to select the pages when merging documents (#248)
* PdfMerger: Allow selecting the pages when merging documents. Supersedes #245. * Impose an artificial limit on the number of pages that a node can contain. Also, use IReadOnlyList instead of IReadOnlyCollection.
This commit is contained in:
@@ -115,6 +115,45 @@
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
public void CanMergeWithSelection()
{
    // Select six pages of a single document in a shuffled order.
    var selection = new[] { 2, 1, 4, 3, 6, 5 };
    var sourcePath = IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf");
    var sourceBytes = File.ReadAllBytes(sourcePath);

    var merged = PdfMerger.Merge(new[] { sourceBytes }, new[] { selection });

    WriteFile(nameof(CanMergeWithSelection), merged);

    // Re-open the merged output with lenient parsing off and check every selected page made it.
    using (var mergedDocument = PdfDocument.Open(merged, ParsingOptions.LenientParsingOff))
    {
        Assert.Equal(6, mergedDocument.NumberOfPages);

        foreach (var mergedPage in mergedDocument.GetPages())
        {
            Assert.NotNull(mergedPage.Text);
        }
    }
}
|
||||
|
||||
[Fact]
public void CanMergeMultipleWithSelection()
{
    var firstPath = IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf");
    var secondPath = IntegrationHelpers.GetDocumentPath("Old Gutnish Internet Explorer.pdf");

    // One selection per input document, in the same order as the documents themselves.
    var documents = new[] { File.ReadAllBytes(firstPath), File.ReadAllBytes(secondPath) };
    var selections = new[]
    {
        new[] { 2, 1, 4, 3, 6, 5 },
        new[] { 3, 2, 1 }
    };

    var merged = PdfMerger.Merge(documents, selections);

    WriteFile(nameof(CanMergeMultipleWithSelection), merged);

    using (var mergedDocument = PdfDocument.Open(merged, ParsingOptions.LenientParsingOff))
    {
        // 6 pages selected from the first document + 3 from the second.
        Assert.Equal(9, mergedDocument.NumberOfPages);

        foreach (var mergedPage in mergedDocument.GetPages())
        {
            Assert.NotNull(mergedPage.Text);
        }
    }
}
|
||||
|
||||
private static void WriteFile(string name, byte[] bytes)
|
||||
{
|
||||
try
|
||||
|
||||
@@ -114,6 +114,15 @@
|
||||
return new DictionaryToken(result);
|
||||
}
|
||||
|
||||
/// <summary>
/// Create a new <see cref="DictionaryToken"/> from the supplied data.
/// </summary>
/// <param name="data">The data this dictionary will contain.</param>
/// <returns>A <see cref="DictionaryToken"/> constructed from <paramref name="data"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is <see langword="null"/>.</exception>
public static DictionaryToken With(IReadOnlyDictionary<string, IToken> data)
{
    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    return new DictionaryToken(data);
}
|
||||
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool Equals(IToken obj)
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
using Exceptions;
|
||||
using System.Linq;
|
||||
using Util;
|
||||
|
||||
/// <summary>
|
||||
/// Merges PDF documents into each other.
|
||||
@@ -29,7 +31,7 @@
|
||||
/// <summary>
|
||||
/// Merge two PDF documents together with the pages from <paramref name="file1"/> followed by <paramref name="file2"/>.
|
||||
/// </summary>
|
||||
public static byte[] Merge(string file1, string file2)
|
||||
public static byte[] Merge(string file1, string file2, IReadOnlyList<int> file1Selection = null, IReadOnlyList<int> file2Selection = null)
|
||||
{
|
||||
if (file1 == null)
|
||||
{
|
||||
@@ -45,7 +47,7 @@
|
||||
{
|
||||
File.ReadAllBytes(file1),
|
||||
File.ReadAllBytes(file2)
|
||||
});
|
||||
}, new [] { file1Selection, file2Selection });
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -67,13 +69,13 @@
|
||||
bytes.Add(File.ReadAllBytes(filePath));
|
||||
}
|
||||
|
||||
return Merge(bytes);
|
||||
return Merge(bytes, null);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Merge the set of PDF documents.
|
||||
/// </summary>
|
||||
public static byte[] Merge(IReadOnlyList<byte[]> files)
|
||||
public static byte[] Merge(IReadOnlyList<byte[]> files, IReadOnlyList<IReadOnlyList<int>> pagesBundle = null)
|
||||
{
|
||||
if (files == null)
|
||||
{
|
||||
@@ -84,8 +86,16 @@
|
||||
|
||||
var documentBuilder = new DocumentMerger();
|
||||
|
||||
foreach (var file in files)
|
||||
foreach (var fileIndex in Enumerable.Range(0, files.Count))
|
||||
{
|
||||
var file = files[fileIndex];
|
||||
|
||||
IReadOnlyList<int> pages = null;
|
||||
if (pagesBundle != null && fileIndex < pagesBundle.Count)
|
||||
{
|
||||
pages = pagesBundle[fileIndex];
|
||||
}
|
||||
|
||||
var inputBytes = new ByteArrayInputBytes(file);
|
||||
var coreScanner = new CoreTokenScanner(inputBytes);
|
||||
|
||||
@@ -112,7 +122,7 @@
|
||||
|
||||
var documentCatalog = CatalogFactory.Create(crossReference.Trailer.Root, catalogDictionaryToken, pdfScanner, isLenientParsing);
|
||||
|
||||
documentBuilder.AppendDocument(documentCatalog, version.Version, pdfScanner);
|
||||
documentBuilder.AppendDocument(documentCatalog, version.Version, pdfScanner, pages);
|
||||
}
|
||||
|
||||
return documentBuilder.Build();
|
||||
@@ -149,31 +159,153 @@
|
||||
private class DocumentMerger
|
||||
{
|
||||
private const decimal DefaultVersion = 1.2m;
|
||||
|
||||
|
||||
private const int ARTIFICIAL_NODE_LIMIT = 100;
|
||||
|
||||
private readonly PdfStreamWriter context = new PdfStreamWriter();
|
||||
private readonly List<IndirectReferenceToken> pagesTokenReferences = new List<IndirectReferenceToken>();
|
||||
private readonly IndirectReferenceToken rootPagesReference;
|
||||
|
||||
private decimal currentVersion = DefaultVersion;
|
||||
private int pageCount = 0;
|
||||
|
||||
private readonly Dictionary<IndirectReferenceToken, IndirectReferenceToken> referencesFromDocument =
|
||||
new Dictionary<IndirectReferenceToken, IndirectReferenceToken>();
|
||||
|
||||
/// <summary>
/// Create a merger and reserve, from the output writer, the indirect reference
/// stored as the root pages reference.
/// </summary>
public DocumentMerger() => rootPagesReference = context.ReserveNumberToken();
|
||||
|
||||
public void AppendDocument(Catalog documentCatalog, decimal version, IPdfTokenScanner tokenScanner)
|
||||
|
||||
/// <summary>
/// Append pages of a document to the merged output.
/// The copied pages are grouped under new intermediate pages nodes whose parent is the root pages reference.
/// A new node is started once the current one holds <c>ARTIFICIAL_NODE_LIMIT</c> pages, or when a resource
/// name reachable from the next page is already present among the resources gathered for the current node.
/// </summary>
/// <param name="catalog">Catalog of the document being appended; page nodes are looked up in it by page number.</param>
/// <param name="version">Version of the appended document; the merger keeps the highest version it has seen.</param>
/// <param name="tokenScanner">Scanner used to resolve tokens of the appended document.</param>
/// <param name="pages">
/// Page numbers to copy, in the order they should appear, or <c>null</c> to copy every page
/// (numbers 1 to the page count declared by the catalog). An empty list appends nothing.
/// </param>
public void AppendDocument(Catalog catalog, decimal version, IPdfTokenScanner tokenScanner, IReadOnlyList<int> pages)
{
    IEnumerable<int> pageIndices;
    if (pages == null)
    {
        var pagesCount = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
        if (pagesCount < 1)
        {
            return;
        }

        pageIndices = Enumerable.Range(1, pagesCount);
    }
    else if (pages.Count < 1)
    {
        return;
    }
    else
    {
        pageIndices = pages;
    }

    currentVersion = Math.Max(version, currentVersion);

    // Maps references of the appended document to the references they were copied to,
    // so an object shared by several pages is only written once.
    var referencesFromDocument = new Dictionary<IndirectReferenceToken, IndirectReferenceToken>();

    var currentNodeReference = context.ReserveNumberToken();
    var pagesReferences = new List<IndirectReferenceToken>();
    var resources = new Dictionary<string, IToken>();

    // True when the node, or any of its ancestors, declares a resource name
    // that has already been gathered for the node currently being built.
    bool DoesAEntryCollide(PageTreeNode node)
    {
        while (node != null)
        {
            var dictionary = node.NodeDictionary;
            if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary)
                && resourcesDictionary.Data.Keys.Any(resources.ContainsKey))
            {
                // This means that at least one of the resources collided
                return true;
            }

            /* TODO: How to handle?
             *  `Rotate`
             *  `CropBox`
             *  `MediaBox`
             */

            // No colliding entry was found, in this node
            // Keep walking up into the tree
            node = node.Parent;
        }

        return false;
    }

    // Gathers the resources declared by the node and all of its ancestors into the node being built.
    void CopyEntries(PageTreeNode node)
    {
        while (node != null)
        {
            var dictionary = node.NodeDictionary;
            if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary))
            {
                foreach (var pair in resourcesDictionary.Data)
                {
                    // Ancestors are visited nearest first. When several ancestors of the same page
                    // declare the same resource name, keep the nearest one rather than letting
                    // Dictionary.Add throw on the duplicate key.
                    if (resources.ContainsKey(pair.Key))
                    {
                        continue;
                    }

                    resources.Add(pair.Key, CopyToken(pair.Value, tokenScanner, referencesFromDocument));
                }
            }

            /* TODO: How to handle?
             *  `Rotate`
             *  `CropBox`
             *  `MediaBox`
             */

            // Keep walking up into the tree
            node = node.Parent;
        }
    }

    // Writes the node being built at its reserved object number and records it for the final page tree.
    void CreateTree()
    {
        if (pagesReferences.Count < 1)
        {
            throw new InvalidOperationException("Pages reference should always be more than 1 when executing this function");
        }

        var newPagesNode = new Dictionary<NameToken, IToken>
        {
            { NameToken.Type, NameToken.Pages },
            { NameToken.Kids, new ArrayToken(pagesReferences) },
            { NameToken.Count, new NumericToken(pagesReferences.Count) },
            { NameToken.Parent, rootPagesReference }
        };

        if (resources.Count > 0)
        {
            newPagesNode.Add(NameToken.Resources, DictionaryToken.With(resources));
        }

        var pagesDictionary = new DictionaryToken(newPagesNode);
        pagesTokenReferences.Add(context.WriteToken(pagesDictionary, (int)currentNodeReference.Data.ObjectNumber));

        pageCount += pagesReferences.Count;
    }

    foreach (var pageIndex in pageIndices)
    {
        var pageNode = catalog.GetPageNode(pageIndex);
        if (pagesReferences.Count >= ARTIFICIAL_NODE_LIMIT || DoesAEntryCollide(pageNode))
        {
            // Close the current node and start a fresh one for this page.
            CreateTree();

            currentNodeReference = context.ReserveNumberToken();
            pagesReferences = new List<IndirectReferenceToken>();
            resources = new Dictionary<string, IToken>();
        }

        CopyEntries(pageNode.Parent);
        pagesReferences.Add(CopyPageNode(pageNode, currentNodeReference, tokenScanner, referencesFromDocument));
    }

    if (pagesReferences.Count < 1)
    {
        throw new InvalidOperationException("Pages reference couldn't be less than 1 because we have reserved a indirect reference token");
    }

    CreateTree();
}
|
||||
|
||||
public byte[] Build()
|
||||
@@ -190,7 +322,7 @@
|
||||
{ NameToken.Count, new NumericToken(pageCount) }
|
||||
});
|
||||
|
||||
var pagesRef = context.WriteToken( pagesDictionary, (int)rootPagesReference.Data.ObjectNumber);
|
||||
var pagesRef = context.WriteToken(pagesDictionary, (int)rootPagesReference.Data.ObjectNumber);
|
||||
|
||||
var catalog = new DictionaryToken(new Dictionary<NameToken, IToken>
|
||||
{
|
||||
@@ -199,9 +331,9 @@
|
||||
});
|
||||
|
||||
var catalogRef = context.WriteToken(catalog);
|
||||
|
||||
|
||||
context.Flush(currentVersion, catalogRef);
|
||||
|
||||
|
||||
var bytes = context.ToArray();
|
||||
|
||||
Close();
|
||||
@@ -214,56 +346,8 @@
|
||||
context.Dispose();
|
||||
}
|
||||
|
||||
/// <summary>
/// Recursively copy a pages node and everything below it into the output.
/// </summary>
/// <param name="treeNode">The pages node to copy; it must not be a page.</param>
/// <param name="treeParentReference">Reference written as the parent of the copied node.</param>
/// <param name="tokenScanner">Scanner used to resolve tokens of the source document.</param>
/// <returns>The reference the copied node was written to, and the number of pages found below it.</returns>
private (IndirectReferenceToken, int) CopyPagesTree(PageTreeNode treeNode, IndirectReferenceToken treeParentReference, IPdfTokenScanner tokenScanner)
{
    Debug.Assert(!treeNode.IsPage);

    // Reserved first so the children can point back to this node as their parent.
    var nodeReference = context.ReserveNumberToken();

    var kids = new List<IndirectReferenceToken>();
    var leafCount = 0;
    foreach (var child in treeNode.Children)
    {
        if (child.IsPage)
        {
            kids.Add(CopyPageNode(child, nodeReference, tokenScanner));
            leafCount++;
            continue;
        }

        var (subtreeReference, subtreeCount) = CopyPagesTree(child, nodeReference, tokenScanner);
        leafCount += subtreeCount;
        kids.Add(subtreeReference);
    }

    var copiedNode = new Dictionary<NameToken, IToken>
    {
        { NameToken.Type, NameToken.Pages },
        { NameToken.Kids, new ArrayToken(kids) },
        { NameToken.Count, new NumericToken(leafCount) },
        { NameToken.Parent, treeParentReference }
    };

    // Carry over every other entry of the source node; the four structural keys were rebuilt above.
    foreach (var entry in treeNode.NodeDictionary.Data)
    {
        if (!IgnoreKeyForPagesNode(entry))
        {
            copiedNode[NameToken.Create(entry.Key)] = CopyToken(entry.Value, tokenScanner);
        }
    }

    var writtenReference = context.WriteToken(new DictionaryToken(copiedNode), (int)nodeReference.Data.ObjectNumber);

    return (writtenReference, leafCount);
}
|
||||
|
||||
private IndirectReferenceToken CopyPageNode(PageTreeNode pageNode, IndirectReferenceToken parentPagesObject, IPdfTokenScanner tokenScanner)
|
||||
private IndirectReferenceToken CopyPageNode(PageTreeNode pageNode, IndirectReferenceToken parentPagesObject, IPdfTokenScanner tokenScanner,
|
||||
IDictionary<IndirectReferenceToken, IndirectReferenceToken> referencesFromDocument)
|
||||
{
|
||||
Debug.Assert(pageNode.IsPage);
|
||||
|
||||
@@ -283,91 +367,84 @@
|
||||
continue;
|
||||
}
|
||||
|
||||
pageDictionary.Add(NameToken.Create(name), CopyToken(token, tokenScanner));
|
||||
pageDictionary.Add(NameToken.Create(name), CopyToken(token, tokenScanner, referencesFromDocument));
|
||||
}
|
||||
|
||||
return context.WriteToken(new DictionaryToken(pageDictionary));
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
/// The purpose of this method is to resolve indirect references. That means copying the referenced content
/// to the new document's stream and replacing each indirect reference with the correct/new one.
/// Dictionaries, arrays and streams are deep copied because they may contain references;
/// any other token is returned as is.
/// </summary>
/// <param name="tokenToCopy">Token to inspect for references.</param>
/// <param name="tokenScanner">Scanner used to get the content from the original document.</param>
/// <param name="referencesFromDocument">
/// Map from references of the original document to the references they were already copied to.
/// It is updated with every reference this call copies.
/// </param>
/// <returns>The copied token, with all the references it contains updated.</returns>
/// <exception cref="NotSupportedException">The token to copy is an <see cref="ObjectToken"/>.</exception>
private IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner, IDictionary<IndirectReferenceToken, IndirectReferenceToken> referencesFromDocument)
{
    // This token need to be deep copied, because they could contain reference. So we have to update them.
    switch (tokenToCopy)
    {
        case DictionaryToken dictionaryToken:
        {
            var newContent = new Dictionary<NameToken, IToken>();
            foreach (var setPair in dictionaryToken.Data)
            {
                var name = setPair.Key;
                var token = setPair.Value;
                newContent.Add(NameToken.Create(name), CopyToken(token, tokenScanner, referencesFromDocument));
            }

            return new DictionaryToken(newContent);
        }
        case ArrayToken arrayToken:
        {
            var newArray = new List<IToken>(arrayToken.Length);
            foreach (var token in arrayToken.Data)
            {
                newArray.Add(CopyToken(token, tokenScanner, referencesFromDocument));
            }

            return new ArrayToken(newArray);
        }
        case IndirectReferenceToken referenceToken:
        {
            if (referencesFromDocument.TryGetValue(referenceToken, out var newReferenceToken))
            {
                return newReferenceToken;
            }

            // Reserve the new object number and register the mapping BEFORE copying the content.
            // If the referenced object leads back to this same reference (a reference cycle),
            // the nested call finds the mapping above instead of recursing without end.
            newReferenceToken = context.ReserveNumberToken();
            referencesFromDocument.Add(referenceToken, newReferenceToken);

            var tokenObject = DirectObjectFinder.Get<IToken>(referenceToken.Data, tokenScanner);

            Debug.Assert(!(tokenObject is IndirectReferenceToken));

            var newToken = CopyToken(tokenObject, tokenScanner, referencesFromDocument);
            context.WriteToken(newToken, (int)newReferenceToken.Data.ObjectNumber);

            return newReferenceToken;
        }
        case StreamToken streamToken:
        {
            var properties = CopyToken(streamToken.StreamDictionary, tokenScanner, referencesFromDocument) as DictionaryToken;
            Debug.Assert(properties != null);

            var bytes = streamToken.Data;
            return new StreamToken(properties, bytes);
        }

        case ObjectToken _:
        {
            // Since we don't write token directly to the stream.
            // We can't know the offset. Therefore the token would be invalid
            throw new NotSupportedException("Copying a Object token is not supported");
        }
    }

    return tokenToCopy;
}
|
||||
|
||||
/// <summary>
/// Tell whether a dictionary entry is one of the keys Type, Kids, Count or Parent.
/// The key is compared ignoring case.
/// </summary>
/// <param name="token">The dictionary entry to check; only its key is inspected.</param>
/// <returns><c>true</c> when the key matches one of the four names, otherwise <c>false</c>.</returns>
private static bool IgnoreKeyForPagesNode(KeyValuePair<string, IToken> token)
{
    var key = token.Key;

    foreach (var ignoredName in new[] { NameToken.Type, NameToken.Kids, NameToken.Count, NameToken.Parent })
    {
        if (string.Equals(key, ignoredName.Data, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user