diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs
new file mode 100644
index 00000000..05df66aa
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs
@@ -0,0 +1,26 @@
+namespace UglyToad.PdfPig.Tests.Writer
+{
+    using Integration;
+    using PdfPig.Writer;
+    using Xunit;
+
+    public class PdfMergerTests
+    {
+        [Fact]
+        public void CanMerge2SimpleDocuments()
+        {
+            var one = IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf");
+            var two = IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf");
+
+            var result = PdfMerger.Merge(one, two);
+
+            // FIX: Enable LenientParsingOff
+            using (var document = PdfDocument.Open(result/*, ParsingOptions.LenientParsingOff */))
+            {
+                Assert.Equal(2, document.NumberOfPages);
+
+                Assert.Equal(1.7m, document.Version);
+            }
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Writer/Merging/ObjectsTree.cs b/src/UglyToad.PdfPig/Writer/Merging/ObjectsTree.cs
new file mode 100644
index 00000000..8580b99e
--- /dev/null
+++ b/src/UglyToad.PdfPig/Writer/Merging/ObjectsTree.cs
@@ -0,0 +1,24 @@
+namespace UglyToad.PdfPig.Writer.Merging
+{
+    using Content;
+    using CrossReference;
+    using Tokens;
+
+    // I don't see the purpose of this class, @Elliot maybe you can explain it to me
+    internal class ObjectsTree
+    {
+        public TrailerDictionary TrailerDictionary { get; }
+
+        public ObjectToken TrailerObject { get; }
+
+        public Catalog Catalog { get; }
+
+        public ObjectsTree(TrailerDictionary trailerDictionary, ObjectToken trailerObject,
+            Catalog catalog)
+        {
+            TrailerDictionary = trailerDictionary;
+            TrailerObject = trailerObject;
+            Catalog = catalog;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig/Writer/PdfMerger.cs b/src/UglyToad.PdfPig/Writer/PdfMerger.cs
new file mode 100644
index 00000000..7d7d3ca1
--- /dev/null
+++ b/src/UglyToad.PdfPig/Writer/PdfMerger.cs
@@ -0,0 +1,421 @@
+namespace UglyToad.PdfPig.Writer
+{
+    using System;
+    using System.Collections.Generic;
+    using System.Diagnostics;
+    using System.IO;
+    using System.Linq;
+    using Content;
+    using Core;
+    using CrossReference;
+    using Encryption;
+    using Filters;
+    using Logging;
+    using Merging;
+    using Parser;
+    using Parser.FileStructure;
+    using Parser.Parts;
+    using Tokenization.Scanner;
+    using Tokens;
+    using UglyToad.PdfPig.Exceptions;
+    using UglyToad.PdfPig.Graphics.Operations;
+    using UglyToad.PdfPig.Writer.Fonts;
+
+    /// <summary>
+    /// Merges PDF documents into each other.
+    /// </summary>
+    public static class PdfMerger
+    {
+        private static readonly ILog Log = new NoOpLog();
+
+        private static readonly IFilterProvider FilterProvider = new MemoryFilterProvider(new DecodeParameterResolver(Log),
+            new PngPredictor(), Log);
+
+        /// <summary>
+        /// Merge two PDF documents together with the pages from <paramref name="file1"/>
+        /// followed by <paramref name="file2"/>.
+        /// </summary>
+        public static byte[] Merge(string file1, string file2)
+        {
+            if (file1 == null)
+            {
+                throw new ArgumentNullException(nameof(file1));
+            }
+
+            if (file2 == null)
+            {
+                throw new ArgumentNullException(nameof(file2));
+            }
+
+            return Merge(new[]
+            {
+                File.ReadAllBytes(file1),
+                File.ReadAllBytes(file2)
+            });
+        }
+
+        /// <summary>
+        /// Merge the set of PDF documents.
+        /// </summary>
+        public static byte[] Merge(IReadOnlyList<byte[]> files)
+        {
+            if (files == null)
+            {
+                throw new ArgumentNullException(nameof(files));
+            }
+
+            const bool isLenientParsing = true;
+
+            using var documentBuilder = new DocumentBuilder();
+
+            foreach (var file in files)
+            {
+                var inputBytes = new ByteArrayInputBytes(file);
+                var coreScanner = new CoreTokenScanner(inputBytes);
+
+                var version = FileHeaderParser.Parse(coreScanner, true, Log);
+
+                var bruteForceSearcher = new BruteForceSearcher(inputBytes);
+                var xrefValidator = new XrefOffsetValidator(Log);
+                var objectChecker = new XrefCosOffsetChecker(Log, bruteForceSearcher);
+
+                var crossReferenceParser = new CrossReferenceParser(Log, xrefValidator, objectChecker, new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider));
+
+                var crossReferenceOffset = FileTrailerParser.GetFirstCrossReferenceOffset(inputBytes, coreScanner, isLenientParsing);
+
+                var objectLocations = bruteForceSearcher.GetObjectLocations();
+
+                CrossReferenceTable crossReference = null;
+
+                var locationProvider = new ObjectLocationProvider(() => crossReference, bruteForceSearcher);
+                // I'm not using the BruteForcedObjectLocationProvider because the offsets it gives are wrong by +2
+                // var locationProvider = new BruteForcedObjectLocationProvider(objectLocations);
+
+                var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, FilterProvider, NoOpEncryptionHandler.Instance);
+
+                crossReference = crossReferenceParser.Parse(inputBytes, isLenientParsing, crossReferenceOffset, version.OffsetInFile, pdfScanner, coreScanner);
+
+                var trailerDictionary = crossReference.Trailer;
+
+                var (trailerRef, catalogDictionaryToken) = ParseCatalog(crossReference, pdfScanner, out var encryptionDictionary);
+
+                if (encryptionDictionary != null)
+                {
+                    // TODO: Find a way to pass the password for each document...
+                    throw new PdfDocumentEncryptedException("Unable to merge a password protected document");
+                    // pdfScanner.UpdateEncryptionHandler(new EncryptionHandler(encryptionDictionary, trailerDictionary, new[] { string.Empty }));
+                }
+
+                var objectsTree = new ObjectsTree(trailerDictionary, pdfScanner.Get(trailerRef),
+                    CatalogFactory.Create(crossReference.Trailer.Root, catalogDictionaryToken, pdfScanner, isLenientParsing));
+
+                var objectsLocation = bruteForceSearcher.GetObjectLocations();
+
+                var root = pdfScanner.Get(trailerDictionary.Root);
+
+                var tokens = new List<IToken>();
+
+                pdfScanner.Seek(0);
+                while (pdfScanner.MoveNext())
+                {
+                    tokens.Add(pdfScanner.CurrentToken);
+                }
+
+                if (tokens.Count != objectLocations.Count)
+                {
+                    // Do we really need to check this?
+                    throw new PdfDocumentFormatException("Something went wrong while reading the file");
+                }
+
+                documentBuilder.AppendNewDocument(objectsTree, pdfScanner);
+            }
+
+            return documentBuilder.Build();
+        }
+
+        // This method is basically a copy of UglyToad.PdfPig.Parser.PdfDocumentFactory.ParseTrailer()
+        private static (IndirectReference, DictionaryToken) ParseCatalog(CrossReferenceTable crossReferenceTable,
+            IPdfTokenScanner pdfTokenScanner,
+            out EncryptionDictionary encryptionDictionary)
+        {
+            encryptionDictionary = null;
+
+            if (crossReferenceTable.Trailer.EncryptionToken != null)
+            {
+                if (!DirectObjectFinder.TryGet(crossReferenceTable.Trailer.EncryptionToken, pdfTokenScanner,
+                    out DictionaryToken encryptionDictionaryToken))
+                {
+                    throw new PdfDocumentFormatException($"Unrecognized encryption token in trailer: {crossReferenceTable.Trailer.EncryptionToken}.");
+                }
+
+                encryptionDictionary = EncryptionDictionaryFactory.Read(encryptionDictionaryToken, pdfTokenScanner);
+            }
+
+            var rootDictionary = DirectObjectFinder.Get<DictionaryToken>(crossReferenceTable.Trailer.Root, pdfTokenScanner);
+
+            if (!rootDictionary.ContainsKey(NameToken.Type))
+            {
+                rootDictionary = rootDictionary.With(NameToken.Type, NameToken.Catalog);
+            }
+
+            return (crossReferenceTable.Trailer.Root, rootDictionary);
+        }
+
+        // Note: I don't think making this disposable is a good idea.
+        // Also, any suggestion for a better name?
+        private class DocumentBuilder : IDisposable
+        {
+            private bool isDisposed = false;
+
+            private MemoryStream Memory = new MemoryStream();
+
+            private readonly BuilderContext Context = new BuilderContext();
+
+            private readonly List<IndirectReferenceToken> DocumentPages = new List<IndirectReferenceToken>();
+
+            private IndirectReferenceToken RootPagesIndirectReference;
+
+            public DocumentBuilder()
+            {
+                var reserved = Context.ReserveNumber();
+                RootPagesIndirectReference = new IndirectReferenceToken(new IndirectReference(reserved, 0));
+
+                WriteHeaderToStream();
+            }
+
+            private void WriteHeaderToStream()
+            {
+                // Copied from UglyToad.PdfPig.Writer.PdfDocumentBuilder
+                WriteString("%PDF-1.7", Memory);
+
+                // Files with binary data should contain a 2nd comment line followed by 4 bytes with values > 127
+                Memory.WriteText("%");
+                Memory.WriteByte(169);
+                Memory.WriteByte(205);
+                Memory.WriteByte(196);
+                Memory.WriteByte(210);
+                Memory.WriteNewLine();
+            }
+
+            public void AppendNewDocument(ObjectsTree newDocument, IPdfTokenScanner tokenScanner)
+            {
+                if (isDisposed)
+                {
+                    throw new ObjectDisposedException("Merger disposed already");
+                }
+
+                /*
+                 * I decided to have a /Pages object for each document's pages. That way I avoid resource name conflicts.
+                 * But I guess that doesn't matter either way? So this part could be eliminated?
+                 */
+                var pageReferences = ConstructPageReferences(newDocument.Catalog.PageTree, tokenScanner);
+
+                var pagesDictionary = new DictionaryToken(new Dictionary<NameToken, IToken>
+                {
+                    { NameToken.Type, NameToken.Pages },
+                    { NameToken.Kids, new ArrayToken(pageReferences) },
+                    { NameToken.Count, new NumericToken(pageReferences.Count) },
+                    { NameToken.Parent, RootPagesIndirectReference }
+                });
+
+                var pagesRef = Context.WriteObject(Memory, pagesDictionary);
+                DocumentPages.Add(new IndirectReferenceToken(pagesRef.Number));
+            }
+
+            private IReadOnlyList<IndirectReferenceToken> ConstructPageReferences(PageTreeNode treeNode, IPdfTokenScanner tokenScanner)
+            {
+                var reserved = Context.ReserveNumber();
+                var parentIndirect = new IndirectReferenceToken(new IndirectReference(reserved, 0));
+
+                var pageReferences = new List<IndirectReferenceToken>();
+                foreach (var pageNode in treeNode.Children)
+                {
+                    if (!pageNode.IsPage)
+                    {
+                        var nestedPageReferences = ConstructPageReferences(pageNode, tokenScanner);
+                        var pagesDictionary = new DictionaryToken(new Dictionary<NameToken, IToken>
+                        {
+                            { NameToken.Type, NameToken.Pages },
+                            { NameToken.Kids, new ArrayToken(nestedPageReferences) },
+                            { NameToken.Count, new NumericToken(nestedPageReferences.Count) },
+                            { NameToken.Parent, parentIndirect }
+                        });
+
+                        var pagesRef = Context.WriteObject(Memory, pagesDictionary);
+                        pageReferences.Add(new IndirectReferenceToken(pagesRef.Number));
+                        continue;
+                    }
+
+                    var pageDictionary = new Dictionary<NameToken, IToken>
+                    {
+                        { NameToken.Parent, parentIndirect },
+                    };
+
+                    foreach (var setPair in pageNode.NodeDictionary.Data)
+                    {
+                        var name = setPair.Key;
+                        var token = setPair.Value;
+
+                        if (name == NameToken.Parent)
+                        {
+                            // Skip the Parent token, since we have to reassign it
+                            continue;
+                        }
+
+                        pageDictionary.Add(NameToken.Create(name), CopyToken(token, tokenScanner));
+                    }
+
+                    var pageRef = Context.WriteObject(Memory, new DictionaryToken(pageDictionary), reserved);
+                    pageReferences.Add(new IndirectReferenceToken(pageRef.Number));
+                }
+
+                return pageReferences;
+            }
+
+            private IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner)
+            {
+                if (tokenToCopy is DictionaryToken dictionaryToken)
+                {
+                    var newContent = new Dictionary<NameToken, IToken>();
+                    foreach (var setPair in dictionaryToken.Data)
+                    {
+                        var name = setPair.Key;
+                        var token = setPair.Value;
+                        newContent.Add(NameToken.Create(name), CopyToken(token, tokenScanner));
+                    }
+
+                    return new DictionaryToken(newContent);
+                }
+                else if (tokenToCopy is ArrayToken arrayToken)
+                {
+                    var newArray = new List<IToken>(arrayToken.Length);
+                    foreach (var token in arrayToken.Data)
+                    {
+                        newArray.Add(CopyToken(token, tokenScanner));
+                    }
+
+                    return new ArrayToken(newArray);
+                }
+                else if (tokenToCopy is IndirectReferenceToken referenceToken)
+                {
+                    var tokenObject = DirectObjectFinder.Get<IToken>(referenceToken.Data, tokenScanner);
+
+                    // Is this even allowed?
+                    Debug.Assert(!(tokenObject is IndirectReferenceToken));
+
+                    var newToken = CopyToken(tokenObject, tokenScanner);
+                    var objToken = Context.WriteObject(Memory, newToken);
+                    return new IndirectReferenceToken(objToken.Number);
+                }
+                else
+                {
+                    // TODO: Should we do a deep copy of the token?
+                    return tokenToCopy;
+                }
+            }
+
+            public byte[] Build()
+            {
+                if (isDisposed)
+                {
+                    throw new ObjectDisposedException("Merger disposed already");
+                }
+
+                if (DocumentPages.Count < 1)
+                {
+                    throw new PdfDocumentFormatException("Empty document");
+                }
+
+                var pagesDictionary = new DictionaryToken(new Dictionary<NameToken, IToken>
+                {
+                    { NameToken.Type, NameToken.Pages },
+                    { NameToken.Kids, new ArrayToken(DocumentPages) },
+                    { NameToken.Count, new NumericToken(DocumentPages.Count) }
+                });
+
+                var pagesRef = Context.WriteObject(Memory, pagesDictionary, (int)RootPagesIndirectReference.Data.ObjectNumber);
+
+                var catalog = new DictionaryToken(new Dictionary<NameToken, IToken>
+                {
+                    { NameToken.Type, NameToken.Catalog },
+                    { NameToken.Pages, new IndirectReferenceToken(pagesRef.Number) }
+                });
+
+                var catalogRef = Context.WriteObject(Memory, catalog);
+
+                TokenWriter.WriteCrossReferenceTable(Context.ObjectOffsets, catalogRef, Memory, null);
+
+                var bytes = Memory.ToArray();
+
+                Dispose();
+
+                return bytes;
+            }
+
+            // Note: This method is copied from UglyToad.PdfPig.Writer.PdfDocumentBuilder
+            private static void WriteString(string text, MemoryStream stream, bool appendBreak = true)
+            {
+                var bytes = OtherEncodings.StringAsLatin1Bytes(text);
+                stream.Write(bytes, 0, bytes.Length);
+                if (appendBreak)
+                {
+                    stream.WriteNewLine();
+                }
+            }
+
+            public void Dispose()
+            {
+                if (isDisposed)
+                    return;
+
+                Memory.Dispose();
+                Memory = null;
+                isDisposed = true;
+            }
+        }
+
+        // Currently unused because the brute force search gives the wrong offset (+2)
+        private class BruteForcedObjectLocationProvider : IObjectLocationProvider
+        {
+            private readonly Dictionary<IndirectReference, long> objectLocations;
+            private readonly Dictionary<IndirectReference, ObjectToken> cache = new Dictionary<IndirectReference, ObjectToken>();
+
+            public BruteForcedObjectLocationProvider(IReadOnlyDictionary<IndirectReference, long> objectLocations)
+            {
+                this.objectLocations = objectLocations.ToDictionary(x => x.Key, x => x.Value);
+            }
+
+            public bool TryGetOffset(IndirectReference reference, out long offset)
+            {
+                var result = objectLocations.TryGetValue(reference, out offset);
+                //offset -= 2;
+                return result;
+            }
+
+            public void UpdateOffset(IndirectReference reference, long offset)
+            {
+                objectLocations[reference] = offset;
+            }
+
+            public bool TryGetCached(IndirectReference reference, out ObjectToken objectToken)
+            {
+                return cache.TryGetValue(reference, out objectToken);
+            }
+
+            public void Cache(ObjectToken objectToken, bool force = false)
+            {
+                if (!TryGetOffset(objectToken.Number, out var offsetExpected) || force)
+                {
+                    cache[objectToken.Number] = objectToken;
+                }
+
+                if (offsetExpected != objectToken.Position)
+                {
+                    return;
+                }
+
+                cache[objectToken.Number] = objectToken;
+            }
+        }
+    }
+}
\ No newline at end of file
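
For context outside the patch, here is a minimal usage sketch of the new PdfMerger API, mirroring what PdfMergerTests.CanMerge2SimpleDocuments exercises. The file paths and the MergeExample/Run names are illustrative placeholders, not part of the change.

    // Usage sketch only - assumes two existing PDF files on disk.
    using System;
    using System.IO;
    using UglyToad.PdfPig;
    using UglyToad.PdfPig.Writer;

    public static class MergeExample
    {
        public static void Run()
        {
            // Merge returns the merged document as a byte array.
            var merged = PdfMerger.Merge("first.pdf", "second.pdf");

            // The result can be written to disk...
            File.WriteAllBytes("merged.pdf", merged);

            // ...or opened directly to inspect the merged output.
            using (var document = PdfDocument.Open(merged))
            {
                Console.WriteLine(document.NumberOfPages);
            }
        }
    }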