Allow copying pages from another document

This is a naive implementation: if you copy multiple pages from the same document, the recipient document will be bloated with duplicated resources.
2026-03-10 00:23:29 +08:00 · 2020-12-20 19:13:19 +00:00
parent ba5bc1f031
commit 7126564eef
4 changed files with 326 additions and 12 deletions
--- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs
@@ -644,6 +644,53 @@
            }
        }

+        /// <summary>
+        /// Builds a two page document where the first page is written through the builder and the second
+        /// page is copied from an existing file, then re-opens the result and checks the text of both pages.
+        /// </summary>
+        [Fact]
+        public void CanCopyPage()
+        {
+            var documentBuilder = new PdfDocumentBuilder();
+
+            // Page 1: text written directly with a TrueType font registered on the builder.
+            var writtenPage = documentBuilder.AddPage(PageSize.A4);
+            var fontBytes = TrueTypeTestHelper.GetFileBytes("Andada-Regular.ttf");
+            var addedFont = documentBuilder.AddTrueTypeFont(fontBytes);
+            writtenPage.AddText("Hello", 12, new PdfPoint(30, 50), addedFont);
+            Assert.NotEmpty(writtenPage.CurrentStream.Operations);
+
+            // Page 2: content copied from the first page of an existing document.
+            using (var sourceDocument = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("bold-italic.pdf")))
+            {
+                var sourcePage = sourceDocument.GetPage(1);
+                var copiedPage = documentBuilder.AddPage(PageSize.A4);
+                copiedPage.CopyFrom(sourcePage);
+            }
+
+            var outputBytes = documentBuilder.Build();
+            Assert.NotEmpty(outputBytes);
+
+            WriteFile(nameof(CanCopyPage), outputBytes);
+
+            // Re-open the generated bytes and verify both pages survived the round trip.
+            using (var resultDocument = PdfDocument.Open(outputBytes))
+            {
+                Assert.Equal(2, resultDocument.NumberOfPages);
+
+                var firstPage = resultDocument.GetPage(1);
+                Assert.Equal("Hello", firstPage.Text);
+
+                var secondPage = resultDocument.GetPage(2);
+                Assert.Equal("Lorem ipsum dolor sit amet, consectetur adipiscing elit. ", secondPage.Text);
+            }
+        }
+
        private static void WriteFile(string name, byte[] bytes, string extension = "pdf")
        {
            try
--- a/src/UglyToad.PdfPig/Content/Page.cs
+++ b/src/UglyToad.PdfPig/Content/Page.cs
@@ -17,7 +17,7 @@
    public class Page
    {
        private readonly AnnotationProvider annotationProvider;
-        private readonly IPdfTokenScanner pdfScanner;
+        internal readonly IPdfTokenScanner pdfScanner;
        private readonly Lazy<string> textLazy;

        /// <summary>
--- a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs
+++ b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs
@@ -3,6 +3,7 @@ namespace UglyToad.PdfPig.Writer
 {
    using System;
    using System.Collections.Generic;
+    using System.Diagnostics;
    using System.IO;
    using System.Linq;
    using Content;
@@ -10,8 +11,10 @@ namespace UglyToad.PdfPig.Writer
    using Fonts;
    using PdfPig.Fonts.TrueType;
    using Graphics.Operations;
+    using Parser.Parts;
    using PdfPig.Fonts.Standard14Fonts;
    using PdfPig.Fonts.TrueType.Parser;
+    using Tokenization.Scanner;
    using Tokens;

    using Util.JetBrains.Annotations;
@@ -25,6 +28,9 @@ namespace UglyToad.PdfPig.Writer
        private readonly Dictionary<int, PdfPageBuilder> pages = new Dictionary<int, PdfPageBuilder>();
        private readonly Dictionary<Guid, FontStored> fonts = new Dictionary<Guid, FontStored>();
        private readonly Dictionary<Guid, ImageStored> images = new Dictionary<Guid, ImageStored>();
+        private readonly Dictionary<IndirectReferenceToken, IToken> unwrittenTokens = new Dictionary<IndirectReferenceToken, IToken>();
+
+        internal int fontId = 0;

        /// <summary>
        /// The standard of PDF/A compliance of the generated document. Defaults to <see cref="PdfAStandard.None"/>.
@@ -50,7 +56,12 @@ namespace UglyToad.PdfPig.Writer
        /// <summary>
        /// The fonts currently available in the document builder added via <see cref="AddTrueTypeFont"/> or <see cref="AddStandard14Font"/>. Keyed by id for internal purposes.
        /// </summary>
-        internal IReadOnlyDictionary<Guid, IWritingFont> Fonts => fonts.ToDictionary(x => x.Key, x => x.Value.FontProgram);
+        internal IReadOnlyDictionary<Guid, FontStored> Fonts => fonts;
+
+        /// <summary>
+        /// The images currently available in the document builder added via <see cref="AddImage"/>. Keyed by id for internal purposes.
+        /// </summary>
+        /// <remarks>Returns the builder's backing dictionary itself, not a copy.</remarks>
+        internal IReadOnlyDictionary<Guid, ImageStored> Images => images;

        /// <summary>
        /// Determines whether the bytes of the TrueType font file provided can be used in a PDF document.
@@ -116,8 +127,7 @@ namespace UglyToad.PdfPig.Writer
            {
                var font = TrueTypeFontParser.Parse(new TrueTypeDataBytes(new ByteArrayInputBytes(fontFileBytes)));
                var id = Guid.NewGuid();
-                var i = fonts.Count;
-                var added = new AddedFont(id, NameToken.Create($"F{i}"));
+                var added = new AddedFont(id, NameToken.Create($"F{fontId++}"));
                fonts[id] = new FontStored(added, new TrueTypeWritingFont(font, fontFileBytes));

                return added;
@@ -141,7 +151,7 @@ namespace UglyToad.PdfPig.Writer
            }

            var id = Guid.NewGuid();
-            var name = NameToken.Create($"F{fonts.Count}");
+            var name = NameToken.Create($"F{fontId++}");
            var added = new AddedFont(id, name);
            fonts[id] = new FontStored(added, new Standard14WritingFont(Standard14.GetAdobeFontMetrics(type)));

@@ -259,6 +269,11 @@ namespace UglyToad.PdfPig.Writer
                    context.WriteObject(memory, streamToken, image.Value.ObjectNumber);
                }

+                foreach (var tokenSet in unwrittenTokens)
+                {
+                    context.WriteObject(memory, tokenSet.Value, (int)tokenSet.Key.Data.ObjectNumber);
+                }
+
                var procSet = new List<NameToken>
                {
                    NameToken.Create("PDF"),
@@ -278,9 +293,7 @@ namespace UglyToad.PdfPig.Writer
                    var fontsDictionary = new DictionaryToken(fontsWritten.Select(x => (fonts[x.Key].FontKey.Name, (IToken)new IndirectReferenceToken(x.Value.Number)))
                        .ToDictionary(x => x.Item1, x => x.Item2));

-                    var fontsDictionaryRef = context.WriteObject(memory, fontsDictionary);
-
-                    resources.Add(NameToken.Font, new IndirectReferenceToken(fontsDictionaryRef.Number));
+                    resources.Add(NameToken.Font, fontsDictionary);
                }

                var reserved = context.ReserveNumber();
@@ -301,8 +314,24 @@ namespace UglyToad.PdfPig.Writer
                    {
                        foreach (var kvp in page.Value.Resources)
                        {
-                            // TODO: combine resources if value is dictionary or array, otherwise overwrite.
-                            individualResources[kvp.Key] = kvp.Value;
+                            var value = kvp.Value;
+                            if (individualResources.TryGetValue(kvp.Key, out var pageToken))
+                            {
+                                if (pageToken is DictionaryToken leftDictionary && value is DictionaryToken rightDictionary)
+                                {
+                                    var merged = leftDictionary.Data.ToDictionary(k => NameToken.Create(k.Key), v => v.Value);
+                                    foreach (var set in rightDictionary.Data)
+                                    {
+                                        merged[NameToken.Create(set.Key)] = set.Value;
+                                    }
+
+                                    value = new DictionaryToken(merged);
+
+                                }
+                                // Else override
+                            }
+
+                            individualResources[kvp.Key] = value;
                        }
                    }

@@ -392,6 +421,75 @@ namespace UglyToad.PdfPig.Writer
            }
        }

+        /// <summary>
+        /// For each source document (identified by its token scanner), the source references that have already
+        /// been copied into this builder and the new reference each one was mapped to. This ensures every source
+        /// object is copied at most once and that cycles of indirect references terminate.
+        /// </summary>
+        private readonly Dictionary<IPdfTokenScanner, Dictionary<IndirectReference, IndirectReferenceToken>> copiedReferences =
+            new Dictionary<IPdfTokenScanner, Dictionary<IndirectReference, IndirectReferenceToken>>();
+
+        /// <summary>
+        /// Deep copies a token from another document into this builder. Dictionaries, arrays and streams are
+        /// rebuilt recursively; every indirect reference found on the way is resolved through
+        /// <paramref name="tokenScanner"/>, its target is queued to be written by this builder under a newly
+        /// reserved object number, and the reference is replaced by one pointing at that new number.
+        /// </summary>
+        /// <param name="tokenToCopy">Token to inspect for references.</param>
+        /// <param name="tokenScanner">Scanner used to read the referenced objects from the original document.</param>
+        /// <returns>
+        /// The copied token with all references updated. Tokens that cannot contain references are returned as-is.
+        /// </returns>
+        /// <exception cref="NotSupportedException"><paramref name="tokenToCopy"/> is an <see cref="ObjectToken"/>.</exception>
+        internal IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner)
+        {
+            // These tokens need to be deep copied because they can contain references which have to be updated.
+            switch (tokenToCopy)
+            {
+                case DictionaryToken dictionaryToken:
+                    {
+                        var newContent = new Dictionary<NameToken, IToken>();
+                        foreach (var setPair in dictionaryToken.Data)
+                        {
+                            newContent.Add(NameToken.Create(setPair.Key), CopyToken(setPair.Value, tokenScanner));
+                        }
+
+                        return new DictionaryToken(newContent);
+                    }
+                case ArrayToken arrayToken:
+                    {
+                        var newArray = new List<IToken>(arrayToken.Length);
+                        foreach (var token in arrayToken.Data)
+                        {
+                            newArray.Add(CopyToken(token, tokenScanner));
+                        }
+
+                        return new ArrayToken(newArray);
+                    }
+                case IndirectReferenceToken referenceToken:
+                    {
+                        if (!copiedReferences.TryGetValue(tokenScanner, out var copiedFromSource))
+                        {
+                            copiedFromSource = new Dictionary<IndirectReference, IndirectReferenceToken>();
+                            copiedReferences[tokenScanner] = copiedFromSource;
+                        }
+
+                        if (copiedFromSource.TryGetValue(referenceToken.Data, out var alreadyCopied))
+                        {
+                            // Either copied earlier, or currently being copied further up the call stack (a cycle).
+                            return alreadyCopied;
+                        }
+
+                        // Reserve and register the new reference BEFORE recursing, otherwise an object which
+                        // (directly or indirectly) references itself would recurse forever.
+                        var reserved = context.ReserveNumber();
+                        var newReference = new IndirectReferenceToken(new IndirectReference(reserved, 0));
+                        copiedFromSource[referenceToken.Data] = newReference;
+
+                        var tokenObject = DirectObjectFinder.Get<IToken>(referenceToken.Data, tokenScanner);
+
+                        Debug.Assert(!(tokenObject is IndirectReferenceToken));
+
+                        // A reference to an object which cannot be found is equivalent to the null object,
+                        // so write an explicit null rather than handing a null token to the writer.
+                        var newToken = tokenObject == null
+                            ? (IToken)NullToken.Instance
+                            : CopyToken(tokenObject, tokenScanner);
+
+                        unwrittenTokens.Add(newReference, newToken);
+
+                        return newReference;
+                    }
+                case StreamToken streamToken:
+                    {
+                        var properties = CopyToken(streamToken.StreamDictionary, tokenScanner) as DictionaryToken;
+                        Debug.Assert(properties != null);
+
+                        // The stream bytes are reused unchanged, only the dictionary can hold references.
+                        var bytes = streamToken.Data;
+                        return new StreamToken(properties, bytes);
+                    }
+
+                case ObjectToken _:
+                    {
+                        // Since we don't write token directly to the stream.
+                        // We can't know the offset. Therefore the token would be invalid
+                        throw new NotSupportedException("Copying a Object token is not supported");
+                    }
+            }
+
+            return tokenToCopy;
+        }
+
        private static StreamToken WriteContentStream(IReadOnlyList<IGraphicsStateOperation> content)
        {
            using (var memoryStream = new MemoryStream())
--- a/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs
+++ b/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs
@@ -15,7 +15,9 @@
    using Images;
    using System;
    using System.Collections.Generic;
+    using System.Diagnostics;
    using System.IO;
+    using System.Linq;
    using PdfFonts;
    using Tokens;
    using Graphics.Operations.PathPainting;
@@ -228,7 +230,7 @@
                throw new ArgumentNullException(nameof(text));
            }

-            if (!documentBuilder.Fonts.TryGetValue(font.Id, out var fontProgram))
+            if (!documentBuilder.Fonts.TryGetValue(font.Id, out var fontStore))
            {
                throw new ArgumentException($"No font has been added to the PdfDocumentBuilder with Id: {font.Id}. " +
                                            $"Use {nameof(documentBuilder.AddTrueTypeFont)} to register a font.", nameof(font));
@@ -239,6 +241,8 @@
                throw new ArgumentOutOfRangeException(nameof(fontSize), "Font size must be greater than 0");
            }

+            var fontProgram = fontStore.FontProgram;
+
            var fm = fontProgram.GetFontMatrix();

            var textMatrix = TransformationMatrix.FromValues(1, 0, 0, 1, position.X, position.Y);
@@ -271,7 +275,7 @@
                throw new ArgumentNullException(nameof(text));
            }

-            if (!documentBuilder.Fonts.TryGetValue(font.Id, out var fontProgram))
+            if (!documentBuilder.Fonts.TryGetValue(font.Id, out var fontStore))
            {
                throw new ArgumentException($"No font has been added to the PdfDocumentBuilder with Id: {font.Id}. " +
                                            $"Use {nameof(documentBuilder.AddTrueTypeFont)} to register a font.", nameof(font));
@@ -282,6 +286,8 @@
                throw new ArgumentOutOfRangeException(nameof(fontSize), "Font size must be greater than 0");
            }

+            var fontProgram = fontStore.FontProgram;
+
            var fm = fontProgram.GetFontMatrix();

            var textMatrix = TransformationMatrix.FromValues(1, 0, 0, 1, position.X, position.Y);
@@ -497,6 +503,169 @@
            return new AddedImage(reference, png.Width, png.Height);
        }

+        /// <summary>
+        /// Copies the content of a page from another document onto this page. The operations of the source page
+        /// are appended to a content stream of this page (a new one if the current stream already has operations)
+        /// and the resources they use are copied into this page's resources. Fonts and XObjects whose names are
+        /// already in use are renamed, and the copied operations are updated to use the new names.
+        /// </summary>
+        /// <param name="srcPage">Page to be copied.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="srcPage"/> is null.</exception>
+        /// <exception cref="PdfDocumentFormatException">
+        /// A font or XObject resource of the source page is not stored as an indirect reference.
+        /// </exception>
+        public void CopyFrom(Page srcPage)
+        {
+            if (srcPage == null)
+            {
+                throw new ArgumentNullException(nameof(srcPage));
+            }
+
+            if (CurrentStream.Operations.Count > 0)
+            {
+                NewContentStreamAfter();
+            }
+
+            var destinationStream = CurrentStream;
+
+            if (!srcPage.Dictionary.TryGet(NameToken.Resources, srcPage.pdfScanner, out DictionaryToken srcResourceDictionary))
+            {
+                // If the page doesn't have resources, then we copy the entire content stream, since no operation would collide
+                // with the ones already written
+                destinationStream.Operations.AddRange(srcPage.Operations);
+                return;
+            }
+
+            // TODO: How should we handle any other token in the page dictionary (Eg. LastModified, MediaBox, CropBox, BleedBox, TrimBox, ArtBox,
+            //      BoxColorInfo, Rotate, Group, Thumb, B, Dur, Trans, Annots, AA, Metadata, PieceInfo, StructParents, ID, PZ, SeparationInfo, Tabs,
+            //      TemplateInstantiated, PresSteps, UserUnit, VP)
+
+            var operations = new List<IGraphicsStateOperation>(srcPage.Operations);
+
+            // We need to relocate the resources, and we have to make sure that none of the resources collide with
+            // the already written operation's resources
+            foreach (var set in srcResourceDictionary.Data)
+            {
+                var nameToken = NameToken.Create(set.Key);
+                if (nameToken == NameToken.Font || nameToken == NameToken.Xobject)
+                {
+                    // Fonts and XObjects are handled separately below because their names may need to change.
+                    continue;
+                }
+
+                if (!resourcesDictionary.ContainsKey(nameToken))
+                {
+                    // This type of resource doesn't currently exist in the page, so we can copy it with no problem
+                    resourcesDictionary[nameToken] = documentBuilder.CopyToken(set.Value, srcPage.pdfScanner);
+                }
+
+                // TODO: I need to find a test case
+                // It would have ExtendedGraphics or colorspaces, etc...
+            }
+
+            if (srcResourceDictionary.TryGet(NameToken.Font, srcPage.pdfScanner, out DictionaryToken fontsDictionary))
+            {
+                var pageFontsDictionary = GetEditableResourceEntries(NameToken.Font);
+
+                // A source font name is unusable if a font added through the document builder, or a font already
+                // present in this page's resources, has the same name.
+                var takenFontNames = new HashSet<string>(documentBuilder.Fonts.Values.Select(f => f.FontKey.Name.Data));
+                takenFontNames.UnionWith(pageFontsDictionary.Keys.Select(k => k.Data));
+
+                var fontRenames = PlanResourceRenames(fontsDictionary.Data.Keys.ToList(), takenFontNames,
+                    () => $"F{documentBuilder.fontId++}");
+
+                if (fontRenames.Count > 0)
+                {
+                    // All renames are applied in one pass over the ORIGINAL names. Renaming font by font could
+                    // rename an operation twice when a new name equals another source font's name.
+                    operations = operations.Select(op =>
+                    {
+                        if (op is SetFontAndSize fontAndSizeOperation
+                            && fontRenames.TryGetValue(fontAndSizeOperation.Font.Data, out var renamedFont))
+                        {
+                            return new SetFontAndSize(NameToken.Create(renamedFont), fontAndSizeOperation.Size);
+                        }
+
+                        return op;
+                    }).ToList();
+                }
+
+                foreach (var fontSet in fontsDictionary.Data)
+                {
+                    if (!(fontSet.Value is IndirectReferenceToken fontReferenceToken))
+                    {
+                        throw new PdfDocumentFormatException($"Expected a IndirectReferenceToken for the font, got a {fontSet.Value.GetType().Name}");
+                    }
+
+                    var targetName = fontRenames.TryGetValue(fontSet.Key, out var newFontName) ? newFontName : fontSet.Key;
+
+                    pageFontsDictionary.Add(NameToken.Create(targetName), documentBuilder.CopyToken(fontReferenceToken, srcPage.pdfScanner));
+                }
+
+                resourcesDictionary[NameToken.Font] = new DictionaryToken(pageFontsDictionary);
+            }
+
+            if (srcResourceDictionary.TryGet(NameToken.Xobject, srcPage.pdfScanner, out DictionaryToken xobjectsDictionary))
+            {
+                var pageXobjectsDictionary = GetEditableResourceEntries(NameToken.Xobject);
+
+                // Names I0..I(imageKey - 1) are treated as used by images added to this page, in addition to
+                // whatever is already present in this page's XObject resources.
+                var takenXobjectNames = new HashSet<string>(Enumerable.Range(0, imageKey).Select(i => $"I{i}"));
+                takenXobjectNames.UnionWith(pageXobjectsDictionary.Keys.Select(k => k.Data));
+
+                var xobjectRenames = PlanResourceRenames(xobjectsDictionary.Data.Keys.ToList(), takenXobjectNames,
+                    () => $"I{imageKey++}");
+
+                if (xobjectRenames.Count > 0)
+                {
+                    // Point every XObject invocation at the new name, in a single pass (see fonts above).
+                    operations = operations.Select(op =>
+                    {
+                        if (op is InvokeNamedXObject invokeNamedOperation
+                            && xobjectRenames.TryGetValue(invokeNamedOperation.Name.Data, out var renamedXobject))
+                        {
+                            return new InvokeNamedXObject(NameToken.Create(renamedXobject));
+                        }
+
+                        return op;
+                    }).ToList();
+                }
+
+                foreach (var xobjectSet in xobjectsDictionary.Data)
+                {
+                    if (!(xobjectSet.Value is IndirectReferenceToken xobjectReferenceToken))
+                    {
+                        throw new PdfDocumentFormatException($"Expected a IndirectReferenceToken for the XObject, got a {xobjectSet.Value.GetType().Name}");
+                    }
+
+                    var targetName = xobjectRenames.TryGetValue(xobjectSet.Key, out var newXobjectName) ? newXobjectName : xobjectSet.Key;
+
+                    pageXobjectsDictionary.Add(NameToken.Create(targetName), documentBuilder.CopyToken(xobjectReferenceToken, srcPage.pdfScanner));
+                }
+
+                resourcesDictionary[NameToken.Xobject] = new DictionaryToken(pageXobjectsDictionary);
+            }
+
+            destinationStream.Operations.AddRange(operations);
+        }
+
+        /// <summary>
+        /// Returns a mutable copy of the entries this page currently holds for the given resource type,
+        /// or an empty dictionary when the page has no resources of that type yet.
+        /// </summary>
+        private Dictionary<NameToken, IToken> GetEditableResourceEntries(NameToken resourceType)
+        {
+            if (!resourcesDictionary.TryGetValue(resourceType, out var existingToken))
+            {
+                return new Dictionary<NameToken, IToken>();
+            }
+
+            if (!(existingToken is DictionaryToken existingDictionary))
+            {
+                throw new InvalidOperationException(
+                    $"Expected the page's {resourceType.Data} resources to be a DictionaryToken, got a {existingToken.GetType().Name}");
+            }
+
+            return existingDictionary.Data.ToDictionary(k => NameToken.Create(k.Key), v => v.Value);
+        }
+
+        /// <summary>
+        /// Decides which source resource names must be renamed. A source name is renamed when it is in
+        /// <paramref name="takenNames"/>; its replacement is the first value from <paramref name="nextCandidate"/>
+        /// that is neither taken, nor a source name, nor a replacement already handed out.
+        /// </summary>
+        private static Dictionary<string, string> PlanResourceRenames(IReadOnlyCollection<string> sourceNames,
+            HashSet<string> takenNames, Func<string> nextCandidate)
+        {
+            // Source names are reserved too, so a replacement can never equal a source name which keeps its name.
+            var reservedNames = new HashSet<string>(takenNames);
+            reservedNames.UnionWith(sourceNames);
+
+            var renames = new Dictionary<string, string>();
+            foreach (var sourceName in sourceNames)
+            {
+                if (!takenNames.Contains(sourceName))
+                {
+                    continue;
+                }
+
+                string replacement;
+                do
+                {
+                    replacement = nextCandidate();
+                } while (!reservedNames.Add(replacement));
+
+                renames[sourceName] = replacement;
+            }
+
+            return renames;
+        }
+
        private List<Letter> DrawLetters(string text, IWritingFont font, TransformationMatrix fontMatrix, decimal fontSize, TransformationMatrix textMatrix)
        {
            var horizontalScaling = 1;