From 7126564eefbc30c205a165739d333dd6da3ddb0e Mon Sep 17 00:00:00 2001 From: InusualZ Date: Sun, 20 Dec 2020 19:13:19 +0000 Subject: [PATCH] Allow to copy pages from another document This is a naive implementation, because if you copy multiple pages from the same document, the recipient document would be bloated with duplicated resources --- .../Writer/PdfDocumentBuilderTests.cs | 47 +++++ src/UglyToad.PdfPig/Content/Page.cs | 2 +- .../Writer/PdfDocumentBuilder.cs | 116 +++++++++++- src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs | 173 +++++++++++++++++- 4 files changed, 326 insertions(+), 12 deletions(-) diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs index c57404e9..5796e0a9 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs @@ -644,6 +644,53 @@ } } + [Fact] + public void CanCopyPage() + { + + byte[] b; + { + var builder = new PdfDocumentBuilder(); + + var page1 = builder.AddPage(PageSize.A4); + + var file = TrueTypeTestHelper.GetFileBytes("Andada-Regular.ttf"); + + var font = builder.AddTrueTypeFont(file); + + page1.AddText("Hello", 12, new PdfPoint(30, 50), font); + + Assert.NotEmpty(page1.CurrentStream.Operations); + + + using (var readDocument = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("bold-italic.pdf"))) + { + var rpage = readDocument.GetPage(1); + + var page2 = builder.AddPage(PageSize.A4); + page2.CopyFrom(rpage); + } + + b = builder.Build(); + Assert.NotEmpty(b); + } + + WriteFile(nameof(CanCopyPage), b); + + using (var document = PdfDocument.Open(b)) + { + Assert.Equal( 2, document.NumberOfPages); + + var page1 = document.GetPage(1); + + Assert.Equal("Hello", page1.Text); + + var page2 = document.GetPage(2); + + Assert.Equal("Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
", page2.Text); + } + } + private static void WriteFile(string name, byte[] bytes, string extension = "pdf") { try diff --git a/src/UglyToad.PdfPig/Content/Page.cs b/src/UglyToad.PdfPig/Content/Page.cs index 9268147e..e3886e2a 100644 --- a/src/UglyToad.PdfPig/Content/Page.cs +++ b/src/UglyToad.PdfPig/Content/Page.cs @@ -17,7 +17,7 @@ public class Page { private readonly AnnotationProvider annotationProvider; - private readonly IPdfTokenScanner pdfScanner; + internal readonly IPdfTokenScanner pdfScanner; private readonly Lazy textLazy; /// diff --git a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs index 4e022bb9..69783958 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs @@ -3,6 +3,7 @@ namespace UglyToad.PdfPig.Writer { using System; using System.Collections.Generic; + using System.Diagnostics; using System.IO; using System.Linq; using Content; @@ -10,8 +11,10 @@ namespace UglyToad.PdfPig.Writer using Fonts; using PdfPig.Fonts.TrueType; using Graphics.Operations; + using Parser.Parts; using PdfPig.Fonts.Standard14Fonts; using PdfPig.Fonts.TrueType.Parser; + using Tokenization.Scanner; using Tokens; using Util.JetBrains.Annotations; @@ -25,6 +28,9 @@ namespace UglyToad.PdfPig.Writer private readonly Dictionary pages = new Dictionary(); private readonly Dictionary fonts = new Dictionary(); private readonly Dictionary images = new Dictionary(); + private readonly Dictionary unwrittenTokens = new Dictionary(); + + internal int fontId = 0; /// /// The standard of PDF/A compliance of the generated document. Defaults to . @@ -50,7 +56,12 @@ namespace UglyToad.PdfPig.Writer /// /// The fonts currently available in the document builder added via or . Keyed by id for internal purposes. 
/// - internal IReadOnlyDictionary Fonts => fonts.ToDictionary(x => x.Key, x => x.Value.FontProgram); + internal IReadOnlyDictionary Fonts => fonts; + + /// + /// The images currently available in the document builder added via . Keyed by id for internal purposes. + /// + internal IReadOnlyDictionary Images => images; /// /// Determines whether the bytes of the TrueType font file provided can be used in a PDF document. @@ -116,8 +127,7 @@ namespace UglyToad.PdfPig.Writer { var font = TrueTypeFontParser.Parse(new TrueTypeDataBytes(new ByteArrayInputBytes(fontFileBytes))); var id = Guid.NewGuid(); - var i = fonts.Count; - var added = new AddedFont(id, NameToken.Create($"F{i}")); + var added = new AddedFont(id, NameToken.Create($"F{fontId++}")); fonts[id] = new FontStored(added, new TrueTypeWritingFont(font, fontFileBytes)); return added; @@ -141,7 +151,7 @@ namespace UglyToad.PdfPig.Writer } var id = Guid.NewGuid(); - var name = NameToken.Create($"F{fonts.Count}"); + var name = NameToken.Create($"F{fontId++}"); var added = new AddedFont(id, name); fonts[id] = new FontStored(added, new Standard14WritingFont(Standard14.GetAdobeFontMetrics(type))); @@ -259,6 +269,11 @@ namespace UglyToad.PdfPig.Writer context.WriteObject(memory, streamToken, image.Value.ObjectNumber); } + foreach (var tokenSet in unwrittenTokens) + { + context.WriteObject(memory, tokenSet.Value, (int)tokenSet.Key.Data.ObjectNumber); + } + var procSet = new List { NameToken.Create("PDF"), @@ -278,9 +293,7 @@ namespace UglyToad.PdfPig.Writer var fontsDictionary = new DictionaryToken(fontsWritten.Select(x => (fonts[x.Key].FontKey.Name, (IToken)new IndirectReferenceToken(x.Value.Number))) .ToDictionary(x => x.Item1, x => x.Item2)); - var fontsDictionaryRef = context.WriteObject(memory, fontsDictionary); - - resources.Add(NameToken.Font, new IndirectReferenceToken(fontsDictionaryRef.Number)); + resources.Add(NameToken.Font, fontsDictionary); } var reserved = context.ReserveNumber(); @@ -301,8 +314,24 @@ 
namespace UglyToad.PdfPig.Writer { foreach (var kvp in page.Value.Resources) { - // TODO: combine resources if value is dictionary or array, otherwise overwrite. - individualResources[kvp.Key] = kvp.Value; + var value = kvp.Value; + if (individualResources.TryGetValue(kvp.Key, out var pageToken)) + { + if (pageToken is DictionaryToken leftDictionary && value is DictionaryToken rightDictionary) + { + var merged = leftDictionary.Data.ToDictionary(k => NameToken.Create(k.Key), v => v.Value); + foreach (var set in rightDictionary.Data) + { + merged[NameToken.Create(set.Key)] = set.Value; + } + + value = new DictionaryToken(merged); + + } + // Else override + } + + individualResources[kvp.Key] = value; } } @@ -392,6 +421,75 @@ namespace UglyToad.PdfPig.Writer } } + /// + /// Deep-copies a token read from another document. Every indirect reference is resolved through the source scanner, the referenced object is copied and queued to be written to the new document, + /// and the reference is replaced with one that uses a newly reserved object number + /// + /// Token to copy and inspect for references + /// Scanner used to read the referenced objects from the original document + /// The copied token with every indirect reference updated; other token kinds are returned unchanged, except object tokens, which are not supported + internal IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner) + { + // These tokens need to be deep copied because they could contain references, which have to be updated. 
+ switch (tokenToCopy) + { + case DictionaryToken dictionaryToken: + { + var newContent = new Dictionary(); + foreach (var setPair in dictionaryToken.Data) + { + var name = setPair.Key; + var token = setPair.Value; + newContent.Add(NameToken.Create(name), CopyToken(token, tokenScanner)); + } + + return new DictionaryToken(newContent); + } + case ArrayToken arrayToken: + { + var newArray = new List(arrayToken.Length); + foreach (var token in arrayToken.Data) + { + newArray.Add(CopyToken(token, tokenScanner)); + } + + return new ArrayToken(newArray); + } + case IndirectReferenceToken referenceToken: + { + var tokenObject = DirectObjectFinder.Get(referenceToken.Data, tokenScanner); + + Debug.Assert(!(tokenObject is IndirectReferenceToken)); + + var newToken = CopyToken(tokenObject, tokenScanner); + + var reserved = context.ReserveNumber(); + var newReference = new IndirectReferenceToken(new IndirectReference(reserved, 0)); + + unwrittenTokens.Add(newReference, newToken); + + return newReference; + } + case StreamToken streamToken: + { + var properties = CopyToken(streamToken.StreamDictionary, tokenScanner) as DictionaryToken; + Debug.Assert(properties != null); + + var bytes = streamToken.Data; + return new StreamToken(properties, bytes); + } + + case ObjectToken _: + { + // Since we don't write token directly to the stream. + // We can't know the offset. 
Therefore the token would be invalid + throw new NotSupportedException("Copying a Object token is not supported"); + } + } + + return tokenToCopy; + } + private static StreamToken WriteContentStream(IReadOnlyList content) { using (var memoryStream = new MemoryStream()) diff --git a/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs index 1eaeef1d..96ce9077 100644 --- a/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs @@ -15,7 +15,9 @@ using Images; using System; using System.Collections.Generic; + using System.Diagnostics; using System.IO; + using System.Linq; using PdfFonts; using Tokens; using Graphics.Operations.PathPainting; @@ -228,7 +230,7 @@ throw new ArgumentNullException(nameof(text)); } - if (!documentBuilder.Fonts.TryGetValue(font.Id, out var fontProgram)) + if (!documentBuilder.Fonts.TryGetValue(font.Id, out var fontStore)) { throw new ArgumentException($"No font has been added to the PdfDocumentBuilder with Id: {font.Id}. " + $"Use {nameof(documentBuilder.AddTrueTypeFont)} to register a font.", nameof(font)); @@ -239,6 +241,8 @@ throw new ArgumentOutOfRangeException(nameof(fontSize), "Font size must be greater than 0"); } + var fontProgram = fontStore.FontProgram; + var fm = fontProgram.GetFontMatrix(); var textMatrix = TransformationMatrix.FromValues(1, 0, 0, 1, position.X, position.Y); @@ -271,7 +275,7 @@ throw new ArgumentNullException(nameof(text)); } - if (!documentBuilder.Fonts.TryGetValue(font.Id, out var fontProgram)) + if (!documentBuilder.Fonts.TryGetValue(font.Id, out var fontStore)) { throw new ArgumentException($"No font has been added to the PdfDocumentBuilder with Id: {font.Id}. 
" + $"Use {nameof(documentBuilder.AddTrueTypeFont)} to register a font.", nameof(font)); @@ -282,6 +286,8 @@ throw new ArgumentOutOfRangeException(nameof(fontSize), "Font size must be greater than 0"); } + var fontProgram = fontStore.FontProgram; + var fm = fontProgram.GetFontMatrix(); var textMatrix = TransformationMatrix.FromValues(1, 0, 0, 1, position.X, position.Y); @@ -497,6 +503,169 @@ return new AddedImage(reference, png.Width, png.Height); } + /// + /// Copies the content stream operations and the resources of a page from another document onto this page + /// + /// Page to be copied + public void CopyFrom(Page srcPage) + { + ContentStream destinationStream = null; + if (CurrentStream.Operations.Count > 0) + { + NewContentStreamAfter(); + } + + destinationStream = CurrentStream; + + if (!srcPage.Dictionary.TryGet(NameToken.Resources, srcPage.pdfScanner, out DictionaryToken srcResourceDictionary)) + { + // If the page doesn't have resources, then we copy the entire content stream, since no operation would collide + // with the ones already written + destinationStream.Operations.AddRange(srcPage.Operations); + return; + } + + // TODO: How should we handle any other token in the page dictionary (Eg. 
LastModified, MediaBox, CropBox, BleedBox, TrimBox, ArtBox, + // BoxColorInfo, Rotate, Group, Thumb, B, Dur, Trans, Annots, AA, Metadata, PieceInfo, StructParents, ID, PZ, SeparationInfo, Tabs, + // TemplateInstantiated, PresSteps, UserUnit, VP) + + var operations = new List(srcPage.Operations); + + // We need to relocate the resources, and we have to make sure that none of the resources collide with + // the already written operation's resources + + foreach (var set in srcResourceDictionary.Data) + { + var nameToken = NameToken.Create(set.Key); + if (nameToken == NameToken.Font || nameToken == NameToken.Xobject) + { + // We have to skip these two because we have a separate dictionary for them + continue; + } + + if (!resourcesDictionary.TryGetValue(nameToken, out var currentToken)) + { + // It means that this type of resource doesn't currently exist in the page, so we can copy it + // with no problem + resourcesDictionary[nameToken] = documentBuilder.CopyToken(set.Value, srcPage.pdfScanner); + continue; + } + + // TODO: I need to find a test case; until then a resource type that already exists on this page is NOT copied or merged + // It would have ExtendedGraphics or colorspaces, etc... 
+ } + + // Special cases + // Since we don't directly add fonts to the page's resources, we have to go look at the document's fonts + if(srcResourceDictionary.TryGet(NameToken.Font, srcPage.pdfScanner, out DictionaryToken fontsDictionary)) + { + Dictionary pageFontsDictionary = null; + if (resourcesDictionary.TryGetValue(NameToken.Font, out var pageFontsToken)) + { + pageFontsDictionary = (pageFontsToken as DictionaryToken)?.Data.ToDictionary(k => NameToken.Create(k.Key), v => v.Value); + Debug.Assert(pageFontsDictionary != null); + } + else + { + pageFontsDictionary = new Dictionary(); + } + + foreach (var fontSet in fontsDictionary.Data) + { + var fontName = fontSet.Key; + var addedFont = documentBuilder.Fonts.Values.FirstOrDefault(f => f.FontKey.Name.Data == fontName); + if (addedFont != default) + { + // This would mean that the imported font collides with one of the added fonts, so we have to rename it + + var newName = $"F{documentBuilder.fontId++}"; + + // Update all the pertinent SetFontAndSize operations to use the new name + operations = operations.Select(op => + { + if (!(op is SetFontAndSize fontAndSizeOperation)) + { + return op; + } + + if (fontAndSizeOperation.Font.Data == fontName) + { + return new SetFontAndSize(NameToken.Create(newName), fontAndSizeOperation.Size); + } + + return op; + }).ToList(); + + fontName = newName; + } + + if (!(fontSet.Value is IndirectReferenceToken fontReferenceToken)) + { + throw new PdfDocumentFormatException($"Expected a IndirectReferenceToken for the font, got a {fontSet.Value.GetType().Name}"); + } + + pageFontsDictionary.Add(NameToken.Create(fontName), documentBuilder.CopyToken(fontReferenceToken, srcPage.pdfScanner)); + } + + resourcesDictionary[NameToken.Font] = new DictionaryToken(pageFontsDictionary); + } + + // Since we don't directly add xobjects to the page's resources, we have to go look at the document's xobjects + if (srcResourceDictionary.TryGet(NameToken.Xobject, srcPage.pdfScanner, out DictionaryToken 
xobjectsDictionary)) + { + Dictionary pageXobjectsDictionary = null; + if (resourcesDictionary.TryGetValue(NameToken.Xobject, out var pageXobjectToken)) + { + pageXobjectsDictionary = (pageXobjectToken as DictionaryToken)?.Data.ToDictionary(k => NameToken.Create(k.Key), v => v.Value); + Debug.Assert(pageXobjectsDictionary != null); + } + else + { + pageXobjectsDictionary = new Dictionary(); + } + + var xobjectNamesUsed = Enumerable.Range(0, imageKey).Select(i => $"I{i}"); + foreach (var xobjectSet in xobjectsDictionary.Data) + { + var xobjectName = xobjectSet.Key; + if (xobjectName[0] == 'I' && xobjectNamesUsed.Any(s => s == xobjectName)) + { + // This would mean that the imported xobject collides with one of the added images, so we have to rename it + var newName = $"I{imageKey++}"; + + // Update all the pertinent InvokeNamedXObject operations to use the new name + operations = operations.Select(op => + { + if (!(op is InvokeNamedXObject invokeNamedOperation)) + { + return op; + } + + if (invokeNamedOperation.Name.Data == xobjectName) + { + return new InvokeNamedXObject(NameToken.Create(newName)); + } + + return op; + }).ToList(); + + xobjectName = newName; + } + + if (!(xobjectSet.Value is IndirectReferenceToken fontReferenceToken)) + { + throw new PdfDocumentFormatException($"Expected a IndirectReferenceToken for the XObject, got a {xobjectSet.Value.GetType().Name}"); + } + + pageXobjectsDictionary.Add(NameToken.Create(xobjectName), documentBuilder.CopyToken(fontReferenceToken, srcPage.pdfScanner)); + } + + resourcesDictionary[NameToken.Xobject] = new DictionaryToken(pageXobjectsDictionary); + } + + destinationStream.Operations.AddRange(operations); + } + private List DrawLetters(string text, IWritingFont font, TransformationMatrix fontMatrix, decimal fontSize, TransformationMatrix textMatrix) { var horizontalScaling = 1;