clean up pagebuilder, switch merger to use pdfdocumentbuilder

This commit is contained in:
Plaisted
2021-02-08 12:37:09 -06:00
parent ca0b90523e
commit 6e1cf89cf9
5 changed files with 242 additions and 522 deletions

View File

@@ -865,8 +865,8 @@
using (var document = PdfDocument.Open(result, ParsingOptions.LenientParsingOff))
{
Assert.Equal(1, document.NumberOfPages);
var pg = document.GetPage(1);
// single empty page should result in single content stream
var pg = document.GetPage(1);
// single empty page should result in single content stream
Assert.NotNull(pg.Dictionary.Data[NameToken.Contents] as IndirectReferenceToken);
}
}
@@ -925,7 +925,7 @@
var pg = document.GetPage(1);
// multiple streams should be written to array
var streams = pg.Dictionary.Data[NameToken.Contents] as ArrayToken;
Assert.NotNull(streams);
Assert.NotNull(streams);
Assert.Equal(2, streams.Length);
}
}

View File

@@ -74,13 +74,24 @@ namespace UglyToad.PdfPig.Writer
context.InitializePdf(1.7m);
}
/// <summary>
/// Creates a document builder that keeps the generated PDF in memory
/// and writes the requested version into the file header.
/// </summary>
/// <param name="version">Pdf version to use in header.</param>
public PdfDocumentBuilder(decimal version)
{
    // The builder creates this buffer itself; the second argument occupies the same
    // position as the disposeStream flag passed by the stream-based constructor.
    var buffer = new MemoryStream();
    context = new PdfStreamWriter(buffer, true);
    context.InitializePdf(version);
}
/// <summary>
/// Creates a document builder using the supplied stream.
/// </summary>
/// <param name="stream">Stream to write pdf to.</param>
/// <param name="disposeStream">If stream should be disposed when builder is.</param>
/// <param name="type">Type of pdf stream writer to use</param>
public PdfDocumentBuilder(Stream stream, bool disposeStream=false, PdfWriterType type=PdfWriterType.Default)
/// <param name="version">Pdf version to use in header.</param>
public PdfDocumentBuilder(Stream stream, bool disposeStream=false, PdfWriterType type=PdfWriterType.Default, decimal version=1.7m)
{
switch (type)
{
@@ -91,7 +102,7 @@ namespace UglyToad.PdfPig.Writer
context = new PdfStreamWriter(stream, disposeStream);
break;
}
context.InitializePdf(1.7m);
context.InitializePdf(version);
}
/// <summary>
@@ -287,7 +298,7 @@ namespace UglyToad.PdfPig.Writer
/// <param name="pageNumber">Page to copy.</param>
/// <returns>A builder for editing the page.</returns>
public PdfPageBuilder AddPage(PdfDocument document, int pageNumber)
{
{
if (!existingCopies.TryGetValue(document.Structure.TokenScanner, out var refs))
{
refs = new Dictionary<IndirectReference, IndirectReferenceToken>();
@@ -372,23 +383,12 @@ namespace UglyToad.PdfPig.Writer
WriterUtil.CopyToken(context, kvp.Value, document.Structure.TokenScanner, refs);
}
var builder = new PdfPageBuilder(pages.Count + 1, this, streams, resources, copiedPageDict);
if (resources.TryGetValue(NameToken.Font, out var fonts))
{
var existingFontDict = fonts as DictionaryToken;
foreach (var item in existingFontDict.Data)
{
var key = NameToken.Create(item.Key);
builder.fontDictionary[key] = item.Value;
}
resources.Remove(NameToken.Font);
}
copiedPageDict[NameToken.Resources] = new DictionaryToken(resources);
var builder = new PdfPageBuilder(pages.Count + 1, this, streams, copiedPageDict);
pages[builder.PageNumber] = builder;
return builder;
void CopyResourceDict(IToken token, Dictionary<NameToken, IToken> destinationDict)
{
DictionaryToken dict = GetRemoteDict(token);
@@ -483,7 +483,7 @@ namespace UglyToad.PdfPig.Writer
foreach (var page in pages)
{
var pageDictionary = page.Value.additionalPageProperties;
var pageDictionary = page.Value.pageDictionary;
pageDictionary[NameToken.Type] = NameToken.Page;
pageDictionary[NameToken.Parent] = leafRefs[leafNum];
pageDictionary[NameToken.ProcSet] = new ArrayToken(procSet);
@@ -492,29 +492,19 @@ namespace UglyToad.PdfPig.Writer
pageDictionary[NameToken.MediaBox] = RectangleToArray(page.Value.PageSize);
}
// combine existing resources (if any) with added
var pageResources = new Dictionary<NameToken, IToken>();
foreach (var existing in page.Value.Resources)
{
pageResources[existing.Key] = existing.Value;
}
pageResources[NameToken.Font] = new DictionaryToken(page.Value.fontDictionary);
pageDictionary[NameToken.Resources] = new DictionaryToken(pageResources);
var toWrite = page.Value.contentStreams.Where(x => x.HasContent).ToList();
if (toWrite.Count == 0)
{
{
// write empty
pageDictionary[NameToken.Contents] = new PdfPageBuilder.DefaultContentStream().Write(context);
}
else if (toWrite.Count == 1)
{
{
// write single
pageDictionary[NameToken.Contents] = toWrite[0].Write(context);
}
else
{
else
{
// write array
var streams = new List<IToken>();
foreach (var stream in toWrite)

View File

@@ -2,22 +2,10 @@
{
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using Content;
using Core;
using CrossReference;
using Encryption;
using Filters;
using Logging;
using Parser;
using Parser.FileStructure;
using Parser.Parts;
using Tokenization.Scanner;
using Tokens;
using Exceptions;
using System.Linq;
using Util;
/// <summary>
/// Merges PDF documents into each other.
@@ -48,9 +36,9 @@
_ = file1 ?? throw new ArgumentNullException(nameof(file1));
_ = file2 ?? throw new ArgumentNullException(nameof(file2));
using (var stream1 = new StreamInputBytes(File.OpenRead(file1)))
using (var stream1 = File.OpenRead(file1))
{
using (var stream2 = new StreamInputBytes(File.OpenRead(file2)))
using (var stream2 = File.OpenRead(file2))
{
Merge(new[] { stream1, stream2 }, output, new[] { file1Selection, file2Selection });
}
@@ -74,13 +62,13 @@
/// </summary>
public static void Merge(Stream output, params string[] filePaths)
{
var streams = new List<StreamInputBytes>(filePaths.Length);
var streams = new List<Stream>(filePaths.Length);
try
{
for (var i = 0; i < filePaths.Length; i++)
{
var filePath = filePaths[i] ?? throw new ArgumentNullException(nameof(filePaths), $"Null filepath at index {i}.");
streams.Add(new StreamInputBytes(File.OpenRead(filePath), true));
streams.Add(File.OpenRead(filePath));
}
Merge(streams, output, null);
@@ -103,7 +91,7 @@
using (var output = new MemoryStream())
{
Merge(files.Select(f => new ByteArrayInputBytes(f)).ToArray(), output, pagesBundle);
Merge(files.Select(f => PdfDocument.Open(f)).ToArray(), output, pagesBundle);
return output.ToArray();
}
}
@@ -122,317 +110,39 @@
_ = streams ?? throw new ArgumentNullException(nameof(streams));
_ = output ?? throw new ArgumentNullException(nameof(output));
Merge(streams.Select(f => new StreamInputBytes(f, false)).ToArray(), output, pagesBundle);
Merge(streams.Select(f => PdfDocument.Open(f)).ToArray(), output, pagesBundle);
}
private static void Merge(IReadOnlyList<IInputBytes> files, Stream output, IReadOnlyList<IReadOnlyList<int>> pagesBundle)
private static void Merge(IReadOnlyList<PdfDocument> files, Stream output, IReadOnlyList<IReadOnlyList<int>> pagesBundle)
{
const bool isLenientParsing = false;
var writer = new PdfStreamWriter(output, false);
var documentBuilder = new DocumentMerger(writer);
var maxVersion = 1.2m;
var infos = new List<(CoreTokenScanner CoreScanner, HeaderVersion Version)>();
foreach (var fileIndex in Enumerable.Range(0, files.Count))
var maxVersion = files.Select(x=>x.Version).Max();
using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, maxVersion))
{
var inputBytes = files[fileIndex];
var coreScanner = new CoreTokenScanner(inputBytes);
var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log);
maxVersion = Math.Max(maxVersion, version.Version);
infos.Add((coreScanner, version));
}
writer.InitializePdf(maxVersion);
foreach (var fileIndex in Enumerable.Range(0, files.Count))
{
IReadOnlyList<int> pages = null;
if (pagesBundle != null && fileIndex < pagesBundle.Count)
foreach (var fileIndex in Enumerable.Range(0, files.Count))
{
pages = pagesBundle[fileIndex];
}
var inputBytes = files[fileIndex];
var (coreScanner, version) = infos[fileIndex];
var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log),
new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider));
CrossReferenceTable crossReference = null;
// ReSharper disable once AccessToModifiedClosure
var locationProvider = new ObjectLocationProvider(() => crossReference, inputBytes);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, FilterProvider, NoOpEncryptionHandler.Instance);
var crossReferenceOffset = FileTrailerParser.GetFirstCrossReferenceOffset(inputBytes, coreScanner, isLenientParsing);
crossReference = crossReferenceParser.Parse(inputBytes, isLenientParsing, crossReferenceOffset, version.OffsetInFile, pdfScanner, coreScanner);
var catalogDictionaryToken = ParseCatalog(crossReference, pdfScanner, out var encryptionDictionary);
if (encryptionDictionary != null)
{
throw new PdfDocumentEncryptedException("Unable to merge document with password");
}
var documentCatalog = CatalogFactory.Create(crossReference.Trailer.Root, catalogDictionaryToken, pdfScanner, isLenientParsing);
documentBuilder.AppendDocument(documentCatalog, pdfScanner, pages);
}
documentBuilder.Build();
}
// This method is a basically a copy of the method UglyToad.PdfPig.Parser.PdfDocumentFactory.ParseTrailer()
private static DictionaryToken ParseCatalog(CrossReferenceTable crossReferenceTable,
IPdfTokenScanner pdfTokenScanner,
out EncryptionDictionary encryptionDictionary)
{
encryptionDictionary = null;
if (crossReferenceTable.Trailer.EncryptionToken != null)
{
if (!DirectObjectFinder.TryGet(crossReferenceTable.Trailer.EncryptionToken, pdfTokenScanner,
out DictionaryToken encryptionDictionaryToken))
{
throw new PdfDocumentFormatException($"Unrecognized encryption token in trailer: {crossReferenceTable.Trailer.EncryptionToken}.");
}
encryptionDictionary = EncryptionDictionaryFactory.Read(encryptionDictionaryToken, pdfTokenScanner);
}
var rootDictionary = DirectObjectFinder.Get<DictionaryToken>(crossReferenceTable.Trailer.Root, pdfTokenScanner);
if (!rootDictionary.ContainsKey(NameToken.Type))
{
rootDictionary = rootDictionary.With(NameToken.Type, NameToken.Catalog);
}
return rootDictionary;
}
private class DocumentMerger
{
private const int ARTIFICIAL_NODE_LIMIT = 100;
private readonly IPdfStreamWriter context;
private readonly List<IndirectReferenceToken> pagesTokenReferences = new List<IndirectReferenceToken>();
private readonly IndirectReferenceToken rootPagesReference;
private int pageCount = 0;
public DocumentMerger(IPdfStreamWriter writer)
{
context = writer;
rootPagesReference = context.ReserveObjectNumber();
}
public void AppendDocument(Catalog catalog, IPdfTokenScanner tokenScanner, IReadOnlyList<int> pages)
{
IEnumerable<int> pageIndices;
if (pages == null)
{
var pagesCount = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
if (pagesCount < 1)
var existing = files[fileIndex];
IReadOnlyList<int> pages = null;
if (pagesBundle != null && fileIndex < pagesBundle.Count)
{
return;
pages = pagesBundle[fileIndex];
}
pageIndices = Enumerable.Range(1, pagesCount);
}
else if (pages.Count < 1)
{
return;
}
else
{
pageIndices = pages;
}
var referencesFromDocument = new Dictionary<IndirectReference, IndirectReferenceToken>();
var currentNodeReference = context.ReserveObjectNumber();
var pagesReferences = new List<IndirectReferenceToken>();
var resources = new Dictionary<string, IToken>();
bool DoesAEntryCollide(PageTreeNode node)
{
while (node != null)
if (pages == null)
{
var dictionary = node.NodeDictionary;
if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary))
for (var i = 1; i <= existing.NumberOfPages; i++)
{
var nonCollidingResources = resourcesDictionary.Data.Keys.Except(resources.Keys);
if (nonCollidingResources.Count() != resourcesDictionary.Data.Count)
{
// This means that at least one of the resources collided
return true;
}
// Copy the page the loop is currently on; a literal 1 here would append
// the first page of the source document once per iteration.
document.AddPage(existing, i);
}
/* TODO: How to handle?
* `Rotate`
* `CropBox`
* `MediaBox`
*/
// No colliding entry was found, in this node
// Keep walking up into the tree
node = node.Parent;
}
return false;
}
void CopyEntries(PageTreeNode node)
{
while (node != null)
} else
{
var dictionary = node.NodeDictionary;
if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary))
foreach (var i in pages)
{
foreach (var pair in resourcesDictionary.Data)
{
resources.Add(pair.Key, CopyToken(pair.Value, tokenScanner, referencesFromDocument));
}
// Copy the page number requested by the caller's selection; a literal 1 here
// would ignore the selection and append the first page repeatedly.
document.AddPage(existing, i);
}
/* TODO: How to handle?
* `Rotate`
* `CropBox`
* `MediaBox`
*/
// Keep walking up into the tree
node = node.Parent;
}
}
void CreateTree()
{
if (pagesReferences.Count < 1)
{
throw new InvalidOperationException("Pages reference should always be more than 1 when executing this function");
}
var newPagesNode = new Dictionary<NameToken, IToken>
{
{ NameToken.Type, NameToken.Pages },
{ NameToken.Kids, new ArrayToken(pagesReferences) },
{ NameToken.Count, new NumericToken(pagesReferences.Count) },
{ NameToken.Parent, rootPagesReference }
};
if (resources.Count > 0)
{
newPagesNode.Add(NameToken.Resources, DictionaryToken.With(resources));
}
var pagesDictionary = new DictionaryToken(newPagesNode);
context.WriteToken(pagesDictionary, currentNodeReference);
pagesTokenReferences.Add(currentNodeReference);
pageCount += pagesReferences.Count;
};
foreach (var pageIndex in pageIndices)
{
var pageNode = catalog.GetPageNode(pageIndex);
if (pagesReferences.Count >= ARTIFICIAL_NODE_LIMIT || DoesAEntryCollide(pageNode))
{
CreateTree();
currentNodeReference = context.ReserveObjectNumber();
pagesReferences = new List<IndirectReferenceToken>();
resources = new Dictionary<string, IToken>();
}
CopyEntries(pageNode.Parent);
pagesReferences.Add(CopyPageNode(pageNode, currentNodeReference, tokenScanner, referencesFromDocument));
}
if (pagesReferences.Count < 1)
{
throw new InvalidOperationException("Pages reference couldn't be less than 1 because we have reserved a indirect reference token");
}
CreateTree();
}
public void Build()
{
if (pagesTokenReferences.Count < 1)
{
throw new PdfDocumentFormatException("Empty document");
}
var pagesDictionary = new DictionaryToken(new Dictionary<NameToken, IToken>
{
{ NameToken.Type, NameToken.Pages },
{ NameToken.Kids, new ArrayToken(pagesTokenReferences) },
{ NameToken.Count, new NumericToken(pageCount) }
});
var pagesRef = context.WriteToken(pagesDictionary, rootPagesReference);
var catalog = new DictionaryToken(new Dictionary<NameToken, IToken>
{
{ NameToken.Type, NameToken.Catalog },
{ NameToken.Pages, pagesRef }
});
var catalogRef = context.WriteToken(catalog);
context.CompletePdf(catalogRef);
Close();
}
public void Close()
{
context.Dispose();
}
private IndirectReferenceToken CopyPageNode(PageTreeNode pageNode, IndirectReferenceToken parentPagesObject, IPdfTokenScanner tokenScanner,
IDictionary<IndirectReference, IndirectReferenceToken> referencesFromDocument)
{
Debug.Assert(pageNode.IsPage);
var pageDictionary = new Dictionary<NameToken, IToken>
{
{NameToken.Parent, parentPagesObject},
};
foreach (var setPair in pageNode.NodeDictionary.Data)
{
var name = setPair.Key;
var token = setPair.Value;
if (name == NameToken.Parent)
{
// Skip Parent token, since we have to reassign it
continue;
}
pageDictionary.Add(NameToken.Create(name), CopyToken(token, tokenScanner, referencesFromDocument));
}
return context.WriteToken(new DictionaryToken(pageDictionary));
}
/// <summary>
/// The purpose of this method is to resolve indirect reference. That mean copy the reference's content to the new document's stream
/// and replace the indirect reference with the correct/new one
/// </summary>
/// <param name="tokenToCopy">Token to inspect for reference</param>
/// <param name="tokenScanner">scanner get the content from the original document</param>
/// <param name="referencesFromDocument">Map of previously copied</param>
/// <returns>A reference of the token that was copied. With all the reference updated</returns>
private IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner, IDictionary<IndirectReference, IndirectReferenceToken> referencesFromDocument)
{
return WriterUtil.CopyToken(context, tokenToCopy, tokenScanner, referencesFromDocument);
}
}
}
}

View File

@@ -27,23 +27,27 @@
/// A builder used to construct a page in a PDF document.
/// </summary>
public class PdfPageBuilder
{
private readonly PdfDocumentBuilder documentBuilder;
private IPageContentStream currentStream;
{
// parent
private readonly PdfDocumentBuilder documentBuilder;
// all page data other than content streams
internal readonly Dictionary<NameToken, IToken> pageDictionary = new Dictionary<NameToken, IToken>();
// streams
internal readonly List<IPageContentStream> contentStreams;
internal readonly Dictionary<NameToken, IToken> additionalPageProperties = new Dictionary<NameToken, IToken>();
private readonly Dictionary<NameToken, IToken> resourcesDictionary = new Dictionary<NameToken, IToken>();
internal Dictionary<NameToken, IToken> fontDictionary = new Dictionary<NameToken, IToken>();
internal int nextFontId = 1;
private IPageContentStream currentStream;
// maps fonts added using PdfDocumentBuilder to page font names
private readonly Dictionary<Guid, NameToken> documentFonts = new Dictionary<Guid, NameToken>();
internal int nextFontId = 1;
// a sequence number of the ShowText operation, used to determine whether letters belong to the same operation or not (letters that belong to different operations have fewer chances of belonging to the same word)
private int textSequence;
private int imageKey = 1;
internal IReadOnlyDictionary<NameToken, IToken> Resources => resourcesDictionary;
internal IReadOnlyDictionary<string, IToken> Resources => pageDictionary.GetOrCreateDict(NameToken.Resources);
/// <summary>
/// The number of this page, 1-indexed.
@@ -75,16 +79,15 @@
}
internal PdfPageBuilder(int number, PdfDocumentBuilder documentBuilder, IEnumerable<CopiedContentStream> copied,
Dictionary<NameToken, IToken> existingResources, Dictionary<NameToken, IToken> pageDict)
Dictionary<NameToken, IToken> pageDict)
{
this.documentBuilder = documentBuilder ?? throw new ArgumentNullException(nameof(documentBuilder));
PageNumber = number;
pageDictionary = pageDict;
contentStreams = new List<IPageContentStream>();
contentStreams.AddRange(copied);
currentStream = new DefaultContentStream();
contentStreams.Add(currentStream);
additionalPageProperties =pageDict ?? new Dictionary<NameToken, IToken>();
resourcesDictionary = existingResources;
}
/// <summary>
@@ -343,13 +346,15 @@
if (!documentFonts.TryGetValue(font.Id, out NameToken value))
{
value = NameToken.Create($"F{nextFontId++}");
while (fontDictionary.ContainsKey(value))
var resources = pageDictionary.GetOrCreateDict(NameToken.Resources);
var fonts = resources.GetOrCreateDict(NameToken.Font);
while (fonts.ContainsKey(value))
{
value = NameToken.Create($"F{nextFontId++}");
}
documentFonts[font.Id] = value;
fontDictionary[value] = font.Reference;
fonts[value] = font.Reference;
}
return value;
@@ -395,17 +400,11 @@
};
var reference = documentBuilder.AddImage(new DictionaryToken(imgDictionary), data);
if (!resourcesDictionary.TryGetValue(NameToken.Xobject, out var xobjectsDict)
|| !(xobjectsDict is DictionaryToken xobjects))
{
xobjects = new DictionaryToken(new Dictionary<NameToken, IToken>());
resourcesDictionary[NameToken.Xobject] = xobjects;
}
var resources = pageDictionary.GetOrCreateDict(NameToken.Resources);
var xObjects = resources.GetOrCreateDict(NameToken.Xobject);
var key = NameToken.Create($"I{imageKey++}");
resourcesDictionary[NameToken.Xobject] = xobjects.With(key, reference);
xObjects[key] = reference;
currentStream.Add(Push.Value);
// This needs to be the placement rectangle.
@@ -435,16 +434,11 @@
/// </summary>
public void AddImage(AddedImage image, PdfRectangle placementRectangle)
{
if (!resourcesDictionary.TryGetValue(NameToken.Xobject, out var xobjectsDict)
|| !(xobjectsDict is DictionaryToken xobjects))
{
xobjects = new DictionaryToken(new Dictionary<NameToken, IToken>());
resourcesDictionary[NameToken.Xobject] = xobjects;
}
var resources = pageDictionary.GetOrCreateDict(NameToken.Resources);
var xObjects = resources.GetOrCreateDict(NameToken.Xobject);
var key = NameToken.Create($"I{imageKey++}");
resourcesDictionary[NameToken.Xobject] = xobjects.With(key, new IndirectReferenceToken(image.Reference));
xObjects[key] = new IndirectReferenceToken(image.Reference);
currentStream.Add(Push.Value);
// This needs to be the placement rectangle.
@@ -513,16 +507,12 @@
var reference = documentBuilder.AddImage(new DictionaryToken(imgDictionary), compressed);
if (!resourcesDictionary.TryGetValue(NameToken.Xobject, out var xobjectsDict)
|| !(xobjectsDict is DictionaryToken xobjects))
{
xobjects = new DictionaryToken(new Dictionary<NameToken, IToken>());
resourcesDictionary[NameToken.Xobject] = xobjects;
}
var resources = pageDictionary.GetOrCreateDict(NameToken.Resources);
var xObjects = resources.GetOrCreateDict(NameToken.Xobject);
var key = NameToken.Create($"I{imageKey++}");
resourcesDictionary[NameToken.Xobject] = xobjects.With(key, reference);
xObjects[key] = reference;
currentStream.Add(Push.Value);
// This needs to be the placement rectangle.
@@ -568,6 +558,8 @@
// We need to relocate the resources, and we have to make sure that none of the resources collide with
// the already written operation's resources
var resources = pageDictionary.GetOrCreateDict(NameToken.Resources);
foreach (var set in srcResourceDictionary.Data)
{
var nameToken = NameToken.Create(set.Key);
@@ -577,11 +569,11 @@
continue;
}
if (!resourcesDictionary.TryGetValue(nameToken, out var currentToken))
if (!resources.ContainsKey(nameToken))
{
// It means that this type of resources doesn't currently exist in the page, so we can copy it
// with no problem
resourcesDictionary[nameToken] = documentBuilder.CopyToken(srcPage.pdfScanner, set.Value);
resources[nameToken] = documentBuilder.CopyToken(srcPage.pdfScanner, set.Value);
continue;
}
@@ -592,26 +584,17 @@
// Special cases
// Since we don't directly add font's to the pages resources, we have to go look at the document's font
if(srcResourceDictionary.TryGet(NameToken.Font, srcPage.pdfScanner, out DictionaryToken fontsDictionary))
{
Dictionary<NameToken, IToken> pageFontsDictionary = null;
if (resourcesDictionary.TryGetValue(NameToken.Font, out var pageFontsToken))
{
pageFontsDictionary = (pageFontsToken as DictionaryToken)?.Data.ToDictionary(k => NameToken.Create(k.Key), v => v.Value);
Debug.Assert(pageFontsDictionary != null);
}
else
{
pageFontsDictionary = new Dictionary<NameToken, IToken>();
}
{
var pageFontsDictionary = resources.GetOrCreateDict(NameToken.Font);
foreach (var fontSet in fontsDictionary.Data)
{
var fontName = NameToken.Create(fontSet.Key);
if (fontDictionary.ContainsKey(fontName))
if (pageFontsDictionary.ContainsKey(fontName))
{
// This would mean that the imported font collide with one of the added font. so we have to rename it
var newName = NameToken.Create($"F{nextFontId++}");
while (fontDictionary.ContainsKey(newName))
while (pageFontsDictionary.ContainsKey(newName))
{
newName = NameToken.Create($"F{nextFontId++}");
}
@@ -642,26 +625,12 @@
pageFontsDictionary.Add(fontName, documentBuilder.CopyToken(srcPage.pdfScanner, fontReferenceToken));
}
foreach (var item in pageFontsDictionary)
{
fontDictionary[item.Key] = item.Value;
}
}
// Since we don't directly add xobjects's to the pages resources, we have to go look at the document's xobjects
if (srcResourceDictionary.TryGet(NameToken.Xobject, srcPage.pdfScanner, out DictionaryToken xobjectsDictionary))
{
Dictionary<NameToken, IToken> pageXobjectsDictionary = null;
if (resourcesDictionary.TryGetValue(NameToken.Xobject, out var pageXobjectToken))
{
pageXobjectsDictionary = (pageXobjectToken as DictionaryToken)?.Data.ToDictionary(k => NameToken.Create(k.Key), v => v.Value);
Debug.Assert(pageXobjectsDictionary != null);
}
else
{
pageXobjectsDictionary = new Dictionary<NameToken, IToken>();
}
var pageXobjectsDictionary = resources.GetOrCreateDict(NameToken.Xobject);
var xobjectNamesUsed = Enumerable.Range(0, imageKey).Select(i => $"I{i}");
foreach (var xobjectSet in xobjectsDictionary.Data)
@@ -696,10 +665,8 @@
throw new PdfDocumentFormatException($"Expected a IndirectReferenceToken for the XObject, got a {xobjectSet.Value.GetType().Name}");
}
pageXobjectsDictionary.Add(NameToken.Create(xobjectName), documentBuilder.CopyToken(srcPage.pdfScanner, fontReferenceToken));
pageXobjectsDictionary[xobjectName] = documentBuilder.CopyToken(srcPage.pdfScanner, fontReferenceToken);
}
resourcesDictionary[NameToken.Xobject] = new DictionaryToken(pageXobjectsDictionary);
}
destinationStream.Operations.AddRange(operations);
@@ -782,10 +749,10 @@
internal interface IPageContentStream : IContentStream
{
bool ReadOnly { get; }
bool ReadOnly { get; }
bool HasContent { get; }
void Add(IGraphicsStateOperation operation);
IndirectReferenceToken Write(IPdfStreamWriter writer);
IndirectReferenceToken Write(IPdfStreamWriter writer);
}
@@ -846,8 +813,8 @@
{
private readonly IndirectReferenceToken token;
public bool ReadOnly => true;
public bool HasContent => true;
public bool HasContent => true;
public CopiedContentStream(IndirectReferenceToken indirectReferenceToken)
{
token = indirectReferenceToken;
@@ -905,5 +872,7 @@
Height = height;
}
}
}
}

View File

@@ -7,132 +7,183 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using Tokenization.Scanner;
using Tokens;
internal class WriterUtil
internal static class WriterUtil
{
/// <summary>
/// The purpose of this method is to resolve indirect reference. That mean copy the reference's content to the new document's stream
/// and replace the indirect reference with the correct/new one
/// </summary>
/// <param name="writer">PDF stream writer</param>
/// <param name="tokenToCopy">Token to inspect for reference</param>
/// <param name="tokenScanner">scanner get the content from the original document</param>
/// <param name="referencesFromDocument">Map of previously copied tokens for original document.</param>
/// <param name="callstack">Call stack of indirect references</param>
/// <returns>A reference of the token that was copied. With all the reference updated</returns>
public static IToken CopyToken(IPdfStreamWriter writer, IToken tokenToCopy, IPdfTokenScanner tokenScanner,
IDictionary<IndirectReference, IndirectReferenceToken> referencesFromDocument, Dictionary<IndirectReference, IndirectReferenceToken> callstack=null)
public static Dictionary<string, IToken> GetOrCreateDict(this Dictionary<NameToken, IToken> dict, NameToken key)
{
if (dict.ContainsKey(key))
{
if (callstack == null)
var item = dict[key];
if (!(item is DictionaryToken dt))
{
callstack = new Dictionary<IndirectReference, IndirectReferenceToken>();
throw new ApplicationException("Expected dictionary token, got " + item.GetType());
}
// This token need to be deep copied, because they could contain reference. So we have to update them.
switch (tokenToCopy)
if (dt.Data is Dictionary<string, IToken> mutable)
{
case DictionaryToken dictionaryToken:
return mutable;
}
mutable = dt.Data.
ToDictionary(x => x.Key, x => x.Value);
dict[key] = DictionaryToken.With(mutable);
return mutable;
}
var created = new Dictionary<string, IToken>();
dict[key] = DictionaryToken.With(created);
return created;
}
/// <summary>
/// Returns a mutable dictionary for the entry stored under <paramref name="key"/>,
/// adding a new empty dictionary entry when the key is absent.
/// </summary>
/// <param name="dict">Dictionary that holds (or will hold) the nested dictionary token.</param>
/// <param name="key">Name of the nested entry.</param>
/// <returns>A mutable dictionary associated with the entry under <paramref name="key"/>.</returns>
/// <exception cref="ApplicationException">The existing entry is not a <c>DictionaryToken</c>.</exception>
public static Dictionary<string, IToken> GetOrCreateDict(this Dictionary<string, IToken> dict, string key)
{
    if (!dict.TryGetValue(key, out var current))
    {
        // Nothing stored yet: register a fresh dictionary and hand it back.
        var fresh = new Dictionary<string, IToken>();
        dict[key] = DictionaryToken.With(fresh);
        return fresh;
    }

    var nested = current as DictionaryToken;
    if (nested == null)
    {
        throw new ApplicationException("Expected dictionary token, got " + current.GetType());
    }

    var editable = nested.Data as Dictionary<string, IToken>;
    if (editable == null)
    {
        // The token's data is not a plain Dictionary, so copy it into one and
        // replace the stored token with one built from the copy.
        // NOTE(review): presumably DictionaryToken.With keeps the supplied dictionary
        // rather than copying it, so later edits show through the token - confirm.
        editable = nested.Data.ToDictionary(pair => pair.Key, pair => pair.Value);
        dict[key] = DictionaryToken.With(editable);
    }

    return editable;
}
/// <summary>
/// The purpose of this method is to resolve indirect reference. That mean copy the reference's content to the new document's stream
/// and replace the indirect reference with the correct/new one
/// </summary>
/// <param name="writer">PDF stream writer</param>
/// <param name="tokenToCopy">Token to inspect for reference</param>
/// <param name="tokenScanner">scanner get the content from the original document</param>
/// <param name="referencesFromDocument">Map of previously copied tokens for original document.</param>
/// <param name="callstack">Call stack of indirect references</param>
/// <returns>A reference of the token that was copied. With all the reference updated</returns>
public static IToken CopyToken(IPdfStreamWriter writer, IToken tokenToCopy, IPdfTokenScanner tokenScanner,
IDictionary<IndirectReference, IndirectReferenceToken> referencesFromDocument, Dictionary<IndirectReference, IndirectReferenceToken> callstack=null)
{
if (callstack == null)
{
callstack = new Dictionary<IndirectReference, IndirectReferenceToken>();
}
// This token need to be deep copied, because they could contain reference. So we have to update them.
switch (tokenToCopy)
{
case DictionaryToken dictionaryToken:
{
var newContent = new Dictionary<NameToken, IToken>();
foreach (var setPair in dictionaryToken.Data)
{
var name = setPair.Key;
var token = setPair.Value;
newContent.Add(NameToken.Create(name), CopyToken(writer, token, tokenScanner, referencesFromDocument, callstack));
}
return new DictionaryToken(newContent);
}
case ArrayToken arrayToken:
{
var newContent = new Dictionary<NameToken, IToken>();
foreach (var setPair in dictionaryToken.Data)
{
var name = setPair.Key;
var token = setPair.Value;
newContent.Add(NameToken.Create(name), CopyToken(writer, token, tokenScanner, referencesFromDocument, callstack));
}
return new DictionaryToken(newContent);
}
case ArrayToken arrayToken:
var newArray = new List<IToken>(arrayToken.Length);
foreach (var token in arrayToken.Data)
{
var newArray = new List<IToken>(arrayToken.Length);
foreach (var token in arrayToken.Data)
{
newArray.Add(CopyToken(writer, token, tokenScanner, referencesFromDocument, callstack));
}
return new ArrayToken(newArray);
newArray.Add(CopyToken(writer, token, tokenScanner, referencesFromDocument, callstack));
}
case IndirectReferenceToken referenceToken:
return new ArrayToken(newArray);
}
case IndirectReferenceToken referenceToken:
{
if (referencesFromDocument.TryGetValue(referenceToken.Data, out var newReferenceToken))
{
if (referencesFromDocument.TryGetValue(referenceToken.Data, out var newReferenceToken))
{
return newReferenceToken;
}
return newReferenceToken;
}
if (callstack.ContainsKey(referenceToken.Data) && callstack[referenceToken.Data] == null)
{
newReferenceToken = writer.ReserveObjectNumber();
callstack[referenceToken.Data] = newReferenceToken;
referencesFromDocument.Add(referenceToken.Data, newReferenceToken);
return newReferenceToken;
}
callstack.Add(referenceToken.Data, null);
// we add the token to referencesFromDocument to prevent stackoverflow on references cycles
// newReferenceToken = context.ReserveNumberToken();
// callstack.Add(newReferenceToken.Data.ObjectNumber);
// referencesFromDocument.Add(referenceToken.Data, newReferenceToken);
//
var tokenObject = DirectObjectFinder.Get<IToken>(referenceToken.Data, tokenScanner);
Debug.Assert(!(tokenObject is IndirectReferenceToken));
var result = CopyToken(writer, tokenObject, tokenScanner, referencesFromDocument, callstack);
if (callstack[referenceToken.Data] != null)
{
return writer.WriteToken(result, callstack[referenceToken.Data]);
}
newReferenceToken = writer.WriteToken(result);
if (callstack.ContainsKey(referenceToken.Data) && callstack[referenceToken.Data] == null)
{
newReferenceToken = writer.ReserveObjectNumber();
callstack[referenceToken.Data] = newReferenceToken;
referencesFromDocument.Add(referenceToken.Data, newReferenceToken);
return newReferenceToken;
}
case StreamToken streamToken:
{
var properties = CopyToken(writer, streamToken.StreamDictionary, tokenScanner, referencesFromDocument, callstack) as DictionaryToken;
Debug.Assert(properties != null);
var bytes = streamToken.Data;
return new StreamToken(properties, bytes);
}
callstack.Add(referenceToken.Data, null);
case ObjectToken _:
// we add the token to referencesFromDocument to prevent stackoverflow on references cycles
// newReferenceToken = context.ReserveNumberToken();
// callstack.Add(newReferenceToken.Data.ObjectNumber);
// referencesFromDocument.Add(referenceToken.Data, newReferenceToken);
//
var tokenObject = DirectObjectFinder.Get<IToken>(referenceToken.Data, tokenScanner);
Debug.Assert(!(tokenObject is IndirectReferenceToken));
var result = CopyToken(writer, tokenObject, tokenScanner, referencesFromDocument, callstack);
if (callstack[referenceToken.Data] != null)
{
// Since we don't write token directly to the stream.
// We can't know the offset. Therefore the token would be invalid
throw new NotSupportedException("Copying a Object token is not supported");
return writer.WriteToken(result, callstack[referenceToken.Data]);
}
}
return tokenToCopy;
}
internal static IEnumerable<(DictionaryToken, List<DictionaryToken>)> WalkTree(PageTreeNode node, List<DictionaryToken> parents=null)
{
if (parents == null)
{
parents = new List<DictionaryToken>();
}
if (node.IsPage)
{
yield return (node.NodeDictionary, parents);
yield break;
}
parents = parents.ToList();
parents.Add(node.NodeDictionary);
foreach (var child in node.Children)
{
foreach (var item in WalkTree(child, parents))
{
yield return item;
newReferenceToken = writer.WriteToken(result);
referencesFromDocument.Add(referenceToken.Data, newReferenceToken);
return newReferenceToken;
}
case StreamToken streamToken:
{
var properties = CopyToken(writer, streamToken.StreamDictionary, tokenScanner, referencesFromDocument, callstack) as DictionaryToken;
Debug.Assert(properties != null);
var bytes = streamToken.Data;
return new StreamToken(properties, bytes);
}
case ObjectToken _:
{
// Since we don't write token directly to the stream.
// We can't know the offset. Therefore the token would be invalid
throw new NotSupportedException("Copying a Object token is not supported");
}
}
return tokenToCopy;
}
/// <summary>
/// Lazily enumerates every page reachable from <paramref name="node"/>, pairing each page's
/// dictionary with the dictionaries of the non-page nodes visited on the way down to it.
/// </summary>
/// <param name="node">Page tree node to start from.</param>
/// <param name="parents">Dictionaries of nodes already visited above <paramref name="node"/>; an empty list is used when null.</param>
/// <returns>One tuple per page: the page dictionary and the list of ancestor dictionaries, starting node first.</returns>
internal static IEnumerable<(DictionaryToken, List<DictionaryToken>)> WalkTree(PageTreeNode node, List<DictionaryToken> parents=null)
{
    var ancestors = parents ?? new List<DictionaryToken>();

    if (!node.IsPage)
    {
        // Extend a copy so sibling branches never see each other's additions.
        var lineage = new List<DictionaryToken>(ancestors) { node.NodeDictionary };

        foreach (var child in node.Children)
        {
            foreach (var found in WalkTree(child, lineage))
            {
                yield return found;
            }
        }

        yield break;
    }

    // A page is a leaf: report it together with the ancestors collected so far.
    yield return (node.NodeDictionary, ancestors);
}
}
}