Clean up PdfPageBuilder; switch PdfMerger to use PdfDocumentBuilder

This commit is contained in:
Plaisted
2021-02-08 12:37:09 -06:00
parent ca0b90523e
commit 6e1cf89cf9
5 changed files with 242 additions and 522 deletions

View File

@@ -865,8 +865,8 @@
using (var document = PdfDocument.Open(result, ParsingOptions.LenientParsingOff)) using (var document = PdfDocument.Open(result, ParsingOptions.LenientParsingOff))
{ {
Assert.Equal(1, document.NumberOfPages); Assert.Equal(1, document.NumberOfPages);
var pg = document.GetPage(1); var pg = document.GetPage(1);
// single empty page should result in single content stream // single empty page should result in single content stream
Assert.NotNull(pg.Dictionary.Data[NameToken.Contents] as IndirectReferenceToken); Assert.NotNull(pg.Dictionary.Data[NameToken.Contents] as IndirectReferenceToken);
} }
} }
@@ -925,7 +925,7 @@
var pg = document.GetPage(1); var pg = document.GetPage(1);
// multiple streams should be written to array // multiple streams should be written to array
var streams = pg.Dictionary.Data[NameToken.Contents] as ArrayToken; var streams = pg.Dictionary.Data[NameToken.Contents] as ArrayToken;
Assert.NotNull(streams); Assert.NotNull(streams);
Assert.Equal(2, streams.Length); Assert.Equal(2, streams.Length);
} }
} }

View File

@@ -74,13 +74,24 @@ namespace UglyToad.PdfPig.Writer
context.InitializePdf(1.7m); context.InitializePdf(1.7m);
} }
/// <summary>
/// Creates a document builder keeping resources in memory.
/// </summary>
/// <param name="version">Pdf version to use in header.</param>
public PdfDocumentBuilder(decimal version)
{
context = new PdfStreamWriter(new MemoryStream(), true);
context.InitializePdf(version);
}
/// <summary> /// <summary>
/// Creates a document builder using the supplied stream. /// Creates a document builder using the supplied stream.
/// </summary> /// </summary>
/// <param name="stream">Stream to write pdf to.</param> /// <param name="stream">Stream to write pdf to.</param>
/// <param name="disposeStream">If stream should be disposed when builder is.</param> /// <param name="disposeStream">If stream should be disposed when builder is.</param>
/// <param name="type">Type of pdf stream writer to use</param> /// <param name="type">Type of pdf stream writer to use</param>
public PdfDocumentBuilder(Stream stream, bool disposeStream=false, PdfWriterType type=PdfWriterType.Default) /// <param name="version">Pdf version to use in header.</param>
public PdfDocumentBuilder(Stream stream, bool disposeStream=false, PdfWriterType type=PdfWriterType.Default, decimal version=1.7m)
{ {
switch (type) switch (type)
{ {
@@ -91,7 +102,7 @@ namespace UglyToad.PdfPig.Writer
context = new PdfStreamWriter(stream, disposeStream); context = new PdfStreamWriter(stream, disposeStream);
break; break;
} }
context.InitializePdf(1.7m); context.InitializePdf(version);
} }
/// <summary> /// <summary>
@@ -287,7 +298,7 @@ namespace UglyToad.PdfPig.Writer
/// <param name="pageNumber">Page to copy.</param> /// <param name="pageNumber">Page to copy.</param>
/// <returns>A builder for editing the page.</returns> /// <returns>A builder for editing the page.</returns>
public PdfPageBuilder AddPage(PdfDocument document, int pageNumber) public PdfPageBuilder AddPage(PdfDocument document, int pageNumber)
{ {
if (!existingCopies.TryGetValue(document.Structure.TokenScanner, out var refs)) if (!existingCopies.TryGetValue(document.Structure.TokenScanner, out var refs))
{ {
refs = new Dictionary<IndirectReference, IndirectReferenceToken>(); refs = new Dictionary<IndirectReference, IndirectReferenceToken>();
@@ -372,23 +383,12 @@ namespace UglyToad.PdfPig.Writer
WriterUtil.CopyToken(context, kvp.Value, document.Structure.TokenScanner, refs); WriterUtil.CopyToken(context, kvp.Value, document.Structure.TokenScanner, refs);
} }
var builder = new PdfPageBuilder(pages.Count + 1, this, streams, resources, copiedPageDict); copiedPageDict[NameToken.Resources] = new DictionaryToken(resources);
if (resources.TryGetValue(NameToken.Font, out var fonts))
{
var existingFontDict = fonts as DictionaryToken;
foreach (var item in existingFontDict.Data)
{
var key = NameToken.Create(item.Key);
builder.fontDictionary[key] = item.Value;
}
resources.Remove(NameToken.Font);
}
var builder = new PdfPageBuilder(pages.Count + 1, this, streams, copiedPageDict);
pages[builder.PageNumber] = builder; pages[builder.PageNumber] = builder;
return builder; return builder;
void CopyResourceDict(IToken token, Dictionary<NameToken, IToken> destinationDict) void CopyResourceDict(IToken token, Dictionary<NameToken, IToken> destinationDict)
{ {
DictionaryToken dict = GetRemoteDict(token); DictionaryToken dict = GetRemoteDict(token);
@@ -483,7 +483,7 @@ namespace UglyToad.PdfPig.Writer
foreach (var page in pages) foreach (var page in pages)
{ {
var pageDictionary = page.Value.additionalPageProperties; var pageDictionary = page.Value.pageDictionary;
pageDictionary[NameToken.Type] = NameToken.Page; pageDictionary[NameToken.Type] = NameToken.Page;
pageDictionary[NameToken.Parent] = leafRefs[leafNum]; pageDictionary[NameToken.Parent] = leafRefs[leafNum];
pageDictionary[NameToken.ProcSet] = new ArrayToken(procSet); pageDictionary[NameToken.ProcSet] = new ArrayToken(procSet);
@@ -492,29 +492,19 @@ namespace UglyToad.PdfPig.Writer
pageDictionary[NameToken.MediaBox] = RectangleToArray(page.Value.PageSize); pageDictionary[NameToken.MediaBox] = RectangleToArray(page.Value.PageSize);
} }
// combine existing resources (if any) with added
var pageResources = new Dictionary<NameToken, IToken>();
foreach (var existing in page.Value.Resources)
{
pageResources[existing.Key] = existing.Value;
}
pageResources[NameToken.Font] = new DictionaryToken(page.Value.fontDictionary);
pageDictionary[NameToken.Resources] = new DictionaryToken(pageResources);
var toWrite = page.Value.contentStreams.Where(x => x.HasContent).ToList(); var toWrite = page.Value.contentStreams.Where(x => x.HasContent).ToList();
if (toWrite.Count == 0) if (toWrite.Count == 0)
{ {
// write empty // write empty
pageDictionary[NameToken.Contents] = new PdfPageBuilder.DefaultContentStream().Write(context); pageDictionary[NameToken.Contents] = new PdfPageBuilder.DefaultContentStream().Write(context);
} }
else if (toWrite.Count == 1) else if (toWrite.Count == 1)
{ {
// write single // write single
pageDictionary[NameToken.Contents] = toWrite[0].Write(context); pageDictionary[NameToken.Contents] = toWrite[0].Write(context);
} }
else else
{ {
// write array // write array
var streams = new List<IToken>(); var streams = new List<IToken>();
foreach (var stream in toWrite) foreach (var stream in toWrite)

View File

@@ -2,22 +2,10 @@
{ {
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Diagnostics;
using System.IO; using System.IO;
using Content;
using Core;
using CrossReference;
using Encryption;
using Filters; using Filters;
using Logging; using Logging;
using Parser;
using Parser.FileStructure;
using Parser.Parts;
using Tokenization.Scanner;
using Tokens;
using Exceptions;
using System.Linq; using System.Linq;
using Util;
/// <summary> /// <summary>
/// Merges PDF documents into each other. /// Merges PDF documents into each other.
@@ -48,9 +36,9 @@
_ = file1 ?? throw new ArgumentNullException(nameof(file1)); _ = file1 ?? throw new ArgumentNullException(nameof(file1));
_ = file2 ?? throw new ArgumentNullException(nameof(file2)); _ = file2 ?? throw new ArgumentNullException(nameof(file2));
using (var stream1 = new StreamInputBytes(File.OpenRead(file1))) using (var stream1 = File.OpenRead(file1))
{ {
using (var stream2 = new StreamInputBytes(File.OpenRead(file2))) using (var stream2 = File.OpenRead(file2))
{ {
Merge(new[] { stream1, stream2 }, output, new[] { file1Selection, file2Selection }); Merge(new[] { stream1, stream2 }, output, new[] { file1Selection, file2Selection });
} }
@@ -74,13 +62,13 @@
/// </summary> /// </summary>
public static void Merge(Stream output, params string[] filePaths) public static void Merge(Stream output, params string[] filePaths)
{ {
var streams = new List<StreamInputBytes>(filePaths.Length); var streams = new List<Stream>(filePaths.Length);
try try
{ {
for (var i = 0; i < filePaths.Length; i++) for (var i = 0; i < filePaths.Length; i++)
{ {
var filePath = filePaths[i] ?? throw new ArgumentNullException(nameof(filePaths), $"Null filepath at index {i}."); var filePath = filePaths[i] ?? throw new ArgumentNullException(nameof(filePaths), $"Null filepath at index {i}.");
streams.Add(new StreamInputBytes(File.OpenRead(filePath), true)); streams.Add(File.OpenRead(filePath));
} }
Merge(streams, output, null); Merge(streams, output, null);
@@ -103,7 +91,7 @@
using (var output = new MemoryStream()) using (var output = new MemoryStream())
{ {
Merge(files.Select(f => new ByteArrayInputBytes(f)).ToArray(), output, pagesBundle); Merge(files.Select(f => PdfDocument.Open(f)).ToArray(), output, pagesBundle);
return output.ToArray(); return output.ToArray();
} }
} }
@@ -122,317 +110,39 @@
_ = streams ?? throw new ArgumentNullException(nameof(streams)); _ = streams ?? throw new ArgumentNullException(nameof(streams));
_ = output ?? throw new ArgumentNullException(nameof(output)); _ = output ?? throw new ArgumentNullException(nameof(output));
Merge(streams.Select(f => new StreamInputBytes(f, false)).ToArray(), output, pagesBundle); Merge(streams.Select(f => PdfDocument.Open(f)).ToArray(), output, pagesBundle);
} }
private static void Merge(IReadOnlyList<IInputBytes> files, Stream output, IReadOnlyList<IReadOnlyList<int>> pagesBundle) private static void Merge(IReadOnlyList<PdfDocument> files, Stream output, IReadOnlyList<IReadOnlyList<int>> pagesBundle)
{ {
const bool isLenientParsing = false; var maxVersion = files.Select(x=>x.Version).Max();
using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, maxVersion))
var writer = new PdfStreamWriter(output, false);
var documentBuilder = new DocumentMerger(writer);
var maxVersion = 1.2m;
var infos = new List<(CoreTokenScanner CoreScanner, HeaderVersion Version)>();
foreach (var fileIndex in Enumerable.Range(0, files.Count))
{ {
var inputBytes = files[fileIndex]; foreach (var fileIndex in Enumerable.Range(0, files.Count))
var coreScanner = new CoreTokenScanner(inputBytes);
var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log);
maxVersion = Math.Max(maxVersion, version.Version);
infos.Add((coreScanner, version));
}
writer.InitializePdf(maxVersion);
foreach (var fileIndex in Enumerable.Range(0, files.Count))
{
IReadOnlyList<int> pages = null;
if (pagesBundle != null && fileIndex < pagesBundle.Count)
{ {
pages = pagesBundle[fileIndex]; var existing = files[fileIndex];
} IReadOnlyList<int> pages = null;
if (pagesBundle != null && fileIndex < pagesBundle.Count)
var inputBytes = files[fileIndex];
var (coreScanner, version) = infos[fileIndex];
var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log),
new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider));
CrossReferenceTable crossReference = null;
// ReSharper disable once AccessToModifiedClosure
var locationProvider = new ObjectLocationProvider(() => crossReference, inputBytes);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, FilterProvider, NoOpEncryptionHandler.Instance);
var crossReferenceOffset = FileTrailerParser.GetFirstCrossReferenceOffset(inputBytes, coreScanner, isLenientParsing);
crossReference = crossReferenceParser.Parse(inputBytes, isLenientParsing, crossReferenceOffset, version.OffsetInFile, pdfScanner, coreScanner);
var catalogDictionaryToken = ParseCatalog(crossReference, pdfScanner, out var encryptionDictionary);
if (encryptionDictionary != null)
{
throw new PdfDocumentEncryptedException("Unable to merge document with password");
}
var documentCatalog = CatalogFactory.Create(crossReference.Trailer.Root, catalogDictionaryToken, pdfScanner, isLenientParsing);
documentBuilder.AppendDocument(documentCatalog, pdfScanner, pages);
}
documentBuilder.Build();
}
// This method is a basically a copy of the method UglyToad.PdfPig.Parser.PdfDocumentFactory.ParseTrailer()
private static DictionaryToken ParseCatalog(CrossReferenceTable crossReferenceTable,
IPdfTokenScanner pdfTokenScanner,
out EncryptionDictionary encryptionDictionary)
{
encryptionDictionary = null;
if (crossReferenceTable.Trailer.EncryptionToken != null)
{
if (!DirectObjectFinder.TryGet(crossReferenceTable.Trailer.EncryptionToken, pdfTokenScanner,
out DictionaryToken encryptionDictionaryToken))
{
throw new PdfDocumentFormatException($"Unrecognized encryption token in trailer: {crossReferenceTable.Trailer.EncryptionToken}.");
}
encryptionDictionary = EncryptionDictionaryFactory.Read(encryptionDictionaryToken, pdfTokenScanner);
}
var rootDictionary = DirectObjectFinder.Get<DictionaryToken>(crossReferenceTable.Trailer.Root, pdfTokenScanner);
if (!rootDictionary.ContainsKey(NameToken.Type))
{
rootDictionary = rootDictionary.With(NameToken.Type, NameToken.Catalog);
}
return rootDictionary;
}
private class DocumentMerger
{
private const int ARTIFICIAL_NODE_LIMIT = 100;
private readonly IPdfStreamWriter context;
private readonly List<IndirectReferenceToken> pagesTokenReferences = new List<IndirectReferenceToken>();
private readonly IndirectReferenceToken rootPagesReference;
private int pageCount = 0;
public DocumentMerger(IPdfStreamWriter writer)
{
context = writer;
rootPagesReference = context.ReserveObjectNumber();
}
public void AppendDocument(Catalog catalog, IPdfTokenScanner tokenScanner, IReadOnlyList<int> pages)
{
IEnumerable<int> pageIndices;
if (pages == null)
{
var pagesCount = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
if (pagesCount < 1)
{ {
return; pages = pagesBundle[fileIndex];
} }
pageIndices = Enumerable.Range(1, pagesCount); if (pages == null)
}
else if (pages.Count < 1)
{
return;
}
else
{
pageIndices = pages;
}
var referencesFromDocument = new Dictionary<IndirectReference, IndirectReferenceToken>();
var currentNodeReference = context.ReserveObjectNumber();
var pagesReferences = new List<IndirectReferenceToken>();
var resources = new Dictionary<string, IToken>();
bool DoesAEntryCollide(PageTreeNode node)
{
while (node != null)
{ {
var dictionary = node.NodeDictionary; for (var i = 1; i <= existing.NumberOfPages; i++)
if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary))
{ {
var nonCollidingResources = resourcesDictionary.Data.Keys.Except(resources.Keys); document.AddPage(existing, 1);
if (nonCollidingResources.Count() != resourcesDictionary.Data.Count)
{
// This means that at least one of the resources collided
return true;
}
} }
} else
/* TODO: How to handle?
* `Rotate`
* `CropBox`
* `MediaBox`
*/
// No colliding entry was found, in this node
// Keep walking up into the tree
node = node.Parent;
}
return false;
}
void CopyEntries(PageTreeNode node)
{
while (node != null)
{ {
var dictionary = node.NodeDictionary; foreach (var i in pages)
if (dictionary.TryGet(NameToken.Resources, tokenScanner, out DictionaryToken resourcesDictionary))
{ {
foreach (var pair in resourcesDictionary.Data) document.AddPage(existing, 1);
{
resources.Add(pair.Key, CopyToken(pair.Value, tokenScanner, referencesFromDocument));
}
} }
/* TODO: How to handle?
* `Rotate`
* `CropBox`
* `MediaBox`
*/
// Keep walking up into the tree
node = node.Parent;
} }
} }
void CreateTree()
{
if (pagesReferences.Count < 1)
{
throw new InvalidOperationException("Pages reference should always be more than 1 when executing this function");
}
var newPagesNode = new Dictionary<NameToken, IToken>
{
{ NameToken.Type, NameToken.Pages },
{ NameToken.Kids, new ArrayToken(pagesReferences) },
{ NameToken.Count, new NumericToken(pagesReferences.Count) },
{ NameToken.Parent, rootPagesReference }
};
if (resources.Count > 0)
{
newPagesNode.Add(NameToken.Resources, DictionaryToken.With(resources));
}
var pagesDictionary = new DictionaryToken(newPagesNode);
context.WriteToken(pagesDictionary, currentNodeReference);
pagesTokenReferences.Add(currentNodeReference);
pageCount += pagesReferences.Count;
};
foreach (var pageIndex in pageIndices)
{
var pageNode = catalog.GetPageNode(pageIndex);
if (pagesReferences.Count >= ARTIFICIAL_NODE_LIMIT || DoesAEntryCollide(pageNode))
{
CreateTree();
currentNodeReference = context.ReserveObjectNumber();
pagesReferences = new List<IndirectReferenceToken>();
resources = new Dictionary<string, IToken>();
}
CopyEntries(pageNode.Parent);
pagesReferences.Add(CopyPageNode(pageNode, currentNodeReference, tokenScanner, referencesFromDocument));
}
if (pagesReferences.Count < 1)
{
throw new InvalidOperationException("Pages reference couldn't be less than 1 because we have reserved a indirect reference token");
}
CreateTree();
}
public void Build()
{
if (pagesTokenReferences.Count < 1)
{
throw new PdfDocumentFormatException("Empty document");
}
var pagesDictionary = new DictionaryToken(new Dictionary<NameToken, IToken>
{
{ NameToken.Type, NameToken.Pages },
{ NameToken.Kids, new ArrayToken(pagesTokenReferences) },
{ NameToken.Count, new NumericToken(pageCount) }
});
var pagesRef = context.WriteToken(pagesDictionary, rootPagesReference);
var catalog = new DictionaryToken(new Dictionary<NameToken, IToken>
{
{ NameToken.Type, NameToken.Catalog },
{ NameToken.Pages, pagesRef }
});
var catalogRef = context.WriteToken(catalog);
context.CompletePdf(catalogRef);
Close();
}
public void Close()
{
context.Dispose();
}
private IndirectReferenceToken CopyPageNode(PageTreeNode pageNode, IndirectReferenceToken parentPagesObject, IPdfTokenScanner tokenScanner,
IDictionary<IndirectReference, IndirectReferenceToken> referencesFromDocument)
{
Debug.Assert(pageNode.IsPage);
var pageDictionary = new Dictionary<NameToken, IToken>
{
{NameToken.Parent, parentPagesObject},
};
foreach (var setPair in pageNode.NodeDictionary.Data)
{
var name = setPair.Key;
var token = setPair.Value;
if (name == NameToken.Parent)
{
// Skip Parent token, since we have to reassign it
continue;
}
pageDictionary.Add(NameToken.Create(name), CopyToken(token, tokenScanner, referencesFromDocument));
}
return context.WriteToken(new DictionaryToken(pageDictionary));
}
/// <summary>
/// The purpose of this method is to resolve indirect reference. That mean copy the reference's content to the new document's stream
/// and replace the indirect reference with the correct/new one
/// </summary>
/// <param name="tokenToCopy">Token to inspect for reference</param>
/// <param name="tokenScanner">scanner get the content from the original document</param>
/// <param name="referencesFromDocument">Map of previously copied</param>
/// <returns>A reference of the token that was copied. With all the reference updated</returns>
private IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner, IDictionary<IndirectReference, IndirectReferenceToken> referencesFromDocument)
{
return WriterUtil.CopyToken(context, tokenToCopy, tokenScanner, referencesFromDocument);
} }
} }
} }
} }

View File

@@ -27,23 +27,27 @@
/// A builder used to construct a page in a PDF document. /// A builder used to construct a page in a PDF document.
/// </summary> /// </summary>
public class PdfPageBuilder public class PdfPageBuilder
{ {
private readonly PdfDocumentBuilder documentBuilder; // parent
private IPageContentStream currentStream; private readonly PdfDocumentBuilder documentBuilder;
// all page data other than content streams
internal readonly Dictionary<NameToken, IToken> pageDictionary = new Dictionary<NameToken, IToken>();
// streams
internal readonly List<IPageContentStream> contentStreams; internal readonly List<IPageContentStream> contentStreams;
internal readonly Dictionary<NameToken, IToken> additionalPageProperties = new Dictionary<NameToken, IToken>(); private IPageContentStream currentStream;
private readonly Dictionary<NameToken, IToken> resourcesDictionary = new Dictionary<NameToken, IToken>();
internal Dictionary<NameToken, IToken> fontDictionary = new Dictionary<NameToken, IToken>(); // maps fonts added using PdfDocumentBuilder to page font names
internal int nextFontId = 1;
private readonly Dictionary<Guid, NameToken> documentFonts = new Dictionary<Guid, NameToken>(); private readonly Dictionary<Guid, NameToken> documentFonts = new Dictionary<Guid, NameToken>();
internal int nextFontId = 1;
// A sequence number of the ShowText operation, used to determine whether letters belong to the same operation or not (letters that belong to different operations are less likely to belong to the same word) // A sequence number of the ShowText operation, used to determine whether letters belong to the same operation or not (letters that belong to different operations are less likely to belong to the same word)
private int textSequence; private int textSequence;
private int imageKey = 1; private int imageKey = 1;
internal IReadOnlyDictionary<NameToken, IToken> Resources => resourcesDictionary; internal IReadOnlyDictionary<string, IToken> Resources => pageDictionary.GetOrCreateDict(NameToken.Resources);
/// <summary> /// <summary>
/// The number of this page, 1-indexed. /// The number of this page, 1-indexed.
@@ -75,16 +79,15 @@
} }
internal PdfPageBuilder(int number, PdfDocumentBuilder documentBuilder, IEnumerable<CopiedContentStream> copied, internal PdfPageBuilder(int number, PdfDocumentBuilder documentBuilder, IEnumerable<CopiedContentStream> copied,
Dictionary<NameToken, IToken> existingResources, Dictionary<NameToken, IToken> pageDict) Dictionary<NameToken, IToken> pageDict)
{ {
this.documentBuilder = documentBuilder ?? throw new ArgumentNullException(nameof(documentBuilder)); this.documentBuilder = documentBuilder ?? throw new ArgumentNullException(nameof(documentBuilder));
PageNumber = number; PageNumber = number;
pageDictionary = pageDict;
contentStreams = new List<IPageContentStream>(); contentStreams = new List<IPageContentStream>();
contentStreams.AddRange(copied); contentStreams.AddRange(copied);
currentStream = new DefaultContentStream(); currentStream = new DefaultContentStream();
contentStreams.Add(currentStream); contentStreams.Add(currentStream);
additionalPageProperties =pageDict ?? new Dictionary<NameToken, IToken>();
resourcesDictionary = existingResources;
} }
/// <summary> /// <summary>
@@ -343,13 +346,15 @@
if (!documentFonts.TryGetValue(font.Id, out NameToken value)) if (!documentFonts.TryGetValue(font.Id, out NameToken value))
{ {
value = NameToken.Create($"F{nextFontId++}"); value = NameToken.Create($"F{nextFontId++}");
while (fontDictionary.ContainsKey(value)) var resources = pageDictionary.GetOrCreateDict(NameToken.Resources);
var fonts = resources.GetOrCreateDict(NameToken.Font);
while (fonts.ContainsKey(value))
{ {
value = NameToken.Create($"F{nextFontId++}"); value = NameToken.Create($"F{nextFontId++}");
} }
documentFonts[font.Id] = value; documentFonts[font.Id] = value;
fontDictionary[value] = font.Reference; fonts[value] = font.Reference;
} }
return value; return value;
@@ -395,17 +400,11 @@
}; };
var reference = documentBuilder.AddImage(new DictionaryToken(imgDictionary), data); var reference = documentBuilder.AddImage(new DictionaryToken(imgDictionary), data);
var resources = pageDictionary.GetOrCreateDict(NameToken.Resources);
if (!resourcesDictionary.TryGetValue(NameToken.Xobject, out var xobjectsDict) var xObjects = resources.GetOrCreateDict(NameToken.Xobject);
|| !(xobjectsDict is DictionaryToken xobjects))
{
xobjects = new DictionaryToken(new Dictionary<NameToken, IToken>());
resourcesDictionary[NameToken.Xobject] = xobjects;
}
var key = NameToken.Create($"I{imageKey++}"); var key = NameToken.Create($"I{imageKey++}");
xObjects[key] = reference;
resourcesDictionary[NameToken.Xobject] = xobjects.With(key, reference);
currentStream.Add(Push.Value); currentStream.Add(Push.Value);
// This needs to be the placement rectangle. // This needs to be the placement rectangle.
@@ -435,16 +434,11 @@
/// </summary> /// </summary>
public void AddImage(AddedImage image, PdfRectangle placementRectangle) public void AddImage(AddedImage image, PdfRectangle placementRectangle)
{ {
if (!resourcesDictionary.TryGetValue(NameToken.Xobject, out var xobjectsDict) var resources = pageDictionary.GetOrCreateDict(NameToken.Resources);
|| !(xobjectsDict is DictionaryToken xobjects)) var xObjects = resources.GetOrCreateDict(NameToken.Xobject);
{
xobjects = new DictionaryToken(new Dictionary<NameToken, IToken>());
resourcesDictionary[NameToken.Xobject] = xobjects;
}
var key = NameToken.Create($"I{imageKey++}"); var key = NameToken.Create($"I{imageKey++}");
xObjects[key] = new IndirectReferenceToken(image.Reference);
resourcesDictionary[NameToken.Xobject] = xobjects.With(key, new IndirectReferenceToken(image.Reference));
currentStream.Add(Push.Value); currentStream.Add(Push.Value);
// This needs to be the placement rectangle. // This needs to be the placement rectangle.
@@ -513,16 +507,12 @@
var reference = documentBuilder.AddImage(new DictionaryToken(imgDictionary), compressed); var reference = documentBuilder.AddImage(new DictionaryToken(imgDictionary), compressed);
if (!resourcesDictionary.TryGetValue(NameToken.Xobject, out var xobjectsDict) var resources = pageDictionary.GetOrCreateDict(NameToken.Resources);
|| !(xobjectsDict is DictionaryToken xobjects)) var xObjects = resources.GetOrCreateDict(NameToken.Xobject);
{
xobjects = new DictionaryToken(new Dictionary<NameToken, IToken>());
resourcesDictionary[NameToken.Xobject] = xobjects;
}
var key = NameToken.Create($"I{imageKey++}"); var key = NameToken.Create($"I{imageKey++}");
resourcesDictionary[NameToken.Xobject] = xobjects.With(key, reference); xObjects[key] = reference;
currentStream.Add(Push.Value); currentStream.Add(Push.Value);
// This needs to be the placement rectangle. // This needs to be the placement rectangle.
@@ -568,6 +558,8 @@
// We need to relocate the resources, and we have to make sure that none of the resources collide with // We need to relocate the resources, and we have to make sure that none of the resources collide with
// the already written operation's resources // the already written operation's resources
var resources = pageDictionary.GetOrCreateDict(NameToken.Resources);
foreach (var set in srcResourceDictionary.Data) foreach (var set in srcResourceDictionary.Data)
{ {
var nameToken = NameToken.Create(set.Key); var nameToken = NameToken.Create(set.Key);
@@ -577,11 +569,11 @@
continue; continue;
} }
if (!resourcesDictionary.TryGetValue(nameToken, out var currentToken)) if (!resources.ContainsKey(nameToken))
{ {
// It means that this type of resources doesn't currently exist in the page, so we can copy it // It means that this type of resources doesn't currently exist in the page, so we can copy it
// with no problem // with no problem
resourcesDictionary[nameToken] = documentBuilder.CopyToken(srcPage.pdfScanner, set.Value); resources[nameToken] = documentBuilder.CopyToken(srcPage.pdfScanner, set.Value);
continue; continue;
} }
@@ -592,26 +584,17 @@
// Special cases // Special cases
// Since we don't directly add fonts to the page's resources, we have to go look at the document's fonts // Since we don't directly add fonts to the page's resources, we have to go look at the document's fonts
if(srcResourceDictionary.TryGet(NameToken.Font, srcPage.pdfScanner, out DictionaryToken fontsDictionary)) if(srcResourceDictionary.TryGet(NameToken.Font, srcPage.pdfScanner, out DictionaryToken fontsDictionary))
{ {
Dictionary<NameToken, IToken> pageFontsDictionary = null; var pageFontsDictionary = resources.GetOrCreateDict(NameToken.Font);
if (resourcesDictionary.TryGetValue(NameToken.Font, out var pageFontsToken))
{
pageFontsDictionary = (pageFontsToken as DictionaryToken)?.Data.ToDictionary(k => NameToken.Create(k.Key), v => v.Value);
Debug.Assert(pageFontsDictionary != null);
}
else
{
pageFontsDictionary = new Dictionary<NameToken, IToken>();
}
foreach (var fontSet in fontsDictionary.Data) foreach (var fontSet in fontsDictionary.Data)
{ {
var fontName = NameToken.Create(fontSet.Key); var fontName = NameToken.Create(fontSet.Key);
if (fontDictionary.ContainsKey(fontName)) if (pageFontsDictionary.ContainsKey(fontName))
{ {
// This means the imported font collides with one of the added fonts, so we have to rename it // This means the imported font collides with one of the added fonts, so we have to rename it
var newName = NameToken.Create($"F{nextFontId++}"); var newName = NameToken.Create($"F{nextFontId++}");
while (fontDictionary.ContainsKey(newName)) while (pageFontsDictionary.ContainsKey(newName))
{ {
newName = NameToken.Create($"F{nextFontId++}"); newName = NameToken.Create($"F{nextFontId++}");
} }
@@ -642,26 +625,12 @@
pageFontsDictionary.Add(fontName, documentBuilder.CopyToken(srcPage.pdfScanner, fontReferenceToken)); pageFontsDictionary.Add(fontName, documentBuilder.CopyToken(srcPage.pdfScanner, fontReferenceToken));
} }
foreach (var item in pageFontsDictionary)
{
fontDictionary[item.Key] = item.Value;
}
} }
// Since we don't directly add xobjects to the page's resources, we have to go look at the document's xobjects // Since we don't directly add xobjects to the page's resources, we have to go look at the document's xobjects
if (srcResourceDictionary.TryGet(NameToken.Xobject, srcPage.pdfScanner, out DictionaryToken xobjectsDictionary)) if (srcResourceDictionary.TryGet(NameToken.Xobject, srcPage.pdfScanner, out DictionaryToken xobjectsDictionary))
{ {
Dictionary<NameToken, IToken> pageXobjectsDictionary = null; var pageXobjectsDictionary = resources.GetOrCreateDict(NameToken.Xobject);
if (resourcesDictionary.TryGetValue(NameToken.Xobject, out var pageXobjectToken))
{
pageXobjectsDictionary = (pageXobjectToken as DictionaryToken)?.Data.ToDictionary(k => NameToken.Create(k.Key), v => v.Value);
Debug.Assert(pageXobjectsDictionary != null);
}
else
{
pageXobjectsDictionary = new Dictionary<NameToken, IToken>();
}
var xobjectNamesUsed = Enumerable.Range(0, imageKey).Select(i => $"I{i}"); var xobjectNamesUsed = Enumerable.Range(0, imageKey).Select(i => $"I{i}");
foreach (var xobjectSet in xobjectsDictionary.Data) foreach (var xobjectSet in xobjectsDictionary.Data)
@@ -696,10 +665,8 @@
throw new PdfDocumentFormatException($"Expected a IndirectReferenceToken for the XObject, got a {xobjectSet.Value.GetType().Name}"); throw new PdfDocumentFormatException($"Expected a IndirectReferenceToken for the XObject, got a {xobjectSet.Value.GetType().Name}");
} }
pageXobjectsDictionary.Add(NameToken.Create(xobjectName), documentBuilder.CopyToken(srcPage.pdfScanner, fontReferenceToken)); pageXobjectsDictionary[xobjectName] = documentBuilder.CopyToken(srcPage.pdfScanner, fontReferenceToken);
} }
resourcesDictionary[NameToken.Xobject] = new DictionaryToken(pageXobjectsDictionary);
} }
destinationStream.Operations.AddRange(operations); destinationStream.Operations.AddRange(operations);
@@ -782,10 +749,10 @@
internal interface IPageContentStream : IContentStream internal interface IPageContentStream : IContentStream
{ {
bool ReadOnly { get; } bool ReadOnly { get; }
bool HasContent { get; } bool HasContent { get; }
void Add(IGraphicsStateOperation operation); void Add(IGraphicsStateOperation operation);
IndirectReferenceToken Write(IPdfStreamWriter writer); IndirectReferenceToken Write(IPdfStreamWriter writer);
} }
@@ -846,8 +813,8 @@
{ {
private readonly IndirectReferenceToken token; private readonly IndirectReferenceToken token;
public bool ReadOnly => true; public bool ReadOnly => true;
public bool HasContent => true; public bool HasContent => true;
public CopiedContentStream(IndirectReferenceToken indirectReferenceToken) public CopiedContentStream(IndirectReferenceToken indirectReferenceToken)
{ {
token = indirectReferenceToken; token = indirectReferenceToken;
@@ -905,5 +872,7 @@
Height = height; Height = height;
} }
} }
} }
} }

View File

@@ -7,132 +7,183 @@
using System.Collections.Generic; using System.Collections.Generic;
using System.Diagnostics; using System.Diagnostics;
using System.Linq; using System.Linq;
using System.Text;
using Tokenization.Scanner; using Tokenization.Scanner;
using Tokens; using Tokens;
internal class WriterUtil
internal static class WriterUtil
{ {
/// <summary> public static Dictionary<string, IToken> GetOrCreateDict(this Dictionary<NameToken, IToken> dict, NameToken key)
/// The purpose of this method is to resolve indirect reference. That mean copy the reference's content to the new document's stream {
/// and replace the indirect reference with the correct/new one if (dict.ContainsKey(key))
/// </summary>
/// <param name="writer">PDF stream writer</param>
/// <param name="tokenToCopy">Token to inspect for reference</param>
/// <param name="tokenScanner">scanner get the content from the original document</param>
/// <param name="referencesFromDocument">Map of previously copied tokens for original document.</param>
/// <param name="callstack">Call stack of indirect references</param>
/// <returns>A reference of the token that was copied. With all the reference updated</returns>
public static IToken CopyToken(IPdfStreamWriter writer, IToken tokenToCopy, IPdfTokenScanner tokenScanner,
IDictionary<IndirectReference, IndirectReferenceToken> referencesFromDocument, Dictionary<IndirectReference, IndirectReferenceToken> callstack=null)
{ {
if (callstack == null) var item = dict[key];
if (!(item is DictionaryToken dt))
{ {
callstack = new Dictionary<IndirectReference, IndirectReferenceToken>(); throw new ApplicationException("Expected dictionary token, got " + item.GetType());
} }
// This token need to be deep copied, because they could contain reference. So we have to update them. if (dt.Data is Dictionary<string, IToken> mutable)
switch (tokenToCopy)
{ {
case DictionaryToken dictionaryToken: return mutable;
}
mutable = dt.Data.
ToDictionary(x => x.Key, x => x.Value);
dict[key] = DictionaryToken.With(mutable);
return mutable;
}
var created = new Dictionary<string, IToken>();
dict[key] = DictionaryToken.With(created);
return created;
}
/// <summary>
/// Gets the mutable dictionary stored under <paramref name="key"/>, creating and storing
/// an empty one when the key is absent.
/// </summary>
/// <remarks>
/// When the stored <see cref="DictionaryToken"/> is not already backed by a
/// <see cref="Dictionary{TKey, TValue}"/>, its entries are copied into a new dictionary and the
/// entry in <paramref name="dict"/> is replaced with a token built from that copy.
/// NOTE(review): this presumes <c>DictionaryToken.With</c> wraps the supplied dictionary without
/// copying it, so that later edits to the returned dictionary are visible through the token - confirm.
/// </remarks>
/// <param name="dict">Dictionary to look the nested dictionary up in; it may be modified.</param>
/// <param name="key">Key of the nested dictionary.</param>
/// <returns>The mutable dictionary associated with <paramref name="key"/>.</returns>
/// <exception cref="ApplicationException">The value stored under <paramref name="key"/> is not a <see cref="DictionaryToken"/>.</exception>
public static Dictionary<string, IToken> GetOrCreateDict(this Dictionary<string, IToken> dict, string key)
{
    // Single lookup instead of ContainsKey followed by the indexer.
    if (!dict.TryGetValue(key, out var item))
    {
        var created = new Dictionary<string, IToken>();
        dict[key] = DictionaryToken.With(created);
        return created;
    }

    if (!(item is DictionaryToken dt))
    {
        // A null entry is reported like any other non-dictionary value rather than
        // failing with a NullReferenceException while building the message.
        throw new ApplicationException("Expected dictionary token, got " + (item?.GetType().ToString() ?? "null"));
    }

    if (dt.Data is Dictionary<string, IToken> mutable)
    {
        return mutable;
    }

    // The backing store is not a Dictionary: copy it and swap the stored token for one built from the copy.
    var copy = dt.Data.ToDictionary(x => x.Key, x => x.Value);
    dict[key] = DictionaryToken.With(copy);
    return copy;
}
/// <summary>
/// The purpose of this method is to resolve indirect reference. That mean copy the reference's content to the new document's stream
/// and replace the indirect reference with the correct/new one
/// </summary>
/// <param name="writer">PDF stream writer</param>
/// <param name="tokenToCopy">Token to inspect for reference</param>
/// <param name="tokenScanner">scanner get the content from the original document</param>
/// <param name="referencesFromDocument">Map of previously copied tokens for original document.</param>
/// <param name="callstack">Call stack of indirect references</param>
/// <returns>A reference of the token that was copied. With all the reference updated</returns>
public static IToken CopyToken(IPdfStreamWriter writer, IToken tokenToCopy, IPdfTokenScanner tokenScanner,
IDictionary<IndirectReference, IndirectReferenceToken> referencesFromDocument, Dictionary<IndirectReference, IndirectReferenceToken> callstack=null)
{
if (callstack == null)
{
callstack = new Dictionary<IndirectReference, IndirectReferenceToken>();
}
// This token need to be deep copied, because they could contain reference. So we have to update them.
switch (tokenToCopy)
{
case DictionaryToken dictionaryToken:
{
var newContent = new Dictionary<NameToken, IToken>();
foreach (var setPair in dictionaryToken.Data)
{
var name = setPair.Key;
var token = setPair.Value;
newContent.Add(NameToken.Create(name), CopyToken(writer, token, tokenScanner, referencesFromDocument, callstack));
}
return new DictionaryToken(newContent);
}
case ArrayToken arrayToken:
{ {
var newContent = new Dictionary<NameToken, IToken>(); var newArray = new List<IToken>(arrayToken.Length);
foreach (var setPair in dictionaryToken.Data) foreach (var token in arrayToken.Data)
{
var name = setPair.Key;
var token = setPair.Value;
newContent.Add(NameToken.Create(name), CopyToken(writer, token, tokenScanner, referencesFromDocument, callstack));
}
return new DictionaryToken(newContent);
}
case ArrayToken arrayToken:
{ {
var newArray = new List<IToken>(arrayToken.Length); newArray.Add(CopyToken(writer, token, tokenScanner, referencesFromDocument, callstack));
foreach (var token in arrayToken.Data)
{
newArray.Add(CopyToken(writer, token, tokenScanner, referencesFromDocument, callstack));
}
return new ArrayToken(newArray);
} }
case IndirectReferenceToken referenceToken:
return new ArrayToken(newArray);
}
case IndirectReferenceToken referenceToken:
{
if (referencesFromDocument.TryGetValue(referenceToken.Data, out var newReferenceToken))
{ {
if (referencesFromDocument.TryGetValue(referenceToken.Data, out var newReferenceToken)) return newReferenceToken;
{ }
return newReferenceToken;
}
if (callstack.ContainsKey(referenceToken.Data) && callstack[referenceToken.Data] == null) if (callstack.ContainsKey(referenceToken.Data) && callstack[referenceToken.Data] == null)
{ {
newReferenceToken = writer.ReserveObjectNumber(); newReferenceToken = writer.ReserveObjectNumber();
callstack[referenceToken.Data] = newReferenceToken; callstack[referenceToken.Data] = newReferenceToken;
referencesFromDocument.Add(referenceToken.Data, newReferenceToken);
return newReferenceToken;
}
callstack.Add(referenceToken.Data, null);
// we add the token to referencesFromDocument to prevent stackoverflow on references cycles
// newReferenceToken = context.ReserveNumberToken();
// callstack.Add(newReferenceToken.Data.ObjectNumber);
// referencesFromDocument.Add(referenceToken.Data, newReferenceToken);
//
var tokenObject = DirectObjectFinder.Get<IToken>(referenceToken.Data, tokenScanner);
Debug.Assert(!(tokenObject is IndirectReferenceToken));
var result = CopyToken(writer, tokenObject, tokenScanner, referencesFromDocument, callstack);
if (callstack[referenceToken.Data] != null)
{
return writer.WriteToken(result, callstack[referenceToken.Data]);
}
newReferenceToken = writer.WriteToken(result);
referencesFromDocument.Add(referenceToken.Data, newReferenceToken); referencesFromDocument.Add(referenceToken.Data, newReferenceToken);
return newReferenceToken; return newReferenceToken;
} }
case StreamToken streamToken:
{
var properties = CopyToken(writer, streamToken.StreamDictionary, tokenScanner, referencesFromDocument, callstack) as DictionaryToken;
Debug.Assert(properties != null);
var bytes = streamToken.Data; callstack.Add(referenceToken.Data, null);
return new StreamToken(properties, bytes);
}
case ObjectToken _: // we add the token to referencesFromDocument to prevent stackoverflow on references cycles
// newReferenceToken = context.ReserveNumberToken();
// callstack.Add(newReferenceToken.Data.ObjectNumber);
// referencesFromDocument.Add(referenceToken.Data, newReferenceToken);
//
var tokenObject = DirectObjectFinder.Get<IToken>(referenceToken.Data, tokenScanner);
Debug.Assert(!(tokenObject is IndirectReferenceToken));
var result = CopyToken(writer, tokenObject, tokenScanner, referencesFromDocument, callstack);
if (callstack[referenceToken.Data] != null)
{ {
// Since we don't write token directly to the stream. return writer.WriteToken(result, callstack[referenceToken.Data]);
// We can't know the offset. Therefore the token would be invalid
throw new NotSupportedException("Copying a Object token is not supported");
} }
}
return tokenToCopy; newReferenceToken = writer.WriteToken(result);
} referencesFromDocument.Add(referenceToken.Data, newReferenceToken);
return newReferenceToken;
internal static IEnumerable<(DictionaryToken, List<DictionaryToken>)> WalkTree(PageTreeNode node, List<DictionaryToken> parents=null)
{
if (parents == null)
{
parents = new List<DictionaryToken>();
}
if (node.IsPage)
{
yield return (node.NodeDictionary, parents);
yield break;
}
parents = parents.ToList();
parents.Add(node.NodeDictionary);
foreach (var child in node.Children)
{
foreach (var item in WalkTree(child, parents))
{
yield return item;
} }
case StreamToken streamToken:
{
var properties = CopyToken(writer, streamToken.StreamDictionary, tokenScanner, referencesFromDocument, callstack) as DictionaryToken;
Debug.Assert(properties != null);
var bytes = streamToken.Data;
return new StreamToken(properties, bytes);
}
case ObjectToken _:
{
// Since we don't write token directly to the stream.
// We can't know the offset. Therefore the token would be invalid
throw new NotSupportedException("Copying a Object token is not supported");
}
}
return tokenToCopy;
}
internal static IEnumerable<(DictionaryToken, List<DictionaryToken>)> WalkTree(PageTreeNode node, List<DictionaryToken> parents=null)
{
if (parents == null)
{
parents = new List<DictionaryToken>();
}
if (node.IsPage)
{
yield return (node.NodeDictionary, parents);
yield break;
}
parents = parents.ToList();
parents.Add(node.NodeDictionary);
foreach (var child in node.Children)
{
foreach (var item in WalkTree(child, parents))
{
yield return item;
} }
} }
}
} }
} }