From 8f0326a8180346539f100188712ac85c94dfb1b2 Mon Sep 17 00:00:00 2001
From: InusualZ
Date: Fri, 2 Oct 2020 21:00:27 -0400
Subject: [PATCH] Introduce a new API for intercepting tokens that are being
 copied

This API would allow us to track any type of token while it is being
copied to a stream, so when a similar token comes again, we can decide
whether to use the already written token or the new one.

This API would allow us to split the code for each specific kind of
duplicate we are trying to avoid, while not penalizing performance.

Another plus is that, since every "deduplicator" lives behind its own
class, if a class causes a performance regression that the user doesn't
want, the user can decide not to add it and the resulting PDF would
still be valid.
---
 .../Writer/Copier/IObjectCopier.cs            |  24 +++
 .../Writer/Copier/MultiCopier.cs              |  82 ++++++++
 .../Writer/Copier/ObjectCopier.cs             | 193 ++++++++++++++++++
 .../Writer/Copier/TokenHelper.cs              |  52 +++++
 4 files changed, 351 insertions(+)
 create mode 100644 src/UglyToad.PdfPig/Writer/Copier/IObjectCopier.cs
 create mode 100644 src/UglyToad.PdfPig/Writer/Copier/MultiCopier.cs
 create mode 100644 src/UglyToad.PdfPig/Writer/Copier/ObjectCopier.cs
 create mode 100644 src/UglyToad.PdfPig/Writer/Copier/TokenHelper.cs

diff --git a/src/UglyToad.PdfPig/Writer/Copier/IObjectCopier.cs b/src/UglyToad.PdfPig/Writer/Copier/IObjectCopier.cs
new file mode 100644
index 00000000..ce24aaad
--- /dev/null
+++ b/src/UglyToad.PdfPig/Writer/Copier/IObjectCopier.cs
@@ -0,0 +1,24 @@
+namespace UglyToad.PdfPig.Writer.Copier
+{
+    using System;
+    using Tokens;
+
+    /// <summary>
+    /// Copies tokens to a destination stream.
+    /// </summary>
+    public interface IObjectCopier
+    {
+        /// <summary>
+        /// Copy the token to the destination stream.
+        /// </summary>
+        /// <param name="sourceToken">Token to copy.</param>
+        /// <param name="tokenScanner">Function used to resolve the indirect references found in the token being copied.</param>
+        /// <returns>The copied token. A copier plugged into a composite copier may return null to signal that it does not handle the token.</returns>
+        IToken CopyObject(IToken sourceToken, Func<IndirectReferenceToken, IToken> tokenScanner);
+
+        /// <summary>
+        /// Clear the references of the previously copied object.
+        /// </summary>
+        void ClearReference();
+    }
+}
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig/Writer/Copier/MultiCopier.cs b/src/UglyToad.PdfPig/Writer/Copier/MultiCopier.cs
new file mode 100644
index 00000000..3e626253
--- /dev/null
+++ b/src/UglyToad.PdfPig/Writer/Copier/MultiCopier.cs
@@ -0,0 +1,82 @@
+namespace UglyToad.PdfPig.Writer.Copier
+{
+    using System;
+    using System.Collections.Generic;
+    using Tokens;
+    using Writer;
+
+    /// <summary>
+    /// An <see cref="ObjectCopier"/> that first offers every token to a list of child copiers
+    /// and only falls back to the plain deep copy when none of them returns a token.
+    /// </summary>
+    internal class MultiCopier : ObjectCopier
+    {
+        private readonly List<IObjectCopier> copiers = new List<IObjectCopier>();
+
+        /// <summary>
+        /// Create a copier that writes to <paramref name="destinationStream"/>.
+        /// </summary>
+        /// <param name="destinationStream">Stream writer that receives the copied objects.</param>
+        public MultiCopier(PdfStreamWriter destinationStream) : base(destinationStream)
+        {
+        }
+
+        /// <summary>
+        /// Register a child copier. Copiers are consulted in the order they were added.
+        /// </summary>
+        /// <param name="copier">Copier to register.</param>
+        /// <exception cref="ArgumentNullException">If <paramref name="copier"/> is null.</exception>
+        public void AddCopier(IObjectCopier copier)
+        {
+            // Reject null up front; otherwise the failure would only surface on the next copy.
+            copiers.Add(copier ?? throw new ArgumentNullException(nameof(copier)));
+        }
+
+        /// <summary>
+        /// Unregister a child copier.
+        /// </summary>
+        /// <param name="copier">Copier to remove.</param>
+        /// <returns><see langword="true"/> if the copier was registered and has been removed.</returns>
+        public bool RemoveCopier(IObjectCopier copier)
+        {
+            return copiers.Remove(copier);
+        }
+
+        /// <summary>
+        /// Get the registered child copiers, in the order they are consulted.
+        /// </summary>
+        /// <returns>The live list of registered copiers, exposed as a read-only list.</returns>
+        public IReadOnlyList<IObjectCopier> GetCopiers()
+        {
+            return copiers;
+        }
+
+        /// <inheritdoc/>
+        public override IToken CopyObject(IToken sourceToken, Func<IndirectReferenceToken, IToken> tokenScanner)
+        {
+            // Offer the token to each child copier first; the first non-null result wins.
+            foreach (var copier in copiers)
+            {
+                var newToken = copier.CopyObject(sourceToken, tokenScanner);
+                if (newToken != null)
+                {
+                    return newToken;
+                }
+            }
+
+            // No child copier handled the token, so fall back to the plain deep copy.
+            return base.CopyObject(sourceToken, tokenScanner);
+        }
+
+        /// <inheritdoc/>
+        public override void ClearReference()
+        {
+            foreach (var copier in copiers)
+            {
+                copier.ClearReference();
+            }
+
+            base.ClearReference();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig/Writer/Copier/ObjectCopier.cs b/src/UglyToad.PdfPig/Writer/Copier/ObjectCopier.cs
new file mode 100644
index 00000000..bb82e094
--- /dev/null
+++ b/src/UglyToad.PdfPig/Writer/Copier/ObjectCopier.cs
@@ -0,0 +1,193 @@
+namespace UglyToad.PdfPig.Writer.Copier
+{
+    using System;
+    using System.Collections.Generic;
+    using PdfPig;
+    using Tokenization.Scanner;
+    using Tokens;
+    using Writer;
+
+    /// <summary>
+    /// Deep copies tokens into a <see cref="PdfStreamWriter"/>, rewriting every indirect reference
+    /// so that it points at the copy written to the destination stream.
+    /// </summary>
+    internal class ObjectCopier : IObjectCopier
+    {
+        private readonly PdfStreamWriter pdfStream;
+
+        // Maps a reference from the source to the reference of its copy in the destination.
+        private readonly Dictionary<IndirectReferenceToken, IndirectReferenceToken> newReferenceMap;
+
+        /// <summary>
+        /// Create a copier that writes to <paramref name="destinationStream"/>.
+        /// </summary>
+        /// <param name="destinationStream">Stream writer that receives the copied objects.</param>
+        /// <exception cref="ArgumentNullException">If <paramref name="destinationStream"/> is null.</exception>
+        public ObjectCopier(PdfStreamWriter destinationStream)
+        {
+            pdfStream = destinationStream ?? throw new ArgumentNullException(nameof(destinationStream));
+            newReferenceMap = new Dictionary<IndirectReferenceToken, IndirectReferenceToken>();
+        }
+
+        /// <summary>
+        /// Copy a token, resolving its indirect references through <paramref name="sourceDocument"/>.
+        /// </summary>
+        /// <param name="sourceToken">Token to copy.</param>
+        /// <param name="sourceDocument">Document used to look up referenced objects.</param>
+        /// <returns>The copied token.</returns>
+        public IToken CopyObject(IToken sourceToken, PdfDocument sourceDocument)
+        {
+            IToken tokenScanner(IndirectReferenceToken referenceToken)
+            {
+                var objToken = sourceDocument.Structure.GetObject(referenceToken.Data);
+                return objToken.Data;
+            }
+
+            return CopyObject(sourceToken, tokenScanner);
+        }
+
+        /// <summary>
+        /// Copy a token, resolving its indirect references through <paramref name="tokenScanner"/>.
+        /// </summary>
+        /// <param name="sourceToken">Token to copy.</param>
+        /// <param name="tokenScanner">Scanner used to look up referenced objects.</param>
+        /// <returns>The copied token.</returns>
+        public IToken CopyObject(IToken sourceToken, IPdfTokenScanner tokenScanner)
+        {
+            IToken tokenGetter(IndirectReferenceToken referenceToken)
+            {
+                var objToken = tokenScanner.Get(referenceToken.Data);
+                return objToken.Data;
+            }
+
+            return CopyObject(sourceToken, tokenGetter);
+        }
+
+        /// <inheritdoc/>
+        public virtual IToken CopyObject(IToken sourceToken, Func<IndirectReferenceToken, IToken> tokenScanner)
+        {
+            // Tokens that can contain references are deep copied so the references can be updated.
+            switch (sourceToken)
+            {
+                case DictionaryToken dictionaryToken:
+                {
+                    var newContent = new Dictionary<NameToken, IToken>();
+                    foreach (var setPair in dictionaryToken.Data)
+                    {
+                        var name = setPair.Key;
+                        var token = setPair.Value;
+
+                        newContent.Add(NameToken.Create(name), CopyObject(token, tokenScanner));
+                    }
+
+                    return new DictionaryToken(newContent);
+                }
+                case ArrayToken arrayToken:
+                {
+                    var newArray = new List<IToken>(arrayToken.Length);
+                    foreach (var token in arrayToken.Data)
+                    {
+                        newArray.Add(CopyObject(token, tokenScanner));
+                    }
+
+                    return new ArrayToken(newArray);
+                }
+                case IndirectReferenceToken referenceToken:
+                {
+                    if (TryGetNewReference(referenceToken, out var newReferenceToken))
+                    {
+                        return newReferenceToken;
+                    }
+
+                    // NOTE(review): the mapping is registered only after the referenced object has been
+                    // copied, so an object graph that refers back to this reference looks like it would
+                    // recurse without terminating - confirm that callers only pass acyclic graphs.
+                    var referencedToken = tokenScanner(referenceToken);
+                    var newReferencedToken = CopyObject(referencedToken, tokenScanner);
+
+                    var newToken = WriteToken(newReferencedToken);
+                    SetNewReference(referenceToken, newToken);
+                    return newToken;
+                }
+                case StreamToken streamToken:
+                {
+                    // CopyObject is virtual, so an override may hand back something other than a dictionary.
+                    if (!(CopyObject(streamToken.StreamDictionary, tokenScanner) is DictionaryToken properties))
+                    {
+                        throw new InvalidOperationException("The copy of a stream dictionary must be a dictionary token");
+                    }
+
+                    return new StreamToken(properties, streamToken.Data);
+                }
+                case ObjectToken _:
+                {
+                    // Tokens are not written directly to the stream here, so the offset of the object is
+                    // unknown and the copied token would be invalid.
+                    throw new NotSupportedException("Copying a Object token is not supported");
+                }
+            }
+
+            return sourceToken;
+        }
+
+        /// <summary>
+        /// Look up the destination reference previously registered for a source reference.
+        /// </summary>
+        /// <param name="sourceReferenceToken">Reference as found in the source.</param>
+        /// <param name="newReferenceToken">The destination reference, when one was registered.</param>
+        /// <returns><see langword="true"/> if a destination reference was found.</returns>
+        public virtual bool TryGetNewReference(IndirectReferenceToken sourceReferenceToken, out IndirectReferenceToken newReferenceToken)
+        {
+            // The map is scanned with Equals rather than probed by hash.
+            // NOTE(review): a hash lookup would only be equivalent if the token's GetHashCode agrees
+            // with this Equals overload - confirm before replacing the scan with TryGetValue.
+            foreach (var referenceSet in newReferenceMap)
+            {
+                if (referenceSet.Key.Equals(sourceReferenceToken))
+                {
+                    newReferenceToken = referenceSet.Value;
+                    return true;
+                }
+            }
+
+            newReferenceToken = default;
+            return false;
+        }
+
+        /// <inheritdoc/>
+        public virtual void ClearReference()
+        {
+            newReferenceMap.Clear();
+        }
+
+        /// <summary>
+        /// Register the destination reference that replaces a source reference.
+        /// </summary>
+        /// <param name="oldToken">Reference as found in the source.</param>
+        /// <param name="newToken">Reference of the copy in the destination.</param>
+        public void SetNewReference(IndirectReferenceToken oldToken, IndirectReferenceToken newToken)
+        {
+            newReferenceMap.Add(oldToken, newToken);
+        }
+
+        /// <summary>
+        /// Reserve a number from the destination stream writer.
+        /// </summary>
+        /// <returns>The number returned by the stream writer.</returns>
+        public int ReserveTokenNumber()
+        {
+            return pdfStream.ReserveNumber();
+        }
+
+        /// <summary>
+        /// Write a token to the destination stream.
+        /// </summary>
+        /// <param name="token">Token to write.</param>
+        /// <param name="reservedNumber">Optional number, forwarded unchanged to the stream writer.</param>
+        /// <returns>The reference returned by the stream writer for the written token.</returns>
+        public IndirectReferenceToken WriteToken(IToken token, int? reservedNumber = null)
+        {
+            return pdfStream.WriteToken(token, reservedNumber);
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Writer/Copier/TokenHelper.cs b/src/UglyToad.PdfPig/Writer/Copier/TokenHelper.cs
new file mode 100644
index 00000000..71a7820a
--- /dev/null
+++ b/src/UglyToad.PdfPig/Writer/Copier/TokenHelper.cs
@@ -0,0 +1,52 @@
+namespace UglyToad.PdfPig.Writer.Copier
+{
+    using System;
+    using Tokens;
+
+    /// <summary>
+    /// Helpers for unwrapping tokens.
+    /// </summary>
+    internal static class TokenHelper
+    {
+        // Upper bound on the number of unwrapping steps, as a guard against reference loops.
+        private const int MaxIterations = 10;
+
+        /// <summary>
+        /// Unwrap <paramref name="token"/> until a token of type <typeparamref name="T"/> is found,
+        /// following indirect references through <paramref name="lookupFunc"/> and object tokens
+        /// through their data.
+        /// </summary>
+        /// <typeparam name="T">Type of token to extract.</typeparam>
+        /// <param name="token">Token to unwrap.</param>
+        /// <param name="lookupFunc">Function used to resolve indirect references.</param>
+        /// <returns>The first token of type <typeparamref name="T"/> reached.</returns>
+        /// <exception cref="InvalidOperationException">If no such token is reached.</exception>
+        public static T GetTokenAs<T>(IToken token, Func<IndirectReferenceToken, IToken> lookupFunc) where T : IToken
+        {
+            var current = token;
+            for (var iteration = 0; iteration < MaxIterations; iteration++)
+            {
+                if (current is T result)
+                {
+                    return result;
+                }
+
+                if (current is IndirectReferenceToken tokenReference)
+                {
+                    current = lookupFunc(tokenReference);
+                }
+                else if (current is ObjectToken tokenObject)
+                {
+                    current = tokenObject.Data;
+                }
+                else
+                {
+                    // Nothing left to unwrap (including null): stop instead of spinning until the limit.
+                    break;
+                }
+            }
+
+            throw new InvalidOperationException($"Unable to extract a {typeof(T)} token from {token}");
+        }
+    }
+}
\ No newline at end of file