From 7f42ad0af9873a20e7bce7e31d1067e72323a1b8 Mon Sep 17 00:00:00 2001 From: Plaisted Date: Sat, 6 Feb 2021 12:24:53 -0600 Subject: [PATCH] refactored previous work to fit pr #250 --- .../PublicApiScannerTests.cs | 1 + .../Tokens/TestPdfTokenScanner.cs | 5 + .../Writer/PdfDocumentBuilderTests.cs | 180 ++++++ .../Writer/PdfMergerTests.cs | 9 +- .../AdvancedPdfDocumentAccess.cs | 25 + .../Tokenization/Scanner/IPdfTokenScanner.cs | 8 + .../Tokenization/Scanner/PdfTokenScanner.cs | 15 + .../Writer/Colors/OutputIntentsFactory.cs | 4 +- .../Writer/Fonts/IWritingFont.cs | 2 +- .../Writer/Fonts/Standard14WritingFont.cs | 47 +- .../Writer/Fonts/TrueTypeWritingFont.cs | 16 +- .../Writer/IPdfStreamWriter.cs | 51 ++ .../Writer/PdfABaselineRuleBuilder.cs | 4 +- .../Writer/PdfDedupStreamWriter.cs | 200 ++++++ .../Writer/PdfDocumentBuilder.cs | 605 ++++++++++-------- src/UglyToad.PdfPig/Writer/PdfMerger.cs | 110 +--- src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs | 221 ++++--- src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs | 128 ++-- src/UglyToad.PdfPig/Writer/PdfWriterType.cs | 21 + src/UglyToad.PdfPig/Writer/TokenWriter.cs | 30 +- src/UglyToad.PdfPig/Writer/WriterUtil.cs | 135 ++++ 21 files changed, 1249 insertions(+), 568 deletions(-) create mode 100644 src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs create mode 100644 src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs create mode 100644 src/UglyToad.PdfPig/Writer/PdfWriterType.cs create mode 100644 src/UglyToad.PdfPig/Writer/WriterUtil.cs diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 8d50073e..b532f189 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -209,6 +209,7 @@ "UglyToad.PdfPig.Writer.PdfAStandard", "UglyToad.PdfPig.Writer.PdfDocumentBuilder", "UglyToad.PdfPig.Writer.PdfMerger", + "UglyToad.PdfPig.Writer.PdfWriterType", "UglyToad.PdfPig.Writer.PdfPageBuilder", 
"UglyToad.PdfPig.Writer.TokenWriter", "UglyToad.PdfPig.XObjects.XObjectImage" diff --git a/src/UglyToad.PdfPig.Tests/Tokens/TestPdfTokenScanner.cs b/src/UglyToad.PdfPig.Tests/Tokens/TestPdfTokenScanner.cs index aca81558..26022bec 100644 --- a/src/UglyToad.PdfPig.Tests/Tokens/TestPdfTokenScanner.cs +++ b/src/UglyToad.PdfPig.Tests/Tokens/TestPdfTokenScanner.cs @@ -45,6 +45,11 @@ namespace UglyToad.PdfPig.Tests.Tokens return Objects[reference]; } + public void ReplaceToken(IndirectReference reference, IToken token) + { + throw new NotImplementedException(); + } + public void Dispose() { } diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs index 5796e0a9..a37ad419 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs @@ -689,6 +689,186 @@ Assert.Equal("Lorem ipsum dolor sit amet, consectetur adipiscing elit. ", page2.Text); } + } + + [Fact] + public void CanAddHelloWorldToSimplePage() + { + var path = IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf"); + var doc = PdfDocument.Open(path); + var builder = new PdfDocumentBuilder(); + + var page = builder.AddPage(doc, 1); + + page.DrawLine(new PdfPoint(30, 520), new PdfPoint(360, 520)); + page.DrawLine(new PdfPoint(360, 520), new PdfPoint(360, 250)); + + page.SetStrokeColor(250, 132, 131); + page.DrawLine(new PdfPoint(25, 70), new PdfPoint(100, 70), 3); + page.ResetColor(); + page.DrawRectangle(new PdfPoint(30, 200), 250, 100, 0.5m); + page.DrawRectangle(new PdfPoint(30, 100), 250, 100, 0.5m); + + var file = TrueTypeTestHelper.GetFileBytes("Andada-Regular.ttf"); + + var font = builder.AddTrueTypeFont(file); + + var letters = page.AddText("Hello World!", 12, new PdfPoint(30, 50), font); + + Assert.NotEmpty(page.CurrentStream.Operations); + + var b = builder.Build(); + + WriteFile(nameof(CanWriteSinglePageHelloWorld), b); + + 
Assert.NotEmpty(b); + + using (var document = PdfDocument.Open(b)) + { + var page1 = document.GetPage(1); + + Assert.Equal("I am a simple pdf.Hello World!", page1.Text); + + var h = page1.Letters[18]; + + Assert.Equal("H", h.Value); + Assert.Equal("Andada-Regular", h.FontName); + + var comparer = new DoubleComparer(0.01); + var pointComparer = new PointComparer(comparer); + + for (int i = 0; i < letters.Count; i++) + { + var readerLetter = page1.Letters[i+18]; + var writerLetter = letters[i]; + + Assert.Equal(readerLetter.Value, writerLetter.Value); + Assert.Equal(readerLetter.Location, writerLetter.Location, pointComparer); + Assert.Equal(readerLetter.FontSize, writerLetter.FontSize, comparer); + Assert.Equal(readerLetter.GlyphRectangle.Width, writerLetter.GlyphRectangle.Width, comparer); + Assert.Equal(readerLetter.GlyphRectangle.Height, writerLetter.GlyphRectangle.Height, comparer); + Assert.Equal(readerLetter.GlyphRectangle.BottomLeft, writerLetter.GlyphRectangle.BottomLeft, pointComparer); + } + } + } + + [Fact] + public void CanMerge2SimpleDocumentsReversed_Builder() + { + var one = IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf"); + var two = IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf"); + + using var docOne = PdfDocument.Open(one); + using var docTwo = PdfDocument.Open(two); + var builder = new PdfDocumentBuilder(); + builder.AddPage(docOne, 1); + builder.AddPage(docTwo, 1); + var result = builder.Build(); + PdfMergerTests.CanMerge2SimpleDocumentsAssertions(new MemoryStream(result), "I am a simple pdf.", "Write something inInkscape", false); + } + + [Fact] + public void CanMerge2SimpleDocuments_Builder() + { + var one = IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf"); + var two = IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf"); + + using var docOne = PdfDocument.Open(one); + using var docTwo = PdfDocument.Open(two); + var builder = new 
PdfDocumentBuilder(); + builder.AddPage(docOne, 1); + builder.AddPage(docTwo, 1); + var result = builder.Build(); + PdfMergerTests.CanMerge2SimpleDocumentsAssertions(new MemoryStream(result), "Write something inInkscape", "I am a simple pdf.", false); + } + + [Fact] + public void CanDedupObjectsFromSameDoc_Builder() + { + var one = IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf"); + + using var doc = PdfDocument.Open(one); + + using var builder = new PdfDocumentBuilder(); + builder.AddPage(doc, 1); + builder.AddPage(doc, 1); + + var result = builder.Build(); + + using (var document = PdfDocument.Open(result, ParsingOptions.LenientParsingOff)) + { + Assert.Equal(2, document.NumberOfPages); + Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count <= 29, + "Expected object count to be lower than 30"); // 45 objects with duplicates, 29 with correct re-use + } + } + + [Fact] + public void CanDedupObjectsFromDifferentDoc_HashBuilder() + { + var one = IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf"); + + using var doc = PdfDocument.Open(one); + using var doc2 = PdfDocument.Open(one); + + using var builder = new PdfDocumentBuilder(new MemoryStream(), true, PdfWriterType.ObjectInMemoryDedup); + builder.AddPage(doc, 1); + builder.AddPage(doc2, 1); + + var result = builder.Build(); + + using (var document = PdfDocument.Open(result, ParsingOptions.LenientParsingOff)) + { + Assert.Equal(2, document.NumberOfPages); + Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count <= 29, + "Expected object count to be lower than 30"); // 45 objects with duplicates, 29 with correct re-use + } + } + + [InlineData("Single Page Simple - from google drive.pdf")] + [InlineData("Old Gutnish Internet Explorer.pdf")] + [InlineData("68-1990-01_A.pdf")] + [InlineData("Multiple Page - from Mortality Statistics.pdf")] + [Theory] + public void CopiedPagesResultInSameData(string name) + { + var docPath 
= IntegrationHelpers.GetDocumentPath(name); + + using var doc = PdfDocument.Open(docPath, ParsingOptions.LenientParsingOff); + var count1 = GetCounts(doc); + + using var builder = new PdfDocumentBuilder(); + for (var i = 1; i <= doc.NumberOfPages; i++) + { + builder.AddPage(doc, i); + } + var result = builder.Build(); + + using (var doc2 = PdfDocument.Open(result, ParsingOptions.LenientParsingOff)) + { + var count2 = GetCounts(doc2); + Assert.Equal(count1.Item1, count2.Item1); + Assert.Equal(count1.Item2, count2.Item2); + } + + (int, double) GetCounts(PdfDocument toCount) + { + int letters = 0; + double location = 0; + foreach (var page in toCount.GetPages()) + { + foreach (var letter in page.Letters) + { + unchecked { letters += 1; } + unchecked { + location += letter.Location.X; + location += letter.Location.Y; + } + } + } + + return (letters, location); + } } private static void WriteFile(string name, byte[] bytes, string extension = "pdf") diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs index 00180bc7..22af9a6c 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfMergerTests.cs @@ -47,13 +47,16 @@ CanMerge2SimpleDocumentsAssertions(new MemoryStream(result), "I am a simple pdf.", "Write something inInkscape"); } - private void CanMerge2SimpleDocumentsAssertions(Stream stream, string page1Text, string page2Text) + internal static void CanMerge2SimpleDocumentsAssertions(Stream stream, string page1Text, string page2Text, bool checkVersion=true) { stream.Position = 0; using (var document = PdfDocument.Open(stream, ParsingOptions.LenientParsingOff)) { Assert.Equal(2, document.NumberOfPages); - Assert.Equal(1.5m, document.Version); + if (checkVersion) + { + Assert.Equal(1.5m, document.Version); + } var page1 = document.GetPage(1); Assert.Equal(page1Text, page1.Text); @@ -105,7 +108,7 @@ [Fact] public void DedupsObjectsFromSameDoc() { - var one = 
IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf"); + var one = IntegrationHelpers.GetDocumentPath("Multiple Page - from Mortality Statistics.pdf"); var result = PdfMerger.Merge(new List { File.ReadAllBytes(one) }, new List> { new List { 1, 2} }); diff --git a/src/UglyToad.PdfPig/AdvancedPdfDocumentAccess.cs b/src/UglyToad.PdfPig/AdvancedPdfDocumentAccess.cs index 7c786144..7569e801 100644 --- a/src/UglyToad.PdfPig/AdvancedPdfDocumentAccess.cs +++ b/src/UglyToad.PdfPig/AdvancedPdfDocumentAccess.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; using Content; + using Core; using Filters; using Parser.Parts; using Tokenization.Scanner; @@ -82,6 +83,30 @@ return embeddedFiles.Count > 0; } + /// + /// Replaces the token in an internal cache that will be returned instead of + /// scanning the source PDF data for future requests. + /// + /// The object number for the object to replace. + /// Func that takes existing token as input and return new token. + public void ReplaceIndirectObject(IndirectReference reference, Func replacer) + { + var obj = pdfScanner.Get(reference); + var replacement = replacer(obj.Data); + pdfScanner.ReplaceToken(reference, replacement); + } + + /// + /// Replaces the token in an internal cache that will be returned instead of + /// scanning the source PDF data for future requests. + /// + /// The object number for the object to replace. + /// Replacement token to use. 
+ public void ReplaceIndirectObject(IndirectReference reference, IToken replacement) + { + pdfScanner.ReplaceToken(reference, replacement); + } + private void GuardDisposed() { if (isDisposed) diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/IPdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/IPdfTokenScanner.cs index 907de186..31171cc4 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/IPdfTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/IPdfTokenScanner.cs @@ -16,5 +16,13 @@ /// The object number for the object to tokenize. /// The tokenized object. ObjectToken Get(IndirectReference reference); + + /// + /// Adds the token to an internal cache that will be returned instead of + /// scanning the source PDF data. + /// + /// The object number for the object to replace. + /// The token to replace the existing data. + void ReplaceToken(IndirectReference reference, IToken token); } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index 4b2b786e..0a2a1030 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -29,6 +29,9 @@ private bool isDisposed; private bool isBruteForcing; + private readonly Dictionary overwrittenTokens = + new Dictionary(); + /// /// Stores tokens encountered between obj - endobj markers for each call. /// Cleared after each operation. 
@@ -670,6 +673,11 @@ throw new ObjectDisposedException(nameof(PdfTokenScanner)); } + if (overwrittenTokens.TryGetValue(reference, out var value)) + { + return value; + } + if (objectLocationProvider.TryGetCached(reference, out var objectToken)) { return objectToken; @@ -705,6 +713,13 @@ return BruteForceFileToFindReference(reference); } + public void ReplaceToken(IndirectReference reference, IToken token) + { + // Using 0 position as it isn't written to stream and this value doesn't + // seem to be used by any callers. In future may need to revisit this. + overwrittenTokens[reference] = new ObjectToken(0, reference, token); + } + private ObjectToken BruteForceFileToFindReference(IndirectReference reference) { try diff --git a/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs b/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs index 0a785af5..f2c48955 100644 --- a/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs +++ b/src/UglyToad.PdfPig/Writer/Colors/OutputIntentsFactory.cs @@ -9,7 +9,7 @@ private const string SrgbIec61966OutputCondition = "sRGB IEC61966-2.1"; private const string RegistryName = "http://www.color.org"; - public static ArrayToken GetOutputIntentsArray(Func objectWriter) + public static ArrayToken GetOutputIntentsArray(Func objectWriter) { var rgbColorCondition = new StringToken(SrgbIec61966OutputCondition); @@ -38,7 +38,7 @@ {NameToken.OutputConditionIdentifier, rgbColorCondition}, {NameToken.RegistryName, new StringToken(RegistryName)}, {NameToken.Info, rgbColorCondition}, - {NameToken.DestOutputProfile, new IndirectReferenceToken(written.Number)} + {NameToken.DestOutputProfile, written} }), }); } diff --git a/src/UglyToad.PdfPig/Writer/Fonts/IWritingFont.cs b/src/UglyToad.PdfPig/Writer/Fonts/IWritingFont.cs index 4d5141f8..ebc6ee52 100644 --- a/src/UglyToad.PdfPig/Writer/Fonts/IWritingFont.cs +++ b/src/UglyToad.PdfPig/Writer/Fonts/IWritingFont.cs @@ -16,7 +16,7 @@ TransformationMatrix GetFontMatrix(); - ObjectToken 
WriteFont(NameToken fontKeyName, Stream outputStream, BuilderContext context); + IndirectReferenceToken WriteFont(IPdfStreamWriter writer, NameToken fontKeyName); byte GetValueForCharacter(char character); } diff --git a/src/UglyToad.PdfPig/Writer/Fonts/Standard14WritingFont.cs b/src/UglyToad.PdfPig/Writer/Fonts/Standard14WritingFont.cs index e9b3f8f6..4d2e3107 100644 --- a/src/UglyToad.PdfPig/Writer/Fonts/Standard14WritingFont.cs +++ b/src/UglyToad.PdfPig/Writer/Fonts/Standard14WritingFont.cs @@ -55,7 +55,7 @@ return TransformationMatrix.FromValues(1/1000.0, 0, 0, 1/1000.0, 0, 0); } - public ObjectToken WriteFont(NameToken fontKeyName, Stream outputStream, BuilderContext context) + public IndirectReferenceToken WriteFont(IPdfStreamWriter writer, NameToken fontKeyName) { var dictionary = new Dictionary { @@ -68,7 +68,7 @@ var token = new DictionaryToken(dictionary); - var result = context.WriteObject(outputStream, token); + var result = writer.WriteToken(token); return result; } @@ -92,46 +92,5 @@ return result; } } - - internal class BuilderContext - { - private readonly List reservedNumbers = new List(); - - public int CurrentNumber { get; private set; } = 1; - - private readonly Dictionary objectOffsets = new Dictionary(); - public IReadOnlyDictionary ObjectOffsets => objectOffsets; - - public ObjectToken WriteObject(Stream stream, IToken token, int? 
reservedNumber = null) - { - int number; - if (reservedNumber.HasValue) - { - if (!reservedNumbers.Remove(reservedNumber.Value)) - { - throw new InvalidOperationException(); - } - - number = reservedNumber.Value; - } - else - { - number = CurrentNumber++; - } - - var reference = new IndirectReference(number, 0); - var obj = new ObjectToken(stream.Position, reference, token); - objectOffsets.Add(reference, obj.Position); - TokenWriter.WriteToken(obj, stream); - return obj; - } - - public int ReserveNumber() - { - var reserved = CurrentNumber; - reservedNumbers.Add(reserved); - CurrentNumber++; - return reserved; - } - } + } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs b/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs index 8bbff48b..f1475862 100644 --- a/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs +++ b/src/UglyToad.PdfPig/Writer/Fonts/TrueTypeWritingFont.cs @@ -47,14 +47,14 @@ return TransformationMatrix.FromValues(1.0 / unitsPerEm, 0, 0, 1.0 / unitsPerEm, 0, 0); } - public ObjectToken WriteFont(NameToken fontKeyName, Stream outputStream, BuilderContext context) + public IndirectReferenceToken WriteFont(IPdfStreamWriter writer, NameToken fontKeyName) { var newEncoding = new TrueTypeSubsetEncoding(characterMapping.Keys.ToList()); var subsetBytes = TrueTypeSubsetter.Subset(fontFileBytes.ToArray(), newEncoding); var embeddedFile = DataCompresser.CompressToStream(subsetBytes); - var fileRef = context.WriteObject(outputStream, embeddedFile); + var fileRef = writer.WriteToken(embeddedFile); var baseFont = NameToken.Create(font.TableRegister.NameTable.GetPostscriptName()); @@ -76,7 +76,7 @@ { NameToken.Descent, new NumericToken(Math.Round(hhead.Descent * scaling, 2)) }, { NameToken.CapHeight, new NumericToken(90) }, { NameToken.StemV, new NumericToken(90) }, - { NameToken.FontFile2, new IndirectReferenceToken(fileRef.Number) } + { NameToken.FontFile2, fileRef } }; var os2 = 
font.TableRegister.Os2Table; @@ -108,27 +108,27 @@ widths.Add(new NumericToken(width)); } - var descriptor = context.WriteObject(outputStream, new DictionaryToken(descriptorDictionary)); + var descriptor = writer.WriteToken(new DictionaryToken(descriptorDictionary)); var toUnicodeCMap = ToUnicodeCMapBuilder.ConvertToCMapStream(characterMapping); var toUnicodeStream = DataCompresser.CompressToStream(toUnicodeCMap); - var toUnicode = context.WriteObject(outputStream, toUnicodeStream); + var toUnicode = writer.WriteToken(toUnicodeStream); var dictionary = new Dictionary { { NameToken.Type, NameToken.Font }, { NameToken.Subtype, NameToken.TrueType }, { NameToken.BaseFont, baseFont }, - { NameToken.FontDescriptor, new IndirectReferenceToken(descriptor.Number) }, + { NameToken.FontDescriptor, descriptor }, { NameToken.FirstChar, new NumericToken(0) }, { NameToken.LastChar, new NumericToken(lastCharacter) }, { NameToken.Widths, new ArrayToken(widths) }, - {NameToken.ToUnicode, new IndirectReferenceToken(toUnicode.Number) } + {NameToken.ToUnicode, toUnicode } }; var token = new DictionaryToken(dictionary); - var result = context.WriteObject(outputStream, token); + var result = writer.WriteToken(token); return result; } diff --git a/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs new file mode 100644 index 00000000..4cb86d06 --- /dev/null +++ b/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs @@ -0,0 +1,51 @@ +namespace UglyToad.PdfPig.Writer +{ + using System; + using System.Collections.Generic; + using System.IO; + using System.Text; + using Tokens; + + internal interface IPdfStreamWriter : IDisposable + { + /// + /// + /// + Stream Stream { get; } + + /// + /// + /// + /// + /// + IndirectReferenceToken WriteToken(IToken token); + + /// + /// + /// + /// + /// + /// + IndirectReferenceToken WriteToken(IToken token, IndirectReferenceToken indirectReference); + + /// + /// + /// + /// + IndirectReferenceToken 
ReserveObjectNumber(); + + /// + /// + /// + /// + void InitializePdf(decimal version); + + /// + /// + /// + /// + /// + void CompletePdf(IndirectReferenceToken catalogReference, IndirectReferenceToken documentInformationReference=null); + + } +} diff --git a/src/UglyToad.PdfPig/Writer/PdfABaselineRuleBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfABaselineRuleBuilder.cs index f3c2dcfa..af810f85 100644 --- a/src/UglyToad.PdfPig/Writer/PdfABaselineRuleBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfABaselineRuleBuilder.cs @@ -8,14 +8,14 @@ namespace UglyToad.PdfPig.Writer { internal static class PdfABaselineRuleBuilder { - public static void Obey(Dictionary catalog, Func writerFunc, + public static void Obey(Dictionary catalog, Func writerFunc, PdfDocumentBuilder.DocumentInformationBuilder documentInformationBuilder, PdfAStandard archiveStandard) { catalog[NameToken.OutputIntents] = OutputIntentsFactory.GetOutputIntentsArray(writerFunc); var xmpStream = XmpWriter.GenerateXmpStream(documentInformationBuilder, 1.7m, archiveStandard); var xmpObj = writerFunc(xmpStream); - catalog[NameToken.Metadata] = new IndirectReferenceToken(xmpObj.Number); + catalog[NameToken.Metadata] = xmpObj; } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs new file mode 100644 index 00000000..4fb59f7d --- /dev/null +++ b/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs @@ -0,0 +1,200 @@ +namespace UglyToad.PdfPig.Writer +{ +using Core; + using Graphics.Operations; + using System; + using System.Collections.Generic; + using System.Globalization; + using System.IO; + using System.Text; + using Tokens; + + internal class PdfDedupStreamWriter : IPdfStreamWriter + { + + public Stream Stream { get; } + private int CurrentNumber { get; set; } = 1; + private bool DisposeStream { get; set; } + private const decimal DefaultVersion = 1.2m; + private bool Initialized { get; set; } + private readonly 
Dictionary offsets = new Dictionary(); + private readonly Dictionary hashes = new (new FNVByteComparison()); + + public PdfDedupStreamWriter(Stream stream, bool dispose) + { + Stream = stream; + DisposeStream = dispose; + } + + private MemoryStream ms = new MemoryStream(); + public IndirectReferenceToken WriteToken(IToken token) + { + if (!Initialized) + { + InitializePdf(DefaultVersion); + } + + ms.SetLength(0); + TokenWriter.WriteToken(token, ms); + var contents = ms.ToArray(); + if (hashes.TryGetValue(contents, out var value)) + { + return value; + } + + var ir = ReserveObjectNumber(); + hashes.Add(contents, ir); + + offsets.Add(ir.Data, Stream.Position); + TokenWriter.WriteObject(ir.Data.ObjectNumber, ir.Data.Generation, contents, Stream); + + return ir; + } + + public IndirectReferenceToken WriteToken(IToken token, IndirectReferenceToken indirectReference) + { + if (!Initialized) + { + InitializePdf(DefaultVersion); + } + + ms.SetLength(0); + TokenWriter.WriteToken(token, ms); + var contents = ms.ToArray(); + + hashes.Add(contents, indirectReference); + offsets.Add(indirectReference.Data, Stream.Position); + TokenWriter.WriteObject(indirectReference.Data.ObjectNumber, indirectReference.Data.Generation, contents, Stream); + return indirectReference; + } + + public IndirectReferenceToken ReserveObjectNumber() + { + return new IndirectReferenceToken(new IndirectReference(CurrentNumber++, 0)); + } + + public void InitializePdf(decimal version) + { + WriteString($"%PDF-{version.ToString("0.0", CultureInfo.InvariantCulture)}", Stream); + + Stream.WriteText("%"); + Stream.WriteByte(169); + Stream.WriteByte(205); + Stream.WriteByte(196); + Stream.WriteByte(210); + Stream.WriteNewLine(); + + Initialized = true; + } + + public void CompletePdf(IndirectReferenceToken catalogReference, IndirectReferenceToken documentInformationReference=null) + { + TokenWriter.WriteCrossReferenceTable(offsets, catalogReference.Data, Stream, documentInformationReference?.Data); + } + + 
private static void WriteString(string text, Stream stream) + { + var bytes = OtherEncodings.StringAsLatin1Bytes(text); + stream.Write(bytes, 0, bytes.Length); + stream.WriteNewLine(); + } + + public void Dispose() + { + if (DisposeStream) + { + Stream.Dispose(); + } + + hashes.Clear(); + } + + class FNVByteComparison : IEqualityComparer + { + public bool Equals(byte[] x, byte[] y) + { + if (x.Length != y.Length) + { + return false; + } + + for (var i = 0; i < x.Length; i++) + { + if (x[i] != y[i]) + { + return false; + } + } + + return true; + } + + public int GetHashCode(byte[] obj) + { + var hash = FnvHash.Create(); + foreach (var t in obj) + { + hash.Combine(t); + } + + return hash.HashCode; + } + } + + /// + /// A hash combiner that is implemented with the Fowler/Noll/Vo algorithm (FNV-1a). This is a mutable struct for performance reasons. + /// + struct FnvHash + { + /// + /// The starting point of the FNV hash. + /// + public const int Offset = unchecked((int)2166136261); + + /// + /// The prime number used to compute the FNV hash. + /// + private const int Prime = 16777619; + + /// + /// Gets the current result of the hash function. + /// + public int HashCode { get; private set; } + + /// + /// Creates a new FNV hash initialized to . + /// + public static FnvHash Create() + { + var result = new FnvHash(); + result.HashCode = Offset; + return result; + } + + /// + /// Adds the specified byte to the hash. + /// + /// The byte to hash. + public void Combine(byte data) + { + unchecked + { + HashCode ^= data; + HashCode *= Prime; + } + } + + /// + /// Adds the specified integer to this hash, in little-endian order. + /// + /// The integer to hash. 
+ public void Combine(int data) + { + Combine(unchecked((byte)data)); + Combine(unchecked((byte)(data >> 8))); + Combine(unchecked((byte)(data >> 16))); + Combine(unchecked((byte)(data >> 24))); + } + } + } +} diff --git a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs index 69783958..695cd220 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs @@ -14,6 +14,7 @@ namespace UglyToad.PdfPig.Writer using Parser.Parts; using PdfPig.Fonts.Standard14Fonts; using PdfPig.Fonts.TrueType.Parser; + using System.Runtime.CompilerServices; using Tokenization.Scanner; using Tokens; @@ -22,9 +23,9 @@ namespace UglyToad.PdfPig.Writer /// /// Provides methods to construct new PDF documents. /// - public class PdfDocumentBuilder + public class PdfDocumentBuilder : IDisposable { - private readonly BuilderContext context = new BuilderContext(); + private readonly IPdfStreamWriter context; private readonly Dictionary pages = new Dictionary(); private readonly Dictionary fonts = new Dictionary(); private readonly Dictionary images = new Dictionary(); @@ -63,6 +64,36 @@ namespace UglyToad.PdfPig.Writer /// internal IReadOnlyDictionary Images => images; + + /// + /// Creates a document builder keeping resources in memory. + /// + public PdfDocumentBuilder() + { + context = new PdfStreamWriter(new MemoryStream(), true); + context.InitializePdf(1.7m); + } + + /// + /// Creates a document builder using the supplied stream. + /// + /// Stream to write pdf to. + /// If stream should be disposed when builder is. 
+ /// Type of pdf stream writer to use + public PdfDocumentBuilder(Stream stream, bool disposeStream=false, PdfWriterType type=PdfWriterType.Default) + { + switch (type) + { + case PdfWriterType.ObjectInMemoryDedup: + context = new PdfDedupStreamWriter(stream, disposeStream); + break; + default: + context = new PdfStreamWriter(stream, disposeStream); + break; + } + context.InitializePdf(1.7m); + } + /// /// Determines whether the bytes of the TrueType font file provided can be used in a PDF document. /// @@ -158,15 +189,10 @@ namespace UglyToad.PdfPig.Writer return added; } - internal IndirectReference AddImage(DictionaryToken dictionary, byte[] bytes) + internal IndirectReferenceToken AddImage(DictionaryToken dictionary, byte[] bytes) { - var reserved = context.ReserveNumber(); - - var stored = new ImageStored(dictionary, bytes, reserved); - - images[stored.Id] = stored; - - return new IndirectReference(reserved, 0); + var streamToken = new StreamToken(dictionary, bytes); + return context.WriteToken(streamToken); } /// @@ -235,275 +261,310 @@ namespace UglyToad.PdfPig.Writer return AddPage(rectangle.Width, rectangle.Height); } + + internal IToken CopyToken(IPdfTokenScanner source, IToken token) + { + if (!existingCopies.TryGetValue(source, out var refs)) + { + refs = new Dictionary(); + existingCopies.Add(source, refs); + } + + return WriterUtil.CopyToken(context, token, source, refs); + } + + private readonly ConditionalWeakTable> existingCopies = + new ConditionalWeakTable>(); + /// + /// Add a copy of a page from an existing document, this page will be included in the output when Build is called. + /// + /// Source document. + /// Page to copy. + /// A builder for editing the page. 
+ public PdfPageBuilder AddPage(PdfDocument document, int pageNumber) + { + if (!existingCopies.TryGetValue(document.Structure.TokenScanner, out var refs)) + { + refs = new Dictionary(); + existingCopies.Add(document.Structure.TokenScanner, refs); + } + + int i = 1; + foreach (var (pageDict, parents) in WriterUtil.WalkTree(document.Structure.Catalog.PageTree)) + { + if (i == pageNumber) + { + // copy content streams + var streams = new List(); + if (pageDict.ContainsKey(NameToken.Contents)) + { + var token = pageDict.Data[NameToken.Contents]; + if (token is ArrayToken array) + { + foreach (var item in array.Data) + { + if (item is IndirectReferenceToken ir) + { + streams.Add(new PdfPageBuilder.CopiedContentStream( + WriterUtil.CopyToken(context, ir, document.Structure.TokenScanner, refs) as IndirectReferenceToken)); + } + + } + } + else if (token is IndirectReferenceToken ir) + { + streams.Add(new PdfPageBuilder.CopiedContentStream( + WriterUtil.CopyToken(context, ir, document.Structure.TokenScanner, refs) as IndirectReferenceToken)); + } + } + + // manually copy page dict / resources as we need to modify some + var copiedPageDict = new Dictionary(); + Dictionary resources = new Dictionary(); + + // just put all parent resources into new page + foreach (var dict in parents) + { + if (dict.TryGet(NameToken.Resources, out var token)) + { + CopyResourceDict(token, resources); + } + } + + + foreach (var kvp in pageDict.Data) + { + if (kvp.Key == NameToken.Contents || kvp.Key == NameToken.Parent || kvp.Key == NameToken.Type) + { + continue; + } + + if (kvp.Key == NameToken.Resources) + { + CopyResourceDict(kvp.Value, resources); + continue; + } + + copiedPageDict[NameToken.Create(kvp.Key)] = + WriterUtil.CopyToken(context, kvp.Value, document.Structure.TokenScanner, refs); + } + + var builder = new PdfPageBuilder(pages.Count + 1, this, streams, resources, copiedPageDict); + pages[builder.PageNumber] = builder; + return builder; + } + + i++; + } + + throw new 
KeyNotFoundException($"Page {pageNumber} was not found in the source document."); + + void CopyResourceDict(IToken token, Dictionary destinationDict) + { + DictionaryToken dict = GetRemoteDict(token); + if (dict == null) + { + return; + } + foreach (var item in dict.Data) + { + + if (!destinationDict.ContainsKey(NameToken.Create(item.Key))) + { + if (item.Value is IndirectReferenceToken ir) + { + destinationDict[NameToken.Create(item.Key)] = WriterUtil.CopyToken(context, document.Structure.TokenScanner.Get(ir.Data).Data, document.Structure.TokenScanner, refs); + } + else + { + destinationDict[NameToken.Create(item.Key)] = WriterUtil.CopyToken(context, item.Value, document.Structure.TokenScanner, refs); + } + + continue; + } + + + var subDict = GetRemoteDict(item.Value); + var destSubDict = destinationDict[NameToken.Create(item.Key)] as DictionaryToken; + if (destSubDict == null || subDict == null) + { + // not a dict.. just overwrite with more important one? should maybe check arrays? + destinationDict[NameToken.Create(item.Key)] = WriterUtil.CopyToken(context, item.Value, document.Structure.TokenScanner, refs); + continue; + } + foreach (var subItem in subDict.Data) + { + // last copied is most important + destinationDict[NameToken.Create(subItem.Key)] = WriterUtil.CopyToken(context, subItem.Value, + document.Structure.TokenScanner, refs); + } + } + } + + DictionaryToken GetRemoteDict(IToken token) + { + DictionaryToken dict = null; + if (token is IndirectReferenceToken ir) + { + dict = document.Structure.TokenScanner.Get(ir.Data).Data as DictionaryToken; + } + else if (token is DictionaryToken dt) + { + dict = dt; + } + return dict; + } + } + + private void CompleteDocument() + { + var fontsWritten = new Dictionary(); + + foreach (var font in fonts) + { + var fontObj = font.Value.FontProgram.WriteFont(context, font.Value.FontKey.Name); + fontsWritten.Add(font.Key, fontObj); + } + + var procSet = new List + { + NameToken.Create("PDF"), + NameToken.Text, + 
NameToken.ImageB, + NameToken.ImageC, + NameToken.ImageI + }; + + var resources = new Dictionary + { + { NameToken.ProcSet, new ArrayToken(procSet) } + }; + + if (fontsWritten.Count > 0) + { + var fontsDictionary = new DictionaryToken(fontsWritten.Select(x => + (fonts[x.Key].FontKey.Name, (IToken)x.Value)) + .ToDictionary(x => x.Item1, x => x.Item2)); + + var fontsDictionaryRef = context.WriteToken(fontsDictionary); + + resources.Add(NameToken.Font, fontsDictionaryRef); + } + + var parentIndirect = context.ReserveObjectNumber(); + + var pageReferences = new List(); + foreach (var page in pages) + { + var pageDictionary = page.Value.additionalPageProperties; + pageDictionary[NameToken.Type] = NameToken.Page; + pageDictionary[NameToken.Parent] = parentIndirect; + if (!pageDictionary.ContainsKey(NameToken.MediaBox)) + { + pageDictionary[NameToken.MediaBox] = RectangleToArray(page.Value.PageSize); + } + + pageDictionary[NameToken.Resources] = new DictionaryToken(page.Value.Resources); + + if (page.Value.contentStreams.Count == 1) + { + pageDictionary[NameToken.Contents] = page.Value.contentStreams[0].Write(context); + } + else + { + var streams = new List(); + foreach (var stream in page.Value.contentStreams) + { + streams.Add(stream.Write(context)); + } + + pageDictionary[NameToken.Contents] = new ArrayToken(streams); + } + + + var pageRef = context.WriteToken( new DictionaryToken(pageDictionary)); + + pageReferences.Add(pageRef); + } + + var pagesDictionaryData = new Dictionary + { + {NameToken.Type, NameToken.Pages}, + {NameToken.Kids, new ArrayToken(pageReferences)}, + {NameToken.Resources, new DictionaryToken(resources)}, + {NameToken.Count, new NumericToken(pageReferences.Count)} + }; + + var pagesDictionary = new DictionaryToken(pagesDictionaryData); + + var pagesRef = context.WriteToken(pagesDictionary, parentIndirect); + + var catalogDictionary = new Dictionary + { + {NameToken.Type, NameToken.Catalog}, + {NameToken.Pages, pagesRef} + }; + + if 
(ArchiveStandard != PdfAStandard.None) + { + Func writerFunc = x => context.WriteToken(x); + + PdfABaselineRuleBuilder.Obey(catalogDictionary, writerFunc, DocumentInformation, ArchiveStandard); + + switch (ArchiveStandard) + { + case PdfAStandard.A1A: + PdfA1ARuleBuilder.Obey(catalogDictionary); + break; + case PdfAStandard.A2B: + break; + case PdfAStandard.A2A: + PdfA1ARuleBuilder.Obey(catalogDictionary); + break; + } + } + + var catalog = new DictionaryToken(catalogDictionary); + + var catalogRef = context.WriteToken(catalog); + + var informationReference = default(IndirectReferenceToken); + if (IncludeDocumentInformation) + { + var informationDictionary = DocumentInformation.ToDictionary(); + if (informationDictionary.Count > 0) + { + var dictionary = new DictionaryToken(informationDictionary); + informationReference = context.WriteToken(dictionary); + } + } + + context.CompletePdf(catalogRef, informationReference); + } + /// /// Builds a PDF document from the current content of this builder and its pages. /// /// The bytes of the resulting PDF document. 
public byte[] Build() { - var fontsWritten = new Dictionary(); - using (var memory = new MemoryStream()) + CompleteDocument(); + + if (context.Stream is MemoryStream ms) { - // Header - WriteString("%PDF-1.7", memory); - - // Files with binary data should contain a 2nd comment line followed by 4 bytes with values > 127 - memory.WriteText("%"); - memory.WriteByte(169); - memory.WriteByte(205); - memory.WriteByte(196); - memory.WriteByte(210); - memory.WriteNewLine(); - - // Body - foreach (var font in fonts) - { - var fontObj = font.Value.FontProgram.WriteFont(font.Value.FontKey.Name, memory, context); - fontsWritten.Add(font.Key, fontObj); - } - - foreach (var image in images) - { - var streamToken = new StreamToken(image.Value.StreamDictionary, image.Value.StreamData); - - context.WriteObject(memory, streamToken, image.Value.ObjectNumber); - } - - foreach (var tokenSet in unwrittenTokens) - { - context.WriteObject(memory, tokenSet.Value, (int)tokenSet.Key.Data.ObjectNumber); - } - - var procSet = new List - { - NameToken.Create("PDF"), - NameToken.Text, - NameToken.ImageB, - NameToken.ImageC, - NameToken.ImageI - }; - - var resources = new Dictionary - { - { NameToken.ProcSet, new ArrayToken(procSet) } - }; - - if (fontsWritten.Count > 0) - { - var fontsDictionary = new DictionaryToken(fontsWritten.Select(x => (fonts[x.Key].FontKey.Name, (IToken)new IndirectReferenceToken(x.Value.Number))) - .ToDictionary(x => x.Item1, x => x.Item2)); - - resources.Add(NameToken.Font, fontsDictionary); - } - - var reserved = context.ReserveNumber(); - var parentIndirect = new IndirectReferenceToken(new IndirectReference(reserved, 0)); - - var pageReferences = new List(); - foreach (var page in pages) - { - var individualResources = new Dictionary(resources); - var pageDictionary = new Dictionary - { - {NameToken.Type, NameToken.Page}, - {NameToken.MediaBox, RectangleToArray(page.Value.PageSize)}, - {NameToken.Parent, parentIndirect} - }; - - if (page.Value.Resources.Count > 0) - { 
- foreach (var kvp in page.Value.Resources) - { - var value = kvp.Value; - if (individualResources.TryGetValue(kvp.Key, out var pageToken)) - { - if (pageToken is DictionaryToken leftDictionary && value is DictionaryToken rightDictionary) - { - var merged = leftDictionary.Data.ToDictionary(k => NameToken.Create(k.Key), v => v.Value); - foreach (var set in rightDictionary.Data) - { - merged[NameToken.Create(set.Key)] = set.Value; - } - - value = new DictionaryToken(merged); - - } - // Else override - } - - individualResources[kvp.Key] = value; - } - } - - pageDictionary[NameToken.Resources] = new DictionaryToken(individualResources); - - if (page.Value.ContentStreams.Count == 1) - { - var contentStream = WriteContentStream(page.Value.CurrentStream.Operations); - - var contentStreamObj = context.WriteObject(memory, contentStream); - - pageDictionary[NameToken.Contents] = new IndirectReferenceToken(contentStreamObj.Number); - } - else if (page.Value.ContentStreams.Count > 1) - { - var streamTokens = page.Value.ContentStreams.Select(contentStream => - { - var streamToken = WriteContentStream(contentStream.Operations); - - var contentStreamObj = context.WriteObject(memory, streamToken); - - return new IndirectReferenceToken(contentStreamObj.Number); - }).ToList(); - - pageDictionary[NameToken.Contents] = new ArrayToken(streamTokens); - } - - var pageRef = context.WriteObject(memory, new DictionaryToken(pageDictionary)); - - pageReferences.Add(new IndirectReferenceToken(pageRef.Number)); - } - - var pagesDictionaryData = new Dictionary - { - {NameToken.Type, NameToken.Pages}, - {NameToken.Kids, new ArrayToken(pageReferences)}, - {NameToken.Count, new NumericToken(pageReferences.Count)} - }; - - var pagesDictionary = new DictionaryToken(pagesDictionaryData); - - var pagesRef = context.WriteObject(memory, pagesDictionary, reserved); - - var catalogDictionary = new Dictionary - { - {NameToken.Type, NameToken.Catalog}, - {NameToken.Pages, new 
IndirectReferenceToken(pagesRef.Number)} - }; - - if (ArchiveStandard != PdfAStandard.None) - { - Func writerFunc = x => context.WriteObject(memory, x); - - PdfABaselineRuleBuilder.Obey(catalogDictionary, writerFunc, DocumentInformation, ArchiveStandard); - - switch (ArchiveStandard) - { - case PdfAStandard.A1A: - PdfA1ARuleBuilder.Obey(catalogDictionary); - break; - case PdfAStandard.A2B: - break; - case PdfAStandard.A2A: - PdfA1ARuleBuilder.Obey(catalogDictionary); - break; - } - } - - var catalog = new DictionaryToken(catalogDictionary); - - var catalogRef = context.WriteObject(memory, catalog); - - var informationReference = default(IndirectReference?); - if (IncludeDocumentInformation) - { - var informationDictionary = DocumentInformation.ToDictionary(); - if (informationDictionary.Count > 0) - { - var dictionary = new DictionaryToken(informationDictionary); - informationReference = context.WriteObject(memory, dictionary).Number; - } - } - - TokenWriter.WriteCrossReferenceTable(context.ObjectOffsets, catalogRef, memory, informationReference); - - return memory.ToArray(); - } - } - - /// - /// The purpose of this method is to resolve indirect reference. That mean copy the reference's content to the new document's stream - /// and replace the indirect reference with the correct/new one - /// - /// Token to inspect for reference - /// scanner get the content from the original document - /// A reference of the token that was copied. With all the reference updated - internal IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner) - { - // This token need to be deep copied, because they could contain reference. So we have to update them. 
- switch (tokenToCopy) - { - case DictionaryToken dictionaryToken: - { - var newContent = new Dictionary(); - foreach (var setPair in dictionaryToken.Data) - { - var name = setPair.Key; - var token = setPair.Value; - newContent.Add(NameToken.Create(name), CopyToken(token, tokenScanner)); - } - - return new DictionaryToken(newContent); - } - case ArrayToken arrayToken: - { - var newArray = new List(arrayToken.Length); - foreach (var token in arrayToken.Data) - { - newArray.Add(CopyToken(token, tokenScanner)); - } - - return new ArrayToken(newArray); - } - case IndirectReferenceToken referenceToken: - { - var tokenObject = DirectObjectFinder.Get(referenceToken.Data, tokenScanner); - - Debug.Assert(!(tokenObject is IndirectReferenceToken)); - - var newToken = CopyToken(tokenObject, tokenScanner); - - var reserved = context.ReserveNumber(); - var newReference = new IndirectReferenceToken(new IndirectReference(reserved, 0)); - - unwrittenTokens.Add(newReference, newToken); - - return newReference; - } - case StreamToken streamToken: - { - var properties = CopyToken(streamToken.StreamDictionary, tokenScanner) as DictionaryToken; - Debug.Assert(properties != null); - - var bytes = streamToken.Data; - return new StreamToken(properties, bytes); - } - - case ObjectToken _: - { - // Since we don't write token directly to the stream. - // We can't know the offset. 
Therefore the token would be invalid - throw new NotSupportedException("Copying a Object token is not supported"); - } + return ms.ToArray(); } - return tokenToCopy; - } - - private static StreamToken WriteContentStream(IReadOnlyList content) - { - using (var memoryStream = new MemoryStream()) + if (!context.Stream.CanSeek) { - foreach (var operation in content) - { - operation.Write(memoryStream); - } + throw new InvalidOperationException("PdfDocument.Build() called with non-seekable stream."); + } - var bytes = memoryStream.ToArray(); - - var stream = DataCompresser.CompressToStream(bytes); - - return stream; + using (var temp = new MemoryStream()) + { + context.Stream.Seek(0, SeekOrigin.Begin); + context.Stream.CopyTo(temp); + return temp.ToArray(); } } @@ -661,5 +722,13 @@ namespace UglyToad.PdfPig.Writer return result; } } + + /// + /// Disposes underlying stream if set to do so. + /// + public void Dispose() + { + context.Dispose(); + } } } diff --git a/src/UglyToad.PdfPig/Writer/PdfMerger.cs b/src/UglyToad.PdfPig/Writer/PdfMerger.cs index e1c98c7c..58d098a0 100644 --- a/src/UglyToad.PdfPig/Writer/PdfMerger.cs +++ b/src/UglyToad.PdfPig/Writer/PdfMerger.cs @@ -129,7 +129,21 @@ { const bool isLenientParsing = false; - var documentBuilder = new DocumentMerger(output); + var writer = new PdfStreamWriter(output, false); + var documentBuilder = new DocumentMerger(writer); + + var maxVersion = 1.2m; + var infos = new List<(CoreTokenScanner CoreScanner, HeaderVersion Version)>(); + foreach (var fileIndex in Enumerable.Range(0, files.Count)) + { + var inputBytes = files[fileIndex]; + var coreScanner = new CoreTokenScanner(inputBytes); + + var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log); + maxVersion = Math.Max(maxVersion, version.Version); + infos.Add((coreScanner, version)); + } + writer.InitializePdf(maxVersion); foreach (var fileIndex in Enumerable.Range(0, files.Count)) { @@ -140,9 +154,7 @@ } var inputBytes = files[fileIndex]; - var 
coreScanner = new CoreTokenScanner(inputBytes); - - var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log); + var (coreScanner, version) = infos[fileIndex]; var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log), new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider)); @@ -165,7 +177,7 @@ var documentCatalog = CatalogFactory.Create(crossReference.Trailer.Root, catalogDictionaryToken, pdfScanner, isLenientParsing); - documentBuilder.AppendDocument(documentCatalog, version.Version, pdfScanner, pages); + documentBuilder.AppendDocument(documentCatalog, pdfScanner, pages); } documentBuilder.Build(); @@ -201,24 +213,21 @@ private class DocumentMerger { - private const decimal DefaultVersion = 1.2m; - private const int ARTIFICIAL_NODE_LIMIT = 100; - private readonly PdfStreamWriter context; + private readonly IPdfStreamWriter context; private readonly List pagesTokenReferences = new List(); private readonly IndirectReferenceToken rootPagesReference; - private decimal currentVersion = DefaultVersion; private int pageCount = 0; - public DocumentMerger(Stream baseStream) + public DocumentMerger(IPdfStreamWriter writer) { - context = new PdfStreamWriter(baseStream, false); - rootPagesReference = context.ReserveNumberToken(); + context = writer; + rootPagesReference = context.ReserveObjectNumber(); } - public void AppendDocument(Catalog catalog, decimal version, IPdfTokenScanner tokenScanner, IReadOnlyList pages) + public void AppendDocument(Catalog catalog, IPdfTokenScanner tokenScanner, IReadOnlyList pages) { IEnumerable pageIndices; if (pages == null) @@ -240,11 +249,9 @@ pageIndices = pages; } - currentVersion = Math.Max(version, currentVersion); - var referencesFromDocument = new Dictionary(); - var currentNodeReference = context.ReserveNumberToken(); + var currentNodeReference = context.ReserveObjectNumber(); var pagesReferences = new List(); var resources = new Dictionary(); @@ -323,7 +330,8 @@ } var 
pagesDictionary = new DictionaryToken(newPagesNode); - pagesTokenReferences.Add(context.WriteToken(pagesDictionary, (int)currentNodeReference.Data.ObjectNumber)); + context.WriteToken(pagesDictionary, currentNodeReference); + pagesTokenReferences.Add(currentNodeReference); pageCount += pagesReferences.Count; }; @@ -335,7 +343,7 @@ { CreateTree(); - currentNodeReference = context.ReserveNumberToken(); + currentNodeReference = context.ReserveObjectNumber(); pagesReferences = new List(); resources = new Dictionary(); } @@ -366,7 +374,7 @@ { NameToken.Count, new NumericToken(pageCount) } }); - var pagesRef = context.WriteToken(pagesDictionary, (int)rootPagesReference.Data.ObjectNumber); + var pagesRef = context.WriteToken(pagesDictionary, rootPagesReference); var catalog = new DictionaryToken(new Dictionary { @@ -376,7 +384,7 @@ var catalogRef = context.WriteToken(catalog); - context.Flush(currentVersion, catalogRef); + context.CompletePdf(catalogRef); Close(); } @@ -423,67 +431,7 @@ /// A reference of the token that was copied. With all the reference updated private IToken CopyToken(IToken tokenToCopy, IPdfTokenScanner tokenScanner, IDictionary referencesFromDocument) { - // This token need to be deep copied, because they could contain reference. So we have to update them. 
- switch (tokenToCopy) - { - case DictionaryToken dictionaryToken: - { - var newContent = new Dictionary(); - foreach (var setPair in dictionaryToken.Data) - { - var name = setPair.Key; - var token = setPair.Value; - newContent.Add(NameToken.Create(name), CopyToken(token, tokenScanner, referencesFromDocument)); - } - - return new DictionaryToken(newContent); - } - case ArrayToken arrayToken: - { - var newArray = new List(arrayToken.Length); - foreach (var token in arrayToken.Data) - { - newArray.Add(CopyToken(token, tokenScanner, referencesFromDocument)); - } - - return new ArrayToken(newArray); - } - case IndirectReferenceToken referenceToken: - { - if (referencesFromDocument.TryGetValue(referenceToken.Data, out var newReferenceToken)) - { - return newReferenceToken; - } - - //we add the token to referencesFromDocument to prevent stackoverflow on references cycles - newReferenceToken = context.ReserveNumberToken(); - referencesFromDocument.Add(referenceToken.Data, newReferenceToken); - - var tokenObject = DirectObjectFinder.Get(referenceToken.Data, tokenScanner); - Debug.Assert(!(tokenObject is IndirectReferenceToken)); - - var newToken = CopyToken(tokenObject, tokenScanner, referencesFromDocument); - context.WriteToken(newReferenceToken, newToken); - return newReferenceToken; - } - case StreamToken streamToken: - { - var properties = CopyToken(streamToken.StreamDictionary, tokenScanner, referencesFromDocument) as DictionaryToken; - Debug.Assert(properties != null); - - var bytes = streamToken.Data; - return new StreamToken(properties, bytes); - } - - case ObjectToken _: - { - // Since we don't write token directly to the stream. - // We can't know the offset. 
Therefore the token would be invalid - throw new NotSupportedException("Copying a Object token is not supported"); - } - } - - return tokenToCopy; + return WriterUtil.CopyToken(context, tokenToCopy, tokenScanner, referencesFromDocument); } } } diff --git a/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs index 96ce9077..b50ac3cc 100644 --- a/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfPageBuilder.cs @@ -29,7 +29,9 @@ public class PdfPageBuilder { private readonly PdfDocumentBuilder documentBuilder; - private readonly List contentStreams; + private IPageContentStream currentStream; + internal readonly List contentStreams; + internal readonly Dictionary additionalPageProperties = new Dictionary(); private readonly Dictionary resourcesDictionary = new Dictionary(); //a sequence number of ShowText operation to determine whether letters belong to same operation or not (letters that belong to different operations have less changes to belong to same word) @@ -52,34 +54,44 @@ /// /// Access to the underlying data structures for advanced use cases. /// - public ContentStream CurrentStream { get; private set; } + public IContentStream CurrentStream => currentStream; /// /// Access to /// - public IReadOnlyList ContentStreams { get; } + public IReadOnlyList ContentStreams => contentStreams; internal PdfPageBuilder(int number, PdfDocumentBuilder documentBuilder) { this.documentBuilder = documentBuilder ?? throw new ArgumentNullException(nameof(documentBuilder)); PageNumber = number; - CurrentStream = new ContentStream(); - ContentStreams = contentStreams = new List() - { - CurrentStream - }; + currentStream = new DefaultContentStream(); + contentStreams = new List() {currentStream}; } + internal PdfPageBuilder(int number, PdfDocumentBuilder documentBuilder, IEnumerable copied, + Dictionary existingResources, Dictionary pageDict) + { + this.documentBuilder = documentBuilder ?? 
throw new ArgumentNullException(nameof(documentBuilder)); + PageNumber = number; + contentStreams = new List(); + contentStreams.AddRange(copied); + currentStream = new DefaultContentStream(); + contentStreams.Add(currentStream); + additionalPageProperties =pageDict ?? new Dictionary(); + resourcesDictionary = existingResources; + } + /// /// Allow to append a new content stream before the current one and select it /// public void NewContentStreamBefore() { - var index = Math.Max(contentStreams.IndexOf(CurrentStream) - 1, 0); + var index = Math.Max(contentStreams.IndexOf(currentStream) - 1, 0); - CurrentStream = new ContentStream(); - contentStreams.Insert(index, CurrentStream); + currentStream = new DefaultContentStream(); + contentStreams.Insert(index, currentStream); } /// @@ -87,10 +99,10 @@ /// public void NewContentStreamAfter() { - var index = Math.Min(contentStreams.IndexOf(CurrentStream) + 1, contentStreams.Count); + var index = Math.Min(contentStreams.IndexOf(currentStream) + 1, contentStreams.Count); - CurrentStream = new ContentStream(); - contentStreams.Insert(index, CurrentStream); + currentStream = new DefaultContentStream(); + contentStreams.Insert(index, currentStream); } /// @@ -99,12 +111,12 @@ /// index of the content stream to be selected public void SelectContentStream(int index) { - if (index < 0 || index >= ContentStreams.Count) + if (index < 0 || index >= contentStreams.Count) { throw new IndexOutOfRangeException(nameof(index)); } - CurrentStream = ContentStreams[index]; + currentStream = contentStreams[index]; } /// @@ -117,16 +129,16 @@ { if (lineWidth != 1) { - CurrentStream.Add(new SetLineWidth(lineWidth)); + currentStream.Add(new SetLineWidth(lineWidth)); } - CurrentStream.Add(new BeginNewSubpath((decimal)from.X, (decimal)from.Y)); - CurrentStream.Add(new AppendStraightLineSegment((decimal)to.X, (decimal)to.Y)); - CurrentStream.Add(StrokePath.Value); + currentStream.Add(new BeginNewSubpath((decimal)from.X, (decimal)from.Y)); + 
currentStream.Add(new AppendStraightLineSegment((decimal)to.X, (decimal)to.Y));
+            currentStream.Add(StrokePath.Value);
 
             if (lineWidth != 1)
             {
-                CurrentStream.Add(new SetLineWidth(1));
+                currentStream.Add(new SetLineWidth(1));
             }
         }
 
@@ -142,23 +154,23 @@
         {
             if (lineWidth != 1)
             {
-                CurrentStream.Add(new SetLineWidth(lineWidth));
+                currentStream.Add(new SetLineWidth(lineWidth));
             }
 
-            CurrentStream.Add(new AppendRectangle((decimal)position.X, (decimal)position.Y, width, height));
+            currentStream.Add(new AppendRectangle((decimal)position.X, (decimal)position.Y, width, height));
 
             if (fill)
             {
-                CurrentStream.Add(FillPathEvenOddRuleAndStroke.Value);
+                currentStream.Add(FillPathEvenOddRuleAndStroke.Value);
             }
             else
             {
-                CurrentStream.Add(StrokePath.Value);
+                currentStream.Add(StrokePath.Value);
             }
 
             if (lineWidth != 1)
             {
-                CurrentStream.Add(new SetLineWidth(lineWidth));
+                currentStream.Add(new SetLineWidth(1)); // restore the default width, as DrawLine does, so it does not leak into later operations
             }
         }
 
@@ -170,8 +182,8 @@
         /// <param name="b">Blue - 0 to 255</param>
         public void SetStrokeColor(byte r, byte g, byte b)
         {
-            CurrentStream.Add(Push.Value);
-            CurrentStream.Add(new SetStrokeColorDeviceRgb(RgbToDecimal(r), RgbToDecimal(g), RgbToDecimal(b)));
+            currentStream.Add(Push.Value);
+            currentStream.Add(new SetStrokeColorDeviceRgb(RgbToDecimal(r), RgbToDecimal(g), RgbToDecimal(b)));
         }
 
         /// <summary>
@@ -182,8 +194,8 @@
         /// <param name="b">Blue - 0 to 1</param>
         internal void SetStrokeColorExact(decimal r, decimal g, decimal b)
         {
-            CurrentStream.Add(Push.Value);
-            CurrentStream.Add(new SetStrokeColorDeviceRgb(CheckRgbDecimal(r, nameof(r)),
+            currentStream.Add(Push.Value);
+            currentStream.Add(new SetStrokeColorDeviceRgb(CheckRgbDecimal(r, nameof(r)),
                 CheckRgbDecimal(g, nameof(g)), CheckRgbDecimal(b, nameof(b))));
         }
 
@@ -195,8 +207,8 @@
         /// <param name="b">Blue - 0 to 255</param>
         public void SetTextAndFillColor(byte r, byte g, byte b)
         {
-            CurrentStream.Add(Push.Value);
-            CurrentStream.Add(new SetNonStrokeColorDeviceRgb(RgbToDecimal(r), RgbToDecimal(g), RgbToDecimal(b)));
+            currentStream.Add(Push.Value);
+            currentStream.Add(new
SetNonStrokeColorDeviceRgb(RgbToDecimal(r), RgbToDecimal(g), RgbToDecimal(b))); } /// @@ -204,7 +216,7 @@ /// public void ResetColor() { - CurrentStream.Add(Pop.Value); + currentStream.Add(Pop.Value); } /// @@ -294,15 +306,15 @@ var letters = DrawLetters(text, fontProgram, fm, fontSize, textMatrix); - CurrentStream.Add(BeginText.Value); - CurrentStream.Add(new SetFontAndSize(font.Name, fontSize)); - CurrentStream.Add(new MoveToNextLineWithOffset((decimal)position.X, (decimal)position.Y)); + currentStream.Add(BeginText.Value); + currentStream.Add(new SetFontAndSize(font.Name, fontSize)); + currentStream.Add(new MoveToNextLineWithOffset((decimal)position.X, (decimal)position.Y)); var bytesPerShow = new List(); foreach (var letter in text) { if (char.IsWhiteSpace(letter)) { - CurrentStream.Add(new ShowText(bytesPerShow.ToArray())); + currentStream.Add(new ShowText(bytesPerShow.ToArray())); bytesPerShow.Clear(); } @@ -312,10 +324,10 @@ if (bytesPerShow.Count > 0) { - CurrentStream.Add(new ShowText(bytesPerShow.ToArray())); + currentStream.Add(new ShowText(bytesPerShow.ToArray())); } - CurrentStream.Add(EndText.Value); + currentStream.Add(EndText.Value); return letters; } @@ -370,20 +382,20 @@ var key = NameToken.Create($"I{imageKey++}"); - resourcesDictionary[NameToken.Xobject] = xobjects.With(key, new IndirectReferenceToken(reference)); + resourcesDictionary[NameToken.Xobject] = xobjects.With(key, reference); - CurrentStream.Add(Push.Value); + currentStream.Add(Push.Value); // This needs to be the placement rectangle. 
- CurrentStream.Add(new ModifyCurrentTransformationMatrix(new [] + currentStream.Add(new ModifyCurrentTransformationMatrix(new [] { (decimal)placementRectangle.Width, 0, 0, (decimal)placementRectangle.Height, (decimal)placementRectangle.BottomLeft.X, (decimal)placementRectangle.BottomLeft.Y })); - CurrentStream.Add(new InvokeNamedXObject(key)); - CurrentStream.Add(Pop.Value); + currentStream.Add(new InvokeNamedXObject(key)); + currentStream.Add(Pop.Value); - return new AddedImage(reference, info.Width, info.Height); + return new AddedImage(reference.Data, info.Width, info.Height); } /// @@ -411,16 +423,16 @@ resourcesDictionary[NameToken.Xobject] = xobjects.With(key, new IndirectReferenceToken(image.Reference)); - CurrentStream.Add(Push.Value); + currentStream.Add(Push.Value); // This needs to be the placement rectangle. - CurrentStream.Add(new ModifyCurrentTransformationMatrix(new[] + currentStream.Add(new ModifyCurrentTransformationMatrix(new[] { (decimal)placementRectangle.Width, 0, 0, (decimal)placementRectangle.Height, (decimal)placementRectangle.BottomLeft.X, (decimal)placementRectangle.BottomLeft.Y })); - CurrentStream.Add(new InvokeNamedXObject(key)); - CurrentStream.Add(Pop.Value); + currentStream.Add(new InvokeNamedXObject(key)); + currentStream.Add(Pop.Value); } /// @@ -487,20 +499,20 @@ var key = NameToken.Create($"I{imageKey++}"); - resourcesDictionary[NameToken.Xobject] = xobjects.With(key, new IndirectReferenceToken(reference)); + resourcesDictionary[NameToken.Xobject] = xobjects.With(key, reference); - CurrentStream.Add(Push.Value); + currentStream.Add(Push.Value); // This needs to be the placement rectangle. 
- CurrentStream.Add(new ModifyCurrentTransformationMatrix(new[] + currentStream.Add(new ModifyCurrentTransformationMatrix(new[] { (decimal)placementRectangle.Width, 0, 0, (decimal)placementRectangle.Height, (decimal)placementRectangle.BottomLeft.X, (decimal)placementRectangle.BottomLeft.Y })); - CurrentStream.Add(new InvokeNamedXObject(key)); - CurrentStream.Add(Pop.Value); + currentStream.Add(new InvokeNamedXObject(key)); + currentStream.Add(Pop.Value); - return new AddedImage(reference, png.Width, png.Height); + return new AddedImage(reference.Data, png.Width, png.Height); } /// @@ -509,13 +521,12 @@ /// Page to be copied public void CopyFrom(Page srcPage) { - ContentStream destinationStream = null; - if (CurrentStream.Operations.Count > 0) + if (currentStream.Operations.Count > 0) { NewContentStreamAfter(); } - destinationStream = CurrentStream; + var destinationStream = currentStream; if (!srcPage.Dictionary.TryGet(NameToken.Resources, srcPage.pdfScanner, out DictionaryToken srcResourceDictionary)) { @@ -547,7 +558,7 @@ { // It means that this type of resources doesn't currently exist in the page, so we can copy it // with no problem - resourcesDictionary[nameToken] = documentBuilder.CopyToken(set.Value, srcPage.pdfScanner); + resourcesDictionary[nameToken] = documentBuilder.CopyToken(srcPage.pdfScanner, set.Value); continue; } @@ -604,7 +615,7 @@ throw new PdfDocumentFormatException($"Expected a IndirectReferenceToken for the font, got a {fontSet.Value.GetType().Name}"); } - pageFontsDictionary.Add(NameToken.Create(fontName), documentBuilder.CopyToken(fontReferenceToken, srcPage.pdfScanner)); + pageFontsDictionary.Add(NameToken.Create(fontName), documentBuilder.CopyToken(srcPage.pdfScanner, fontReferenceToken)); } resourcesDictionary[NameToken.Font] = new DictionaryToken(pageFontsDictionary); @@ -657,7 +668,7 @@ throw new PdfDocumentFormatException($"Expected a IndirectReferenceToken for the XObject, got a {xobjectSet.Value.GetType().Name}"); } - 
pageXobjectsDictionary.Add(NameToken.Create(xobjectName), documentBuilder.CopyToken(fontReferenceToken, srcPage.pdfScanner)); + pageXobjectsDictionary.Add(NameToken.Create(xobjectName), documentBuilder.CopyToken(srcPage.pdfScanner, fontReferenceToken)); } resourcesDictionary[NameToken.Xobject] = new DictionaryToken(pageXobjectsDictionary); @@ -741,30 +752,90 @@ return value; } - /// - /// Provides access to the raw page data structures for advanced editing use cases. - /// - public class ContentStream + internal interface IPageContentStream : IContentStream { - /// - /// The operations making up the page content stream. - /// - public List Operations { get; } + bool ReadOnly { get; } + void Add(IGraphicsStateOperation operation); + IndirectReferenceToken Write(IPdfStreamWriter writer); + } - /// - /// Create a new . + /// + /// Provides access to the raw page data structures for advanced editing use cases. + /// + public interface IContentStream + { + /// + /// The operations making up the page content stream. 
/// - internal ContentStream() + List Operations { get; } + } + + internal class DefaultContentStream : IPageContentStream + { + private readonly List operations; + + public DefaultContentStream() : this(new List()) { - Operations = new List(); + + } + public DefaultContentStream(List operations) + { + this.operations = operations; } - internal void Add(IGraphicsStateOperation newOperation) + public bool ReadOnly => false; + + public void Add(IGraphicsStateOperation operation) { - Operations.Add(newOperation); + operations.Add(operation); + } + + public List Operations => operations; + + public IndirectReferenceToken Write(IPdfStreamWriter writer) + { + using (var memoryStream = new MemoryStream()) + { + foreach (var operation in operations) + { + operation.Write(memoryStream); + } + + var bytes = memoryStream.ToArray(); + + var stream = DataCompresser.CompressToStream(bytes); + + return writer.WriteToken(stream); + } + } } + internal class CopiedContentStream : IPageContentStream + { + private readonly IndirectReferenceToken token; + public CopiedContentStream(IndirectReferenceToken indirectReferenceToken) + { + token = indirectReferenceToken; + } + public bool ReadOnly => true; + + + public IndirectReferenceToken Write(IPdfStreamWriter writer) + { + return token; + } + + public void Add(IGraphicsStateOperation operation) + { + throw new NotSupportedException("Writing to a copied content stream is not supported."); + } + + public List Operations => + throw new NotSupportedException("Reading raw operations is not supported from a copied content stream."); + } + + /// /// A key representing an image available to use for the current document builder. /// Create it by adding an image to a page using . 
diff --git a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs index 81f1c77a..abcb3f17 100644 --- a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs @@ -11,17 +11,16 @@ /// /// This class would lazily flush all token. Allowing us to make changes to references without need to rewrite the whole stream /// - internal class PdfStreamWriter : IDisposable + internal class PdfStreamWriter : IPdfStreamWriter { - private readonly List reservedNumbers = new List(); + private Dictionary offsets = new Dictionary(); + private const decimal DefaultVersion = 1.2m; + private bool Initialized { get; set; } + private int CurrentNumber { get; set; } = 1; - private readonly Dictionary tokenReferences = new Dictionary(); + public Stream Stream { get; set; } - public int CurrentNumber { get; private set; } = 1; - - public Stream Stream { get; private set; } - - public bool DisposeStream { get; set; } + private bool DisposeStream { get; set; } public PdfStreamWriter(Stream baseStream, bool disposeStream = true) { @@ -34,13 +33,8 @@ DisposeStream = disposeStream; } - public void Flush(decimal version, IndirectReferenceToken catalogReference) + public void InitializePdf(decimal version) { - if (catalogReference == null) - { - throw new ArgumentNullException(nameof(catalogReference)); - } - WriteString($"%PDF-{version.ToString("0.0", CultureInfo.InvariantCulture)}", Stream); Stream.WriteText("%"); @@ -49,67 +43,6 @@ Stream.WriteByte(196); Stream.WriteByte(210); Stream.WriteNewLine(); - - var offsets = new Dictionary(); - ObjectToken catalogToken = null; - foreach (var pair in tokenReferences) - { - var referenceToken = pair.Key; - var token = pair.Value; - var offset = Stream.Position; - var obj = new ObjectToken(offset, referenceToken.Data, token); - - TokenWriter.WriteToken(obj, Stream); - - offsets.Add(referenceToken.Data, offset); - - if (catalogToken == null && referenceToken == 
catalogReference) - { - catalogToken = obj; - } - } - - if (catalogToken == null) - { - throw new Exception("Catalog object wasn't found"); - } - - // TODO: Support document information - TokenWriter.WriteCrossReferenceTable(offsets, catalogToken, Stream, null); - } - - public IndirectReferenceToken WriteToken(IToken token, int? reservedNumber = null) - { - if (!reservedNumber.HasValue) - { - return AddToken(token, CurrentNumber++); - } - - if (!reservedNumbers.Remove(reservedNumber.Value)) - { - throw new InvalidOperationException("You can't reuse a reserved number"); - } - - // When we end up writing this token, all of his child would already have been added and checked for duplicate - return AddToken(token, reservedNumber.Value); - } - - public void WriteToken(IndirectReferenceToken referenceToken, IToken token) - { - tokenReferences.Add(referenceToken, token); - } - - public int ReserveNumber() - { - var reserved = CurrentNumber; - reservedNumbers.Add(reserved); - CurrentNumber++; - return reserved; - } - - public IndirectReferenceToken ReserveNumberToken() - { - return new IndirectReferenceToken(new IndirectReference(ReserveNumber(), 0)); } public void Dispose() @@ -124,13 +57,6 @@ Stream = null; } - private IndirectReferenceToken AddToken(IToken token, int reservedNumber) - { - var reference = new IndirectReference(reservedNumber, 0); - var referenceToken = new IndirectReferenceToken(reference); - tokenReferences.Add(referenceToken, token); - return referenceToken; - } private static void WriteString(string text, Stream stream) { @@ -138,5 +64,43 @@ stream.Write(bytes, 0, bytes.Length); stream.WriteNewLine(); } + + public IndirectReferenceToken WriteToken(IToken token) + { + if (!Initialized) + { + InitializePdf(DefaultVersion); + } + + var ir = ReserveObjectNumber(); + offsets.Add(ir.Data, Stream.Position); + var obj = new ObjectToken(Stream.Position, ir.Data, token); + TokenWriter.WriteToken(obj, Stream); + return ir; + } + + public IndirectReferenceToken 
WriteToken(IToken token, IndirectReferenceToken indirectReference) + { + if (!Initialized) + { + InitializePdf(DefaultVersion); + } + + offsets.Add(indirectReference.Data, Stream.Position); + var obj = new ObjectToken(Stream.Position, indirectReference.Data, token); + TokenWriter.WriteToken(obj, Stream); + return indirectReference; + } + + public IndirectReferenceToken ReserveObjectNumber() + { + return new IndirectReferenceToken(new IndirectReference(CurrentNumber++, 0)); + } + + + public void CompletePdf(IndirectReferenceToken catalogReference, IndirectReferenceToken documentInformationReference=null) + { + TokenWriter.WriteCrossReferenceTable(offsets, catalogReference.Data, Stream, documentInformationReference?.Data); + } } } diff --git a/src/UglyToad.PdfPig/Writer/PdfWriterType.cs b/src/UglyToad.PdfPig/Writer/PdfWriterType.cs new file mode 100644 index 00000000..c0730803 --- /dev/null +++ b/src/UglyToad.PdfPig/Writer/PdfWriterType.cs @@ -0,0 +1,21 @@ +namespace UglyToad.PdfPig.Writer +{ + using System; + using System.Collections.Generic; + using System.Text; + + /// + /// Type of pdf writer to use. + /// + public enum PdfWriterType + { + /// + /// Default output writer + /// + Default, + /// + /// De-duplicates objects while writing but requires keeping in memory reference. + /// + ObjectInMemoryDedup + } +} diff --git a/src/UglyToad.PdfPig/Writer/TokenWriter.cs b/src/UglyToad.PdfPig/Writer/TokenWriter.cs index 88c53482..ee8ae638 100644 --- a/src/UglyToad.PdfPig/Writer/TokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/TokenWriter.cs @@ -131,7 +131,7 @@ /// The output stream to write to. /// The object reference for the document information dictionary if present. internal static void WriteCrossReferenceTable(IReadOnlyDictionary objectOffsets, - ObjectToken catalogToken, + IndirectReference catalogToken, Stream outputStream, IndirectReference? documentInformationReference) { @@ -201,7 +201,7 @@ { // 1 for the free entry. 
{NameToken.Size, new NumericToken(objectOffsets.Count + 1)}, - {NameToken.Root, new IndirectReferenceToken(catalogToken.Number)}, + {NameToken.Root, new IndirectReferenceToken(catalogToken)}, {NameToken.Id, identifier} }; @@ -225,6 +225,32 @@ outputStream.Write(Eof, 0, Eof.Length); } + /// + /// Writes pre-serialized token as an object token to the output stream. + /// + /// Object number of the indirect object. + /// Generation of the indirect object. + /// Pre-serialized object contents. + /// The stream to write the token to. + internal static void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) + { + WriteLong(objectNumber, outputStream); + WriteWhitespace(outputStream); + + WriteInt(generation, outputStream); + WriteWhitespace(outputStream); + + outputStream.Write(ObjStart, 0, ObjStart.Length); + WriteLineBreak(outputStream); + + outputStream.Write(data, 0, data.Length); + + WriteLineBreak(outputStream); + outputStream.Write(ObjEnd, 0, ObjEnd.Length); + + WriteLineBreak(outputStream); + } + private static void WriteHex(HexToken hex, Stream stream) { stream.WriteByte(HexStart); diff --git a/src/UglyToad.PdfPig/Writer/WriterUtil.cs b/src/UglyToad.PdfPig/Writer/WriterUtil.cs new file mode 100644 index 00000000..4c585541 --- /dev/null +++ b/src/UglyToad.PdfPig/Writer/WriterUtil.cs @@ -0,0 +1,135 @@ +namespace UglyToad.PdfPig.Writer +{ + using Content; + using Core; + using Parser.Parts; + using System; + using System.Collections.Generic; + using System.Diagnostics; + using System.Linq; + using System.Text; + using Tokenization.Scanner; + using Tokens; + internal class WriterUtil + { + /// + /// The purpose of this method is to resolve indirect reference. 
That mean copy the reference's content to the new document's stream + /// and replace the indirect reference with the correct/new one + /// + /// PDF stream writer + /// Token to inspect for reference + /// scanner get the content from the original document + /// Map of previously copied tokens for original document. + /// Call stack of indirect references + /// A reference of the token that was copied. With all the reference updated + public static IToken CopyToken(IPdfStreamWriter writer, IToken tokenToCopy, IPdfTokenScanner tokenScanner, + IDictionary referencesFromDocument, Dictionary callstack=null) + { + callstack ??= new Dictionary(); + // This token need to be deep copied, because they could contain reference. So we have to update them. + switch (tokenToCopy) + { + case DictionaryToken dictionaryToken: + { + var newContent = new Dictionary(); + foreach (var setPair in dictionaryToken.Data) + { + var name = setPair.Key; + var token = setPair.Value; + newContent.Add(NameToken.Create(name), CopyToken(writer, token, tokenScanner, referencesFromDocument, callstack)); + } + + return new DictionaryToken(newContent); + } + case ArrayToken arrayToken: + { + var newArray = new List(arrayToken.Length); + foreach (var token in arrayToken.Data) + { + newArray.Add(CopyToken(writer, token, tokenScanner, referencesFromDocument, callstack)); + } + + return new ArrayToken(newArray); + } + case IndirectReferenceToken referenceToken: + { + if (referencesFromDocument.TryGetValue(referenceToken.Data, out var newReferenceToken)) + { + return newReferenceToken; + } + + if (callstack.ContainsKey(referenceToken.Data) && callstack[referenceToken.Data] == null) + { + newReferenceToken = writer.ReserveObjectNumber(); + callstack[referenceToken.Data] = newReferenceToken; + referencesFromDocument.Add(referenceToken.Data, newReferenceToken); + return newReferenceToken; + } + + callstack.Add(referenceToken.Data, null); + + // we add the token to referencesFromDocument to prevent 
stackoverflow on references cycles + // newReferenceToken = context.ReserveNumberToken(); + // callstack.Add(newReferenceToken.Data.ObjectNumber); + // referencesFromDocument.Add(referenceToken.Data, newReferenceToken); + // + var tokenObject = DirectObjectFinder.Get(referenceToken.Data, tokenScanner); + Debug.Assert(!(tokenObject is IndirectReferenceToken)); + var result = CopyToken(writer, tokenObject, tokenScanner, referencesFromDocument, callstack); + + if (callstack[referenceToken.Data] != null) + { + return writer.WriteToken(result, callstack[referenceToken.Data]); + } + + newReferenceToken = writer.WriteToken(result); + referencesFromDocument.Add(referenceToken.Data, newReferenceToken); + return newReferenceToken; + } + case StreamToken streamToken: + { + var properties = CopyToken(writer, streamToken.StreamDictionary, tokenScanner, referencesFromDocument, callstack) as DictionaryToken; + Debug.Assert(properties != null); + + var bytes = streamToken.Data; + return new StreamToken(properties, bytes); + } + + case ObjectToken _: + { + // Since we don't write token directly to the stream. + // We can't know the offset. Therefore the token would be invalid + throw new NotSupportedException("Copying a Object token is not supported"); + } + } + + return tokenToCopy; + } + + internal static IEnumerable<(DictionaryToken, List)> WalkTree(PageTreeNode node, List parents=null) + { + if (parents == null) + { + parents = new List(); + } + + if (node.IsPage) + { + yield return (node.NodeDictionary, parents); + yield break; + } + + parents = parents.ToList(); + parents.Add(node.NodeDictionary); + foreach (var child in node.Children) + { + foreach (var item in WalkTree(child, parents)) + { + yield return item; + } + } + } + } + +} +