diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/outline.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/outline.pdf new file mode 100644 index 00000000..33ac0de7 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/outline.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs index a9ca12da..fbadf78e 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs @@ -101,6 +101,33 @@ } } + [Fact] + public void CanFastAddPageAndStripLinkAnnots() + { + var first = IntegrationHelpers.GetDocumentPath("outline.pdf"); + var contents = File.ReadAllBytes(first); + + var annotCount = 0; + byte[] results = null; + using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) + using (var output = new PdfDocumentBuilder()) + { + output.AddPage(existing, 1); + results = output.Build(); + var pg = existing.GetPage(1); + var annots = pg.ExperimentalAccess.GetAnnotations().ToList(); + annotCount = annots.Count; + Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link); + } + + using (var rewritten = PdfDocument.Open(results, ParsingOptions.LenientParsingOff)) + { + var pg = rewritten.GetPage(1); + var annots = pg.ExperimentalAccess.GetAnnotations().ToList(); + Assert.Equal(annotCount - 1, annots.Count); + Assert.DoesNotContain(annots, x => x.Type == Annotations.AnnotationType.Link); + } + } [Fact] public void CanReadSingleBlankPage() diff --git a/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs index 13281f7a..035260bc 100644 --- a/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs @@ -8,6 +8,13 @@ internal interface IPdfStreamWriter : IDisposable { + /// + /// Sets if the stream writer should attempt to deduplicate objects. 
+ /// May not have any effect if the writer implementation does not + /// support deduplication. + /// + bool AttemptDeduplication { get; set; } + /// /// The underlying stream used by the writer. /// diff --git a/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs index 38c77cbc..0d1cb1af 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs @@ -23,13 +23,16 @@ ms.SetLength(0); TokenWriter.WriteToken(token, ms); var contents = ms.ToArray(); - if (hashes.TryGetValue(contents, out var value)) + if (AttemptDeduplication && hashes.TryGetValue(contents, out var value)) { return value; } var ir = ReserveObjectNumber(); - hashes.Add(contents, ir); + if (AttemptDeduplication) + { + hashes.Add(contents, ir); + } offsets.Add(ir.Data, Stream.Position); TokenWriter.WriteObject(ir.Data.ObjectNumber, ir.Data.Generation, contents, Stream); diff --git a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs index 0620dfa9..a50b3870 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs @@ -330,6 +330,10 @@ namespace UglyToad.PdfPig.Writer var streams = new List(); if (pageInfo.Page.TryGet(NameToken.Contents, out IToken contentsToken)) { + // Adobe Acrobat errors if a content stream is referenced by multiple pages, so turn + // deduplication off (if it is on) to avoid issues + var prev = context.AttemptDeduplication; + context.AttemptDeduplication = false; if (contentsToken is ArrayToken array) { foreach (var item in array.Data) @@ -347,6 +351,7 @@ namespace UglyToad.PdfPig.Writer streams.Add(new PdfPageBuilder.CopiedContentStream( WriterUtil.CopyToken(context, ir, document.Structure.TokenScanner, refs) as IndirectReferenceToken)); } + context.AttemptDeduplication = prev; } // manually copy page dict / resources as we need to modify some @@ -379,15 +384,55 @@ namespace UglyToad.PdfPig.Writer { if (kvp.Key == 
NameToken.Contents || kvp.Key == NameToken.Parent || kvp.Key == NameToken.Type) { + // don't copy these as they'll be handled during page tree writing continue; } if (kvp.Key == NameToken.Resources) { + // merge parent resources into child CopyResourceDict(kvp.Value, resources); continue; } + if (kvp.Key == NameToken.Annots) + { + var val = kvp.Value; + if (kvp.Value is IndirectReferenceToken ir) + { + val = document.Structure.TokenScanner.Get(ir.Data).Data; + } + + if (!(val is ArrayToken arr)) + { + // should be array... ignore and remove bad dict + continue; + } + + // -> ignore links to resolve issues with referencing non-existent pages + // at some point should add support for copying the links if the + // pages are copied as well but for now just fix corruption + var toAdd = new List(); + foreach (var annot in arr.Data) + { + DictionaryToken tk = GetRemoteDict(annot); + if (tk == null) + { + // malformed + continue; + } + if (tk.TryGet(NameToken.Subtype, out var st) && st is NameToken nm && nm == NameToken.Link) + { + // link -> ignore + continue; + } + toAdd.Add(WriterUtil.CopyToken(context, tk, document.Structure.TokenScanner, refs)); + } + // copy rest + copiedPageDict[NameToken.Annots] = new ArrayToken(toAdd); + continue; + } + copiedPageDict[NameToken.Create(kvp.Key)] = WriterUtil.CopyToken(context, kvp.Value, document.Structure.TokenScanner, refs); } @@ -508,10 +553,14 @@ namespace UglyToad.PdfPig.Writer pageDictionary[NameToken.MediaBox] = RectangleToArray(page.Value.PageSize); } + // Adobe Acrobat errors if a content stream is referenced by multiple pages, so turn + // deduplication off (if it is on) to avoid issues + var prev = context.AttemptDeduplication; + context.AttemptDeduplication = false; + var toWrite = page.Value.contentStreams.Where(x => x.HasContent).ToList(); if (toWrite.Count == 0) { - // write empty pageDictionary[NameToken.Contents] = new PdfPageBuilder.DefaultContentStream().Write(context); } else if (toWrite.Count == 1) @@ -529,7 +578,7 @@ namespace 
UglyToad.PdfPig.Writer } pageDictionary[NameToken.Contents] = new ArrayToken(streams); } - + context.AttemptDeduplication = prev;; leafChildren[leafNum].Add(context.WriteToken(new DictionaryToken(pageDictionary))); diff --git a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs index f444e3dc..65d7ec23 100644 --- a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs @@ -30,8 +30,9 @@ DisposeStream = disposeStream; } - public Stream Stream { get; protected set; } - + public Stream Stream { get; protected set; } + public bool AttemptDeduplication { get; set; } = true; + public virtual IndirectReferenceToken WriteToken(IToken token) { if (!Initialized) diff --git a/src/UglyToad.PdfPig/Writer/TokenWriter.cs b/src/UglyToad.PdfPig/Writer/TokenWriter.cs index ee8ae638..2dacb681 100644 --- a/src/UglyToad.PdfPig/Writer/TokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/TokenWriter.cs @@ -80,7 +80,11 @@ /// The token to write to the stream. /// The stream to write the token to. public static void WriteToken(IToken token, Stream outputStream) - { + { + if (token == null) + { + throw new ArgumentNullException(nameof(token)); + } switch (token) { case ArrayToken array: @@ -119,7 +123,9 @@ break; case StringToken stringToken: WriteString(stringToken, outputStream); - break; + break; + default: + throw new PdfDocumentFormatException($"Attempted to write token type of {token.GetType()} but was not known."); } } @@ -294,8 +300,16 @@ foreach (var pair in dictionary.Data) { - WriteName(pair.Key, outputStream); - WriteToken(pair.Value, outputStream); + WriteName(pair.Key, outputStream); + + // handle scenario where PdfPig has a null value under some circumstances + if (pair.Value == null) + { + WriteToken(NullToken.Instance, outputStream); + } else + { + WriteToken(pair.Value, outputStream); + } } outputStream.Write(DictionaryEnd, 0, DictionaryEnd.Length);