From 928c2ef2fc73911667ab7e812d68aeebe8b16c2f Mon Sep 17 00:00:00 2001 From: mvantzet Date: Wed, 21 Jun 2023 14:54:11 +0200 Subject: [PATCH] Prevent reading (and modifying!) non-content streams, reducing chances of PDF corruption. Added skipping operation MoveToNextLineShowText as well. Also duplicate the original stream's dictionary which solves disappearing elements (due to missing SubType / BBox for example). --- .../Writer/PdfDocumentBuilderTests.cs | 1 + .../Writer/IPdfStreamWriter.cs | 7 +++- src/UglyToad.PdfPig/Writer/ITokenWriter.cs | 5 +++ .../Writer/NoTextTokenWriter.cs | 42 +++++++++++++++++-- .../Writer/PdfDocumentBuilder.cs | 3 +- src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs | 6 +++ src/UglyToad.PdfPig/Writer/PdfTextRemover.cs | 7 +++- src/UglyToad.PdfPig/Writer/TokenWriter.cs | 6 +++ 8 files changed, 68 insertions(+), 9 deletions(-) diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs index fc58ae2d..ea31263b 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs @@ -1279,6 +1279,7 @@ public int Tokens { get; private set; } public int Objects { get; private set; } public bool WroteCrossReferenceTable { get; private set; } + public bool WritingPageContents { get; set; } public void WriteToken(IToken token, Stream outputStream) { diff --git a/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs index 035260bc..cac1b1de 100644 --- a/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs @@ -1,9 +1,7 @@ namespace UglyToad.PdfPig.Writer { using System; - using System.Collections.Generic; using System.IO; - using System.Text; using Tokens; internal interface IPdfStreamWriter : IDisposable @@ -20,6 +18,11 @@ /// Stream Stream { get; } + /// + /// Hints that the stream writer is used for writing page contents. + /// + bool WritingPageContents { get; set; } + /// /// Writes a single token to the stream. /// diff --git a/src/UglyToad.PdfPig/Writer/ITokenWriter.cs b/src/UglyToad.PdfPig/Writer/ITokenWriter.cs index a8efc980..4b9f449f 100644 --- a/src/UglyToad.PdfPig/Writer/ITokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/ITokenWriter.cs @@ -37,5 +37,10 @@ IReadOnlyDictionary objectOffsets, IndirectReference catalogToken, Stream outputStream, IndirectReference? documentInformationReference); + + /// + /// Hints to the token writer that we are currently writing page contents. + /// + bool WritingPageContents { get; set; } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs index fe77829a..418bacd1 100644 --- a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs @@ -18,6 +18,11 @@ namespace UglyToad.PdfPig.Writer /// internal class NoTextTokenWriter : TokenWriter { + /// + /// Set this value prior to processing page to get the right page number in log messages + /// + internal int Page { get; set; } + /// /// Write stream without or operations /// @@ -25,10 +30,16 @@ namespace UglyToad.PdfPig.Writer /// protected override void WriteStream(StreamToken streamToken, Stream outputStream) { - if (!TryGetStreamWithoutText(streamToken, out var outputStreamToken)) + StreamToken outputStreamToken; + if (!WritingPageContents && !IsFormStream(streamToken)) { outputStreamToken = streamToken; } + else if (!TryGetStreamWithoutText(streamToken, out outputStreamToken)) + { + outputStreamToken = streamToken; + } + WriteDictionary(outputStreamToken.StreamDictionary, outputStream); WriteLineBreak(outputStream); outputStream.Write(StreamStart, 0, StreamStart.Length); @@ -38,6 +49,12 @@ namespace UglyToad.PdfPig.Writer outputStream.Write(StreamEnd, 0, StreamEnd.Length); } + private bool IsFormStream(StreamToken streamToken) + { + return streamToken.StreamDictionary.Data.TryGetValue(NameToken.Subtype.Data, out var value) + && (NameToken)value == NameToken.Form; + } + /// /// Try get a stream without or operations. /// @@ -63,7 +80,7 @@ namespace UglyToad.PdfPig.Writer IReadOnlyList operations; try { - operations = pageContentParser.Parse(1, new ByteArrayInputBytes(bytes), new NoOpLog()); + operations = pageContentParser.Parse(Page, new ByteArrayInputBytes(bytes), new NoOpLog()); } catch (Exception) { @@ -76,7 +93,9 @@ namespace UglyToad.PdfPig.Writer var haveText = false; foreach (var op in operations) { - if (op.Operator == ShowText.Symbol || op.Operator == ShowTextsWithPositioning.Symbol) + if (op.Operator == ShowText.Symbol + || op.Operator == ShowTextsWithPositioning.Symbol + || op.Operator == MoveToNextLineShowText.Symbol) { haveText = true; continue; @@ -89,7 +108,22 @@ namespace UglyToad.PdfPig.Writer return false; } outputStreamT.Seek(0, SeekOrigin.Begin); - outputStreamToken = DataCompresser.CompressToStream(outputStreamT.ToArray()); + + var compressedBytes = DataCompresser.CompressBytes(outputStreamT.ToArray()); + var outputStreamDictionary = new Dictionary() + { + { NameToken.Length, new NumericToken(compressedBytes.Length) }, + { NameToken.Filter, NameToken.FlateDecode } + }; + foreach (var kv in streamToken.StreamDictionary.Data) + { + var key = NameToken.Create(kv.Key); + if (!outputStreamDictionary.ContainsKey(key)) + { + outputStreamDictionary[key] = kv.Value; + } + }; + outputStreamToken = new StreamToken(new DictionaryToken(outputStreamDictionary), compressedBytes); return true; } } diff --git a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs index de114a9c..9f710260 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs @@ -360,6 +360,7 @@ namespace UglyToad.PdfPig.Writer // dedup if on to avoid issues var prev = context.AttemptDeduplication; context.AttemptDeduplication = false; + context.WritingPageContents = true; if (contentsToken is ArrayToken array) { foreach (var item in array.Data) @@ -378,6 +379,7 @@ namespace UglyToad.PdfPig.Writer WriterUtil.CopyToken(context, ir, document.Structure.TokenScanner, refs) as IndirectReferenceToken)); } context.AttemptDeduplication = prev; + context.WritingPageContents = false; } // manually copy page dict / resources as we need to modify some @@ -406,7 +408,6 @@ namespace UglyToad.PdfPig.Writer } } - foreach (var kvp in pageInfo.Page.Data) { if (kvp.Key == NameToken.Contents || kvp.Key == NameToken.Parent || kvp.Key == NameToken.Type) diff --git a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs index 078fc2c2..34058c58 100644 --- a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs @@ -25,6 +25,12 @@ public bool AttemptDeduplication { get; set; } = true; + public bool WritingPageContents + { + get => TokenWriter.WritingPageContents; + set => TokenWriter.WritingPageContents = value; + } + internal PdfStreamWriter( Stream baseStream, bool disposeStream = true, diff --git a/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs b/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs index 046b4c7a..ed407fe1 100644 --- a/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs +++ b/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs @@ -83,19 +83,22 @@ namespace UglyToad.PdfPig.Writer /// public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList pagesBundle = null) { - using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter())) + var tokenWriter = new NoTextTokenWriter(); + using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: tokenWriter)) { if (pagesBundle == null) { for (var i = 1; i <= file.NumberOfPages; i++) { + tokenWriter.Page = i; document.AddPage(file, i); } - } + } else { foreach (var i in pagesBundle) { + tokenWriter.Page = i; document.AddPage(file, i); } } diff --git a/src/UglyToad.PdfPig/Writer/TokenWriter.cs b/src/UglyToad.PdfPig/Writer/TokenWriter.cs index 2d0429b4..058a3849 100644 --- a/src/UglyToad.PdfPig/Writer/TokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/TokenWriter.cs @@ -276,6 +276,12 @@ outputStream.Write(Eof, 0, Eof.Length); } + /// + /// Indicates that we are writing page contents. + /// Can be used by a derived class. + /// + public bool WritingPageContents { get; set; } + /// public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) {