Prevent reading (and modifying!) non-content streams, reducing chances of PDF corruption.

Also skip the MoveToNextLineShowText operation.
Also copy the entries of the original stream's dictionary, which fixes disappearing elements
(caused, for example, by a missing Subtype or BBox entry).
This commit is contained in:
mvantzet
2023-06-21 14:54:11 +02:00
parent ae83f39e28
commit 928c2ef2fc
8 changed files with 68 additions and 9 deletions

View File

@@ -1279,6 +1279,7 @@
public int Tokens { get; private set; }
public int Objects { get; private set; }
public bool WroteCrossReferenceTable { get; private set; }
public bool WritingPageContents { get; set; }
public void WriteToken(IToken token, Stream outputStream)
{

View File

@@ -1,9 +1,7 @@
namespace UglyToad.PdfPig.Writer
{
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Tokens;
internal interface IPdfStreamWriter : IDisposable
@@ -20,6 +18,11 @@
/// </summary>
Stream Stream { get; }
/// <summary>
/// Hints that the stream writer is used for writing page contents.
/// </summary>
bool WritingPageContents { get; set; }
/// <summary>
/// Writes a single token to the stream.
/// </summary>

View File

@@ -37,5 +37,10 @@
IReadOnlyDictionary<IndirectReference, long> objectOffsets,
IndirectReference catalogToken, Stream outputStream,
IndirectReference? documentInformationReference);
/// <summary>
/// Hints to the token writer that we are currently writing page contents.
/// </summary>
bool WritingPageContents { get; set; }
}
}

View File

@@ -18,6 +18,11 @@ namespace UglyToad.PdfPig.Writer
/// </summary>
internal class NoTextTokenWriter : TokenWriter
{
/// <summary>
/// The number of the page currently being processed. Set this before processing each page so that
/// the page content parser is given the right page number for its log messages.
/// </summary>
internal int Page { get; set; }
/// <summary>
/// Write stream without <see cref="ShowText"/> or <see cref="ShowTextsWithPositioning"/> operations
/// </summary>
@@ -25,10 +30,16 @@ namespace UglyToad.PdfPig.Writer
/// <param name="outputStream"></param>
protected override void WriteStream(StreamToken streamToken, Stream outputStream)
{
if (!TryGetStreamWithoutText(streamToken, out var outputStreamToken))
StreamToken outputStreamToken;
if (!WritingPageContents && !IsFormStream(streamToken))
{
outputStreamToken = streamToken;
}
else if (!TryGetStreamWithoutText(streamToken, out outputStreamToken))
{
outputStreamToken = streamToken;
}
WriteDictionary(outputStreamToken.StreamDictionary, outputStream);
WriteLineBreak(outputStream);
outputStream.Write(StreamStart, 0, StreamStart.Length);
@@ -38,6 +49,12 @@ namespace UglyToad.PdfPig.Writer
outputStream.Write(StreamEnd, 0, StreamEnd.Length);
}
/// <summary>
/// Checks whether the stream's dictionary has a Subtype entry equal to <see cref="NameToken.Form"/>.
/// </summary>
/// <param name="streamToken">The stream whose dictionary is inspected.</param>
/// <returns>
/// <c>true</c> when the Subtype entry exists, is a <see cref="NameToken"/> and equals
/// <see cref="NameToken.Form"/>; otherwise <c>false</c>.
/// </returns>
private bool IsFormStream(StreamToken streamToken)
{
    // Use a type pattern instead of a hard cast: a Subtype entry that is not a NameToken
    // would otherwise throw InvalidCastException instead of simply reporting "not a form".
    return streamToken.StreamDictionary.Data.TryGetValue(NameToken.Subtype.Data, out var value)
        && value is NameToken subtype
        && subtype == NameToken.Form;
}
/// <summary>
/// Try get a stream without <see cref="ShowText"/> or <see cref="ShowTextsWithPositioning"/> operations.
/// </summary>
@@ -63,7 +80,7 @@ namespace UglyToad.PdfPig.Writer
IReadOnlyList<IGraphicsStateOperation> operations;
try
{
operations = pageContentParser.Parse(1, new ByteArrayInputBytes(bytes), new NoOpLog());
operations = pageContentParser.Parse(Page, new ByteArrayInputBytes(bytes), new NoOpLog());
}
catch (Exception)
{
@@ -76,7 +93,9 @@ namespace UglyToad.PdfPig.Writer
var haveText = false;
foreach (var op in operations)
{
if (op.Operator == ShowText.Symbol || op.Operator == ShowTextsWithPositioning.Symbol)
if (op.Operator == ShowText.Symbol
|| op.Operator == ShowTextsWithPositioning.Symbol
|| op.Operator == MoveToNextLineShowText.Symbol)
{
haveText = true;
continue;
@@ -89,7 +108,22 @@ namespace UglyToad.PdfPig.Writer
return false;
}
outputStreamT.Seek(0, SeekOrigin.Begin);
outputStreamToken = DataCompresser.CompressToStream(outputStreamT.ToArray());
var compressedBytes = DataCompresser.CompressBytes(outputStreamT.ToArray());
var outputStreamDictionary = new Dictionary<NameToken, IToken>()
{
{ NameToken.Length, new NumericToken(compressedBytes.Length) },
{ NameToken.Filter, NameToken.FlateDecode }
};
foreach (var kv in streamToken.StreamDictionary.Data)
{
var key = NameToken.Create(kv.Key);
if (!outputStreamDictionary.ContainsKey(key))
{
outputStreamDictionary[key] = kv.Value;
}
};
outputStreamToken = new StreamToken(new DictionaryToken(outputStreamDictionary), compressedBytes);
return true;
}
}

View File

@@ -360,6 +360,7 @@ namespace UglyToad.PdfPig.Writer
// dedup if on to avoid issues
var prev = context.AttemptDeduplication;
context.AttemptDeduplication = false;
context.WritingPageContents = true;
if (contentsToken is ArrayToken array)
{
foreach (var item in array.Data)
@@ -378,6 +379,7 @@ namespace UglyToad.PdfPig.Writer
WriterUtil.CopyToken(context, ir, document.Structure.TokenScanner, refs) as IndirectReferenceToken));
}
context.AttemptDeduplication = prev;
context.WritingPageContents = false;
}
// manually copy page dict / resources as we need to modify some
@@ -406,7 +408,6 @@ namespace UglyToad.PdfPig.Writer
}
}
foreach (var kvp in pageInfo.Page.Data)
{
if (kvp.Key == NameToken.Contents || kvp.Key == NameToken.Parent || kvp.Key == NameToken.Type)

View File

@@ -25,6 +25,12 @@
public bool AttemptDeduplication { get; set; } = true;
/// <summary>
/// Hints that page contents are being written. The value is not stored on this type:
/// reads and writes are forwarded to <c>TokenWriter.WritingPageContents</c>.
/// </summary>
public bool WritingPageContents
{
    get
    {
        return TokenWriter.WritingPageContents;
    }
    set
    {
        TokenWriter.WritingPageContents = value;
    }
}
internal PdfStreamWriter(
Stream baseStream,
bool disposeStream = true,

View File

@@ -83,19 +83,22 @@ namespace UglyToad.PdfPig.Writer
/// </summary>
public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList<int> pagesBundle = null)
{
using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter()))
var tokenWriter = new NoTextTokenWriter();
using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: tokenWriter))
{
if (pagesBundle == null)
{
for (var i = 1; i <= file.NumberOfPages; i++)
{
tokenWriter.Page = i;
document.AddPage(file, i);
}
}
}
else
{
foreach (var i in pagesBundle)
{
tokenWriter.Page = i;
document.AddPage(file, i);
}
}

View File

@@ -276,6 +276,12 @@
outputStream.Write(Eof, 0, Eof.Length);
}
/// <summary>
/// Indicates whether page contents are currently being written.
/// A derived class can read this flag to adjust how it writes tokens.
/// </summary>
public bool WritingPageContents { get; set; }
/// <inheritdoc cref="ITokenWriter.WriteObject" />
public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream)
{