From 3594231c6715591d2b5c6866eb83560574f4e212 Mon Sep 17 00:00:00 2001 From: mvantzet Date: Thu, 15 Dec 2022 18:01:10 +0100 Subject: [PATCH 1/5] Introduce ITokenWriter / non-static TokenWriter class. This is the first step in making it possible to override methods in the token writer, for example to filter streams when writing using PdfDocumentBuilder. The second step is injecting ITokenWriter into PdfDocumentBuilder. --- .../Writer/TokenWriterTests.cs | 5 +- .../MarkedContent/BeginMarkedContent.cs | 2 + .../BeginMarkedContentWithProperties.cs | 2 + .../DesignateMarkedContentPoint.cs | 2 + ...signateMarkedContentPointWithProperties.cs | 2 + .../Operations/SetNonStrokeColorAdvanced.cs | 2 + .../Operations/SetNonStrokeColorSpace.cs | 2 + .../Operations/SetStrokeColorAdvanced.cs | 2 + .../Operations/SetStrokeColorSpace.cs | 2 + .../TextShowing/ShowTextsWithPositioning.cs | 2 + .../Writer/Fonts/ToUnicodeCMapBuilder.cs | 2 + src/UglyToad.PdfPig/Writer/ITokenWriter.cs | 17 ++ src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs | 1 + src/UglyToad.PdfPig/Writer/TokenWriter.cs | 145 ++++++++++++++---- 14 files changed, 157 insertions(+), 31 deletions(-) create mode 100644 src/UglyToad.PdfPig/Writer/ITokenWriter.cs diff --git a/src/UglyToad.PdfPig.Tests/Writer/TokenWriterTests.cs b/src/UglyToad.PdfPig.Tests/Writer/TokenWriterTests.cs index 017a9b80..18bc7d95 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/TokenWriterTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/TokenWriterTests.cs @@ -12,10 +12,11 @@ [Fact] public void EscapeSpecialCharacter() { + var writer = new TokenWriter(); using (var memStream = new MemoryStream()) { - TokenWriter.WriteToken(new StringToken("\\"), memStream); - TokenWriter.WriteToken(new StringToken("(Hello)"), memStream); + writer.WriteToken(new StringToken("\\"), memStream); + writer.WriteToken(new StringToken("(Hello)"), memStream); // Read Test memStream.Position = 0; diff --git 
a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContent.cs b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContent.cs index 17121168..b81c9759 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContent.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContent.cs @@ -10,6 +10,8 @@ /// public class BeginMarkedContent : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContentWithProperties.cs b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContentWithProperties.cs index 73e7cdbe..75063865 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContentWithProperties.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContentWithProperties.cs @@ -11,6 +11,8 @@ /// public class BeginMarkedContentWithProperties : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPoint.cs b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPoint.cs index a78d2b91..c8aeb3c5 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPoint.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPoint.cs @@ -10,6 +10,8 @@ /// public class DesignateMarkedContentPoint : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. 
/// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPointWithProperties.cs b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPointWithProperties.cs index 53fe8aeb..0f282fc0 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPointWithProperties.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPointWithProperties.cs @@ -11,6 +11,8 @@ /// public class DesignateMarkedContentPointWithProperties : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorAdvanced.cs b/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorAdvanced.cs index e1be2adb..42652862 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorAdvanced.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorAdvanced.cs @@ -13,6 +13,8 @@ /// public class SetNonStrokeColorAdvanced : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorSpace.cs b/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorSpace.cs index 7715e82d..5c06d741 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorSpace.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorSpace.cs @@ -11,6 +11,8 @@ /// public class SetNonStrokeColorSpace : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. 
/// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorAdvanced.cs b/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorAdvanced.cs index 852f4f89..825ece3d 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorAdvanced.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorAdvanced.cs @@ -13,6 +13,8 @@ /// public class SetStrokeColorAdvanced : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorSpace.cs b/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorSpace.cs index 8c03e731..4ae622da 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorSpace.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorSpace.cs @@ -11,6 +11,8 @@ /// public class SetStrokeColorSpace : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowTextsWithPositioning.cs b/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowTextsWithPositioning.cs index 78e2d4b1..08695238 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowTextsWithPositioning.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowTextsWithPositioning.cs @@ -15,6 +15,8 @@ /// public class ShowTextsWithPositioning : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. 
/// diff --git a/src/UglyToad.PdfPig/Writer/Fonts/ToUnicodeCMapBuilder.cs b/src/UglyToad.PdfPig/Writer/Fonts/ToUnicodeCMapBuilder.cs index 6015bf0d..41768266 100644 --- a/src/UglyToad.PdfPig/Writer/Fonts/ToUnicodeCMapBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/Fonts/ToUnicodeCMapBuilder.cs @@ -14,6 +14,8 @@ private const string DictToken = "dict"; private const string FindResourceToken = "findresource"; + private static readonly TokenWriter TokenWriter = new TokenWriter(); + public static IReadOnlyList ConvertToCMapStream(IReadOnlyDictionary unicodeToCharacterCode) { using (var memoryStream = new MemoryStream()) diff --git a/src/UglyToad.PdfPig/Writer/ITokenWriter.cs b/src/UglyToad.PdfPig/Writer/ITokenWriter.cs new file mode 100644 index 00000000..ea851127 --- /dev/null +++ b/src/UglyToad.PdfPig/Writer/ITokenWriter.cs @@ -0,0 +1,17 @@ +namespace UglyToad.PdfPig.Writer; + +using System.IO; +using Tokens; + +/// +/// Writes any type of to the corresponding PDF document format output. +/// +public interface ITokenWriter +{ + /// + /// Writes the given input token to the output stream with the correct PDF format and encoding including whitespace and line breaks as applicable. + /// + /// The token to write to the stream. + /// The stream to write the token to. 
+ void WriteToken(IToken token, Stream outputStream); +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs index 65d7ec23..364a57ef 100644 --- a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs @@ -18,6 +18,7 @@ protected bool DisposeStream { get; set; } protected bool Initialized { get; set; } protected int CurrentNumber { get; set; } = 1; + protected readonly static TokenWriter TokenWriter = new TokenWriter(); internal PdfStreamWriter(Stream baseStream, bool disposeStream = true) { diff --git a/src/UglyToad.PdfPig/Writer/TokenWriter.cs b/src/UglyToad.PdfPig/Writer/TokenWriter.cs index 88202ad6..50cbb2ef 100644 --- a/src/UglyToad.PdfPig/Writer/TokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/TokenWriter.cs @@ -14,10 +14,8 @@ /// /// Writes any type of to the corresponding PDF document format output. /// - public class TokenWriter + public class TokenWriter : ITokenWriter { - private static readonly byte Backslash = GetByte("\\"); - private static readonly byte ArrayStart = GetByte("["); private static readonly byte ArrayEnd = GetByte("]"); @@ -46,10 +44,18 @@ private static readonly byte[] StartXref = OtherEncodings.StringAsLatin1Bytes("startxref"); - private static readonly byte[] StreamStart = OtherEncodings.StringAsLatin1Bytes("stream"); - private static readonly byte[] StreamEnd = OtherEncodings.StringAsLatin1Bytes("endstream"); + /// + /// Bytes that indicate start of stream + /// + protected static readonly byte[] StreamStart = OtherEncodings.StringAsLatin1Bytes("stream"); + + /// + /// Bytes that indicate end of stream + /// + protected static readonly byte[] StreamEnd = OtherEncodings.StringAsLatin1Bytes("endstream"); private static readonly byte StringStart = GetByte("("); + private static readonly byte StringEnd = GetByte(")"); private static readonly byte[] Trailer = OtherEncodings.StringAsLatin1Bytes("trailer"); @@ 
-79,7 +85,7 @@ /// /// The token to write to the stream. /// The stream to write the token to. - public static void WriteToken(IToken token, Stream outputStream) + public void WriteToken(IToken token, Stream outputStream) { if (token == null) { @@ -136,7 +142,7 @@ /// The object representing the catalog dictionary which is referenced from the trailer dictionary. /// The output stream to write to. /// The object reference for the document information dictionary if present. - internal static void WriteCrossReferenceTable(IReadOnlyDictionary objectOffsets, + internal void WriteCrossReferenceTable(IReadOnlyDictionary objectOffsets, IndirectReference catalogToken, Stream outputStream, IndirectReference? documentInformationReference) @@ -278,7 +284,7 @@ /// Generation of the indirect object. /// Pre-serialized object contents. /// The stream to write the token to. - internal static void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) + internal void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) { WriteLong(objectNumber, outputStream); WriteWhitespace(outputStream); @@ -297,14 +303,24 @@ WriteLineBreak(outputStream); } - private static void WriteHex(HexToken hex, Stream stream) + /// + /// Write a hex value to the output stream + /// + /// + /// + protected void WriteHex(HexToken hex, Stream stream) { stream.WriteByte(HexStart); stream.WriteText(hex.GetHexString()); stream.WriteByte(HexEnd); } - private static void WriteArray(ArrayToken array, Stream outputStream) + /// + /// Write an array to the output stream, with whitespace at the end. 
+ /// + /// + /// + protected void WriteArray(ArrayToken array, Stream outputStream) { outputStream.WriteByte(ArrayStart); WriteWhitespace(outputStream); @@ -319,14 +335,24 @@ WriteWhitespace(outputStream); } - private static void WriteBoolean(BooleanToken boolean, Stream outputStream) + /// + /// Write a boolean "true" or "false" to the output stream, with whitespace at the end. + /// + /// + /// + protected void WriteBoolean(BooleanToken boolean, Stream outputStream) { var bytes = boolean.Data ? TrueBytes : FalseBytes; outputStream.Write(bytes, 0, bytes.Length); WriteWhitespace(outputStream); } - private static void WriteComment(CommentToken comment, Stream outputStream) + /// + /// Write a "%comment" in the output stream, with a line break at the end. + /// + /// + /// + protected void WriteComment(CommentToken comment, Stream outputStream) { var bytes = OtherEncodings.StringAsLatin1Bytes(comment.Data); outputStream.WriteByte(Comment); @@ -334,7 +360,12 @@ WriteLineBreak(outputStream); } - private static void WriteDictionary(DictionaryToken dictionary, Stream outputStream) + /// + /// Writes dictionary key/value pairs to output stream as Name/Token pairs. + /// + /// + /// + protected void WriteDictionary(DictionaryToken dictionary, Stream outputStream) { outputStream.Write(DictionaryStart, 0, DictionaryStart.Length); @@ -356,7 +387,12 @@ outputStream.Write(DictionaryEnd, 0, DictionaryEnd.Length); } - private static void WriteIndirectReference(IndirectReferenceToken reference, Stream outputStream) + /// + /// Write an indirect reference to the stream, with whitespace at the end. 
+ /// + /// + /// + protected virtual void WriteIndirectReference(IndirectReferenceToken reference, Stream outputStream) { WriteLong(reference.Data.ObjectNumber, outputStream); WriteWhitespace(outputStream); @@ -368,12 +404,17 @@ WriteWhitespace(outputStream); } - private static void WriteName(NameToken name, Stream outputStream) + /// + /// Write a name to the stream, with whitespace at the end. + /// + /// + /// + protected virtual void WriteName(NameToken name, Stream outputStream) { WriteName(name.Data, outputStream); } - private static void WriteName(string name, Stream outputStream) + private void WriteName(string name, Stream outputStream) { /* * Beginning with PDF 1.2, any character except null (character code 0) may be @@ -404,7 +445,12 @@ WriteWhitespace(outputStream); } - private static void WriteNumber(NumericToken number, Stream outputStream) + /// + /// Write a number to the stream, with whitespace at the end. + /// + /// + /// + protected virtual void WriteNumber(NumericToken number, Stream outputStream) { if (!number.HasDecimalPlaces) { @@ -419,7 +465,15 @@ WriteWhitespace(outputStream); } - private static void WriteObject(ObjectToken objectToken, Stream outputStream) + /// + /// Write an object to the stream, with a line break at the end. It writes the following contents: + /// - "[ObjectNumber] [Generation] obj" + /// - Object data + /// - "endobj" + /// + /// + /// + protected virtual void WriteObject(ObjectToken objectToken, Stream outputStream) { WriteLong(objectToken.Number.ObjectNumber, outputStream); WriteWhitespace(outputStream); @@ -438,7 +492,16 @@ WriteLineBreak(outputStream); } - private static void WriteStream(StreamToken streamToken, Stream outputStream) + /// + /// Write a stream token to the output stream, with the following contents: + /// - Dictionary specifying the length of the stream, any applied compression filters and additional information. 
+ /// - Stream start indicator + /// - Bytes in the StreamToken data + /// - Stream end indicator + /// + /// + /// + protected virtual void WriteStream(StreamToken streamToken, Stream outputStream) { WriteDictionary(streamToken.StreamDictionary, outputStream); WriteLineBreak(outputStream); @@ -449,15 +512,22 @@ outputStream.Write(StreamEnd, 0, StreamEnd.Length); } - private static int[] EscapeNeeded = new int[] + private static readonly int[] EscapeNeeded = new int[] { '\r', '\n', '\t', '\b', '\f', '\\' }; - private static int[] Escaped = new int[] + + private static readonly int[] Escaped = new int[] { 'r', 'n', 't', 'b', 'f', '\\' }; - private static void WriteString(StringToken stringToken, Stream outputStream) + + /// + /// Write string to the stream, with whitespace at the end + /// + /// + /// + protected virtual void WriteString(StringToken stringToken, Stream outputStream) { outputStream.WriteByte(StringStart); @@ -515,29 +585,47 @@ WriteWhitespace(outputStream); } - private static void WriteInt(int value, Stream outputStream) + /// + /// Write an integer to the stream + /// + /// + /// + protected virtual void WriteInt(int value, Stream outputStream) { var bytes = OtherEncodings.StringAsLatin1Bytes(value.ToString("G", CultureInfo.InvariantCulture)); outputStream.Write(bytes, 0, bytes.Length); } - private static void WriteLineBreak(Stream outputStream) + /// + /// Write a line break to the output stream + /// + /// + protected virtual void WriteLineBreak(Stream outputStream) { outputStream.WriteNewLine(); } - private static void WriteLong(long value, Stream outputStream) + /// + /// Write a long to the stream + /// + /// + /// + protected virtual void WriteLong(long value, Stream outputStream) { var bytes = OtherEncodings.StringAsLatin1Bytes(value.ToString("G", CultureInfo.InvariantCulture)); outputStream.Write(bytes, 0, bytes.Length); } - private static void WriteWhitespace(Stream outputStream) + /// + /// Write a space to the output stream + /// + /// + 
protected virtual void WriteWhitespace(Stream outputStream) { outputStream.WriteByte(Whitespace); } - private static void WriteFirstXrefEmptyEntry(Stream outputStream) + private void WriteFirstXrefEmptyEntry(Stream outputStream) { /* * The first entry in the table (object number 0) is always free and has a generation number of 65,535; @@ -591,5 +679,4 @@ } } } -} - +} \ No newline at end of file From 6125c000896fad7d10179c81954d8f3730e7c1ff Mon Sep 17 00:00:00 2001 From: mvantzet Date: Tue, 20 Dec 2022 10:50:41 +0100 Subject: [PATCH 2/5] Make it possible to inject a custom ITokenWriter in PdfDocumentBuilder. --- src/UglyToad.PdfPig/Writer/ITokenWriter.cs | 20 +++++++++++++++++++ .../Writer/PdfDedupStreamWriter.cs | 2 +- .../Writer/PdfDocumentBuilder.cs | 12 +++++------ src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs | 6 +++--- src/UglyToad.PdfPig/Writer/TokenWriter.cs | 20 ++++--------------- 5 files changed, 34 insertions(+), 26 deletions(-) diff --git a/src/UglyToad.PdfPig/Writer/ITokenWriter.cs b/src/UglyToad.PdfPig/Writer/ITokenWriter.cs index ea851127..1445f1e4 100644 --- a/src/UglyToad.PdfPig/Writer/ITokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/ITokenWriter.cs @@ -1,5 +1,7 @@ namespace UglyToad.PdfPig.Writer; +using Core; +using System.Collections.Generic; using System.IO; using Tokens; @@ -14,4 +16,22 @@ public interface ITokenWriter /// The token to write to the stream. /// The stream to write the token to. void WriteToken(IToken token, Stream outputStream); + + /// + /// Writes pre-serialized token as an object token to the output stream. + /// + /// Object number of the indirect object. + /// Generation of the indirect object. + /// Pre-serialized object contents. + /// The stream to write the token to. + void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream); + + /// + /// Writes a valid single section cross-reference (xref) table plus trailer dictionary to the output for the set of object offsets. 
+ /// + /// The byte offset from the start of the document for each object in the document. + /// The object representing the catalog dictionary which is referenced from the trailer dictionary. + /// The output stream to write to. + /// The object reference for the document information dictionary if present. + void WriteCrossReferenceTable(IReadOnlyDictionary objectOffsets, IndirectReference catalogToken, Stream outputStream, IndirectReference? documentInformationReference); } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs index 0d1cb1af..bd547b89 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs @@ -8,7 +8,7 @@ { private readonly Dictionary hashes = new Dictionary(new FNVByteComparison()); - public PdfDedupStreamWriter(Stream stream, bool dispose) : base(stream, dispose) + public PdfDedupStreamWriter(Stream stream, bool dispose, ITokenWriter tokenWriter = null) : base(stream, dispose, tokenWriter) { } diff --git a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs index 039223bf..56b102a3 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs @@ -3,7 +3,6 @@ namespace UglyToad.PdfPig.Writer { using System; using System.Collections.Generic; - using System.Diagnostics; using System.IO; using System.Linq; using Content; @@ -27,7 +26,7 @@ namespace UglyToad.PdfPig.Writer private readonly Dictionary pages = new Dictionary(); private readonly Dictionary fonts = new Dictionary(); private bool completed = false; - internal int fontId = 0; + private int fontId = 0; private readonly static ArrayToken DefaultProcSet = new ArrayToken(new List { @@ -90,20 +89,21 @@ namespace UglyToad.PdfPig.Writer /// If stream should be disposed when builder is. 
/// Type of pdf stream writer to use /// Pdf version to use in header. - public PdfDocumentBuilder(Stream stream, bool disposeStream = false, PdfWriterType type = PdfWriterType.Default, decimal version = 1.7m) + /// Token writer to use + public PdfDocumentBuilder(Stream stream, bool disposeStream = false, PdfWriterType type = PdfWriterType.Default, decimal version = 1.7m, ITokenWriter tokenWriter = null) { switch (type) { case PdfWriterType.ObjectInMemoryDedup: - context = new PdfDedupStreamWriter(stream, disposeStream); + context = new PdfDedupStreamWriter(stream, disposeStream, tokenWriter); break; default: - context = new PdfStreamWriter(stream, disposeStream); + context = new PdfStreamWriter(stream, disposeStream, tokenWriter); break; } context.InitializePdf(version); } - + /// /// Determines whether the bytes of the TrueType font file provided can be used in a PDF document. /// diff --git a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs index 364a57ef..0f0e6f7f 100644 --- a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs @@ -18,17 +18,17 @@ protected bool DisposeStream { get; set; } protected bool Initialized { get; set; } protected int CurrentNumber { get; set; } = 1; - protected readonly static TokenWriter TokenWriter = new TokenWriter(); + protected readonly ITokenWriter TokenWriter; - internal PdfStreamWriter(Stream baseStream, bool disposeStream = true) + internal PdfStreamWriter(Stream baseStream, bool disposeStream = true, ITokenWriter tokenWriter = null) { Stream = baseStream ?? throw new ArgumentNullException(nameof(baseStream)); if (!baseStream.CanWrite) { throw new ArgumentException("Output stream must be writable"); } - DisposeStream = disposeStream; + TokenWriter = tokenWriter ?? 
new TokenWriter(); } public Stream Stream { get; protected set; } diff --git a/src/UglyToad.PdfPig/Writer/TokenWriter.cs b/src/UglyToad.PdfPig/Writer/TokenWriter.cs index 50cbb2ef..05cac5b2 100644 --- a/src/UglyToad.PdfPig/Writer/TokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/TokenWriter.cs @@ -135,14 +135,8 @@ } } - /// - /// Writes a valid single section cross-reference (xref) table plus trailer dictionary to the output for the set of object offsets. - /// - /// The byte offset from the start of the document for each object in the document. - /// The object representing the catalog dictionary which is referenced from the trailer dictionary. - /// The output stream to write to. - /// The object reference for the document information dictionary if present. - internal void WriteCrossReferenceTable(IReadOnlyDictionary objectOffsets, + /// + public void WriteCrossReferenceTable(IReadOnlyDictionary objectOffsets, IndirectReference catalogToken, Stream outputStream, IndirectReference? documentInformationReference) @@ -277,14 +271,8 @@ outputStream.Write(Eof, 0, Eof.Length); } - /// - /// Writes pre-serialized token as an object token to the output stream. - /// - /// Object number of the indirect object. - /// Generation of the indirect object. - /// Pre-serialized object contents. - /// The stream to write the token to. - internal void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) + /// + public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) { WriteLong(objectNumber, outputStream); WriteWhitespace(outputStream); From 6ef6c4d780100cf819df7ba13ad5da9a5a7baf40 Mon Sep 17 00:00:00 2001 From: mvantzet Date: Tue, 20 Dec 2022 21:31:15 +0100 Subject: [PATCH 3/5] Added a PdfTextRemover utility that uses a NoTextTokenWriter, to output PDFs without text contents. Also added unit tests to test: - If we can use a custom ITokenWriter with PdfDocumentBuilder - If removing text works. 
--- .../Writer/PdfDocumentBuilderTests.cs | 211 +++++++++++------- .../Writer/PdfTextRemoverTests.cs | 35 +++ .../Writer/NoTextTokenWriter.cs | 97 ++++++++ src/UglyToad.PdfPig/Writer/PdfTextRemover.cs | 100 +++++++++ 4 files changed, 361 insertions(+), 82 deletions(-) create mode 100644 src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs create mode 100644 src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs create mode 100644 src/UglyToad.PdfPig/Writer/PdfTextRemover.cs diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs index 84a0f02f..45b4b24f 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs @@ -1,73 +1,74 @@ namespace UglyToad.PdfPig.Tests.Writer { using System.IO; - using System.Linq; - using Content; - using Integration; - using PdfPig.Core; - using PdfPig.Fonts.Standard14Fonts; - using PdfPig.Tokens; - using PdfPig.Writer; + using System.Linq; + using Content; + using Integration; + using PdfPig.Core; + using PdfPig.Fonts.Standard14Fonts; + using PdfPig.Tokens; + using PdfPig.Writer; + using System.Collections.Generic; using Tests.Fonts.TrueType; using Xunit; - public class PdfDocumentBuilderTests - { - [Fact] - public void CanWriteSingleBlankPage() - { - var result = CreateSingleBlankPage(); - - WriteFile(nameof(CanWriteSinglePageHelloWorld), result); - - Assert.NotEmpty(result); - - var str = OtherEncodings.BytesAsLatin1String(result); - Assert.StartsWith("%PDF", str); - Assert.EndsWith("%%EOF", str); - } - - [Fact] - public void CanCreateSingleCustomPageSize() - { - var builder = new PdfDocumentBuilder(); - - var page = builder.AddPage(120, 250); - - var font = builder.AddStandard14Font(Standard14Font.Helvetica); - - page.AddText("Small page.", 12, new PdfPoint(25, 200), font); - - var bytes = builder.Build(); - - WriteFile(nameof(CanCreateSingleCustomPageSize), bytes); - - using (var 
document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff)) - { - Assert.Equal(1, document.NumberOfPages); - - var page1 = document.GetPage(1); - - Assert.Equal(120, page1.Width); - Assert.Equal(250, page1.Height); - - Assert.Equal("Small page.", page1.Text); - } + public class PdfDocumentBuilderTests + { + [Fact] + public void CanWriteSingleBlankPage() + { + var result = CreateSingleBlankPage(); + + WriteFile(nameof(CanWriteSinglePageHelloWorld), result); + + Assert.NotEmpty(result); + + var str = OtherEncodings.BytesAsLatin1String(result); + Assert.StartsWith("%PDF", str); + Assert.EndsWith("%%EOF", str); } - [Fact] - public void CanFastAddPageAndInheritProps() - { - var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf"); - var contents = File.ReadAllBytes(first); - + [Fact] + public void CanCreateSingleCustomPageSize() + { + var builder = new PdfDocumentBuilder(); - byte[] results = null; - using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) - using (var output = new PdfDocumentBuilder()) + var page = builder.AddPage(120, 250); + + var font = builder.AddStandard14Font(Standard14Font.Helvetica); + + page.AddText("Small page.", 12, new PdfPoint(25, 200), font); + + var bytes = builder.Build(); + + WriteFile(nameof(CanCreateSingleCustomPageSize), bytes); + + using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff)) + { + Assert.Equal(1, document.NumberOfPages); + + var page1 = document.GetPage(1); + + Assert.Equal(120, page1.Width); + Assert.Equal(250, page1.Height); + + Assert.Equal("Small page.", page1.Text); + } + } + + [Fact] + public void CanFastAddPageAndInheritProps() + { + var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf"); + var contents = File.ReadAllBytes(first); + + + byte[] results = null; + using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) + using (var output = new PdfDocumentBuilder()) { output.AddPage(existing, 
1); - results = output.Build(); + results = output.Build(); } using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff)) @@ -75,22 +76,22 @@ var pg = rewritted.GetPage(1); Assert.Equal(200, pg.MediaBox.Bounds.Width); Assert.Equal(100, pg.MediaBox.Bounds.Height); - } + } } - [Fact] - public void CanFastAddPageWithStreamSubtype() - { - var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf"); - var contents = File.ReadAllBytes(first); - + [Fact] + public void CanFastAddPageWithStreamSubtype() + { + var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf"); + var contents = File.ReadAllBytes(first); - byte[] results = null; - using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) - using (var output = new PdfDocumentBuilder()) + + byte[] results = null; + using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) + using (var output = new PdfDocumentBuilder()) { output.AddPage(existing, 1); - results = output.Build(); + results = output.Build(); } using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff)) @@ -98,26 +99,26 @@ // really just checking for no exception... 
var pg = rewritted.GetPage(1); Assert.NotNull(pg.Content); - } + } } - [Fact] - public void CanFastAddPageAndStripLinkAnnots() - { - var first = IntegrationHelpers.GetDocumentPath("outline.pdf"); - var contents = File.ReadAllBytes(first); - + [Fact] + public void CanFastAddPageAndStripLinkAnnots() + { + var first = IntegrationHelpers.GetDocumentPath("outline.pdf"); + var contents = File.ReadAllBytes(first); + var annotCount = 0; - byte[] results = null; - using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) - using (var output = new PdfDocumentBuilder()) + byte[] results = null; + using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) + using (var output = new PdfDocumentBuilder()) { output.AddPage(existing, 1); results = output.Build(); var pg = existing.GetPage(1); var annots = pg.ExperimentalAccess.GetAnnotations().ToList(); annotCount = annots.Count; - Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link); + Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link); } using (var rewritten = PdfDocument.Open(results, ParsingOptions.LenientParsingOff)) @@ -126,7 +127,7 @@ var annots = pg.ExperimentalAccess.GetAnnotations().ToList(); Assert.Equal(annotCount - 1, annots.Count); Assert.DoesNotContain(annots, x => x.Type == Annotations.AnnotationType.Link); - } + } } [Fact] @@ -1114,6 +1115,27 @@ } } + [Fact] + public void CanUseCustomTokenWriter() + { + var docPath = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf"); + var tw = new TestTokenWriter(); + + using (var doc = PdfDocument.Open(docPath)) + using (var ms = new MemoryStream()) + using (var builder = new PdfDocumentBuilder(ms, tokenWriter: tw)) + { + for (var i = 1; i <= doc.NumberOfPages; i++) + { + builder.AddPage(doc, i); + } + builder.Build(); + } + Assert.Equal(tw.Objects, 0); // No objects in sample file + Assert.True(tw.Tokens > 1000); // Roughly 1065 + Assert.True(tw.WroteCrossReferenceTable); + } + 
private static void WriteFile(string name, byte[] bytes, string extension = "pdf") { try @@ -1133,4 +1155,29 @@ } } } + + public class TestTokenWriter : ITokenWriter + { + public int Tokens { get; private set; } + public int Objects { get; private set; } + public bool WroteCrossReferenceTable { get; private set; } + + public void WriteToken(IToken token, Stream outputStream) + { + Tokens++; + } + + public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) + { + Objects++; + } + + public void WriteCrossReferenceTable(IReadOnlyDictionary objectOffsets, + IndirectReference catalogToken, + Stream outputStream, + IndirectReference? documentInformationReference) + { + WroteCrossReferenceTable = true; + } + } } diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs new file mode 100644 index 00000000..00814bfc --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs @@ -0,0 +1,35 @@ +using UglyToad.PdfPig.Tests.Integration; +using UglyToad.PdfPig.Writer; +using System.IO; +using Xunit; + +namespace UglyToad.PdfPig.Tests.Writer +{ + public class PdfTextRemoverTests + { + [Theory] + [InlineData("Two Page Text Only - from libre office.pdf")] + [InlineData("cat-genetics.pdf")] + [InlineData("Motor Insurance claim form.pdf")] + [InlineData("Single Page Images - from libre office.pdf")] + public void TextRemoverRemovesText(string file) + { + var filePath = IntegrationHelpers.GetDocumentPath(file); + using (var document = PdfDocument.Open(filePath)) + { + var withoutText = PdfTextRemover.RemoveText(filePath); + File.WriteAllBytes(@"C:\temp\_tx.pdf", withoutText); + using (var documentWithoutText = PdfDocument.Open(withoutText)) + { + Assert.Equal(document.NumberOfPages, documentWithoutText.NumberOfPages); + for (var i = 1; i <= documentWithoutText.NumberOfPages; i++) + { + Assert.NotEqual(document.GetPage(i).Text, string.Empty); + 
Assert.Equal(documentWithoutText.GetPage(i).Text, string.Empty); + } + + } + } + } + } +} diff --git a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs new file mode 100644 index 00000000..fe77829a --- /dev/null +++ b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs @@ -0,0 +1,97 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using UglyToad.PdfPig.Core; +using UglyToad.PdfPig.Filters; +using UglyToad.PdfPig.Graphics.Operations.TextShowing; +using UglyToad.PdfPig.Graphics.Operations; +using UglyToad.PdfPig.Graphics; +using UglyToad.PdfPig.Logging; +using UglyToad.PdfPig.Parser; +using UglyToad.PdfPig.Tokens; + +namespace UglyToad.PdfPig.Writer +{ + /// + /// Derived class of that does not write or operations in streams + /// + internal class NoTextTokenWriter : TokenWriter + { + /// + /// Write stream without or operations + /// + /// + /// + protected override void WriteStream(StreamToken streamToken, Stream outputStream) + { + if (!TryGetStreamWithoutText(streamToken, out var outputStreamToken)) + { + outputStreamToken = streamToken; + } + WriteDictionary(outputStreamToken.StreamDictionary, outputStream); + WriteLineBreak(outputStream); + outputStream.Write(StreamStart, 0, StreamStart.Length); + WriteLineBreak(outputStream); + outputStream.Write(outputStreamToken.Data.ToArray(), 0, outputStreamToken.Data.Count); + WriteLineBreak(outputStream); + outputStream.Write(StreamEnd, 0, StreamEnd.Length); + } + + /// + /// Try get a stream without or operations. 
+ /// + /// + /// + /// true if any text operation found (and we have a valid without the text operations), + /// false if no text operation found (in which case is null) + private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken outputStreamToken) + { + var filterProvider = new FilterProviderWithLookup(DefaultFilterProvider.Instance); + IReadOnlyList bytes; + try + { + bytes = streamToken.Decode(filterProvider); + } + catch + { + outputStreamToken = null; + return false; + } + + var pageContentParser = new PageContentParser(new ReflectionGraphicsStateOperationFactory()); + IReadOnlyList operations; + try + { + operations = pageContentParser.Parse(1, new ByteArrayInputBytes(bytes), new NoOpLog()); + } + catch (Exception) + { + outputStreamToken = null; + return false; + } + + using (var outputStreamT = new MemoryStream()) + { + var haveText = false; + foreach (var op in operations) + { + if (op.Operator == ShowText.Symbol || op.Operator == ShowTextsWithPositioning.Symbol) + { + haveText = true; + continue; + } + op.Write(outputStreamT); + } + if (!haveText) + { + outputStreamToken = null; + return false; + } + outputStreamT.Seek(0, SeekOrigin.Begin); + outputStreamToken = DataCompresser.CompressToStream(outputStreamT.ToArray()); + return true; + } + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs b/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs new file mode 100644 index 00000000..369c55b3 --- /dev/null +++ b/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs @@ -0,0 +1,100 @@ +using System; +using System.Collections.Generic; +using System.IO; + +namespace UglyToad.PdfPig.Writer +{ + /// + /// Class to remove text from PDFs, useful as a preprocessing step for Optical Character Recognition (OCR) + /// + public static class PdfTextRemover + { + /// + /// Return PDF without text as bytes + /// Path to PDF + /// List of pages to emit; if null all pages are emitted + /// + public static byte[] RemoveText(string 
filePath, IReadOnlyList pagesBundle = null) + { + using (var output = new MemoryStream()) + { + RemoveText(output, filePath, pagesBundle); + return output.ToArray(); + } + } + + /// + /// Write PDF without text to the output stream. The caller must manage disposing the output stream. + /// Must be writable + /// Path to PDF + /// List of pages to emit; if null all pages are emitted + /// + public static void RemoveText(Stream output, string filePath, IReadOnlyList pagesBundle = null) + { + using (var stream = File.OpenRead(filePath)) + { + RemoveText(stream, output, pagesBundle); + } + } + + /// + /// Remove text from the PDF (passed in as a byte array) and return it as a new byte array + /// PDF document (as byte array) + /// List of pages to emit; if null all pages are emitted + /// PDF without text (as a byte array) + /// + public static byte[] RemoveText(byte[] file, IReadOnlyList pagesBundle = null) + { + _ = file ?? throw new ArgumentNullException(nameof(file)); + + using (var output = new MemoryStream()) + { + RemoveText(PdfDocument.Open(file), output, pagesBundle); + return output.ToArray(); + } + } + + /// + /// Remove text from the PDF in the input stream and write it to the output stream. + /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream. + /// Streams for the file contents, this must support reading and seeking. + /// Must be writable + /// List of pages to emit; if null all pages are emitted + /// + public static void RemoveText(Stream stream, Stream output, IReadOnlyList pagesBundle = null) + { + _ = stream ?? throw new ArgumentNullException(nameof(stream)); + _ = output ?? throw new ArgumentNullException(nameof(output)); + + RemoveText(PdfDocument.Open(stream), output, pagesBundle); + } + + /// + /// Remove text from the PDF and write it to the output stream. + /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream. 
+ /// PDF document + /// Must be writable + /// List of pages to emit; if null all pages are emitted + /// + public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList pagesBundle = null) + { + using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter())) + { + if (pagesBundle == null) + { + for (var i = 1; i <= file.NumberOfPages; i++) + { + document.AddPage(file, i); + } + } + else + { + foreach (var i in pagesBundle) + { + document.AddPage(file, i); + } + } + } + } + } +} From 371e148c63bace622559adb3359da6d5cf871850 Mon Sep 17 00:00:00 2001 From: mvantzet Date: Tue, 20 Dec 2022 21:31:35 +0100 Subject: [PATCH 4/5] Remove unreachable code --- src/UglyToad.PdfPig/PdfFonts/Cmap/CharacterMapBuilder.cs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/UglyToad.PdfPig/PdfFonts/Cmap/CharacterMapBuilder.cs b/src/UglyToad.PdfPig/PdfFonts/Cmap/CharacterMapBuilder.cs index 388967aa..bf254fa4 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Cmap/CharacterMapBuilder.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Cmap/CharacterMapBuilder.cs @@ -96,8 +96,6 @@ } return CharacterIdentifierSystemInfo; - - throw new InvalidOperationException("The Character Identifier System Information was never set."); } public void UseCMap(CMap other) From 94c62e1b656920e2f1fdfba0a4cef74ae0fb84e6 Mon Sep 17 00:00:00 2001 From: mvantzet Date: Wed, 21 Dec 2022 09:35:29 +0100 Subject: [PATCH 5/5] Forgot to commit this updated test, to allow ITokenWriter and PdfTextRemover to be public. 
--- src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 3e3b1ef3..f110d4b5 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -217,9 +217,11 @@ "UglyToad.PdfPig.Util.DefaultWordExtractor", "UglyToad.PdfPig.Util.DateFormatHelper", "UglyToad.PdfPig.Util.WhitespaceSizeStatistics", + "UglyToad.PdfPig.Writer.ITokenWriter", "UglyToad.PdfPig.Writer.PdfAStandard", "UglyToad.PdfPig.Writer.PdfDocumentBuilder", "UglyToad.PdfPig.Writer.PdfMerger", + "UglyToad.PdfPig.Writer.PdfTextRemover", "UglyToad.PdfPig.Writer.PdfWriterType", "UglyToad.PdfPig.Writer.PdfPageBuilder", "UglyToad.PdfPig.Writer.TokenWriter",