diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 3e3b1ef3..f110d4b5 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -217,9 +217,11 @@ "UglyToad.PdfPig.Util.DefaultWordExtractor", "UglyToad.PdfPig.Util.DateFormatHelper", "UglyToad.PdfPig.Util.WhitespaceSizeStatistics", + "UglyToad.PdfPig.Writer.ITokenWriter", "UglyToad.PdfPig.Writer.PdfAStandard", "UglyToad.PdfPig.Writer.PdfDocumentBuilder", "UglyToad.PdfPig.Writer.PdfMerger", + "UglyToad.PdfPig.Writer.PdfTextRemover", "UglyToad.PdfPig.Writer.PdfWriterType", "UglyToad.PdfPig.Writer.PdfPageBuilder", "UglyToad.PdfPig.Writer.TokenWriter", diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs index 84a0f02f..45b4b24f 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs @@ -1,73 +1,74 @@ namespace UglyToad.PdfPig.Tests.Writer { using System.IO; - using System.Linq; - using Content; - using Integration; - using PdfPig.Core; - using PdfPig.Fonts.Standard14Fonts; - using PdfPig.Tokens; - using PdfPig.Writer; + using System.Linq; + using Content; + using Integration; + using PdfPig.Core; + using PdfPig.Fonts.Standard14Fonts; + using PdfPig.Tokens; + using PdfPig.Writer; + using System.Collections.Generic; using Tests.Fonts.TrueType; using Xunit; - public class PdfDocumentBuilderTests - { - [Fact] - public void CanWriteSingleBlankPage() - { - var result = CreateSingleBlankPage(); - - WriteFile(nameof(CanWriteSinglePageHelloWorld), result); - - Assert.NotEmpty(result); - - var str = OtherEncodings.BytesAsLatin1String(result); - Assert.StartsWith("%PDF", str); - Assert.EndsWith("%%EOF", str); - } - - [Fact] - public void CanCreateSingleCustomPageSize() - { - var builder = new PdfDocumentBuilder(); - - var page = builder.AddPage(120, 250); - - var font = builder.AddStandard14Font(Standard14Font.Helvetica); - - page.AddText("Small page.", 12, new PdfPoint(25, 200), font); - - var bytes = builder.Build(); - - WriteFile(nameof(CanCreateSingleCustomPageSize), bytes); - - using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff)) - { - Assert.Equal(1, document.NumberOfPages); - - var page1 = document.GetPage(1); - - Assert.Equal(120, page1.Width); - Assert.Equal(250, page1.Height); - - Assert.Equal("Small page.", page1.Text); - } + public class PdfDocumentBuilderTests + { + [Fact] + public void CanWriteSingleBlankPage() + { + var result = CreateSingleBlankPage(); + + WriteFile(nameof(CanWriteSinglePageHelloWorld), result); + + Assert.NotEmpty(result); + + var str = OtherEncodings.BytesAsLatin1String(result); + Assert.StartsWith("%PDF", str); + Assert.EndsWith("%%EOF", str); } - [Fact] - public void CanFastAddPageAndInheritProps() - { - var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf"); - var contents = File.ReadAllBytes(first); - + [Fact] + public void CanCreateSingleCustomPageSize() + { + var builder = new PdfDocumentBuilder(); - byte[] results = null; - using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) - using (var output = new PdfDocumentBuilder()) + var page = builder.AddPage(120, 250); + + var font = builder.AddStandard14Font(Standard14Font.Helvetica); + + page.AddText("Small page.", 12, new PdfPoint(25, 200), font); + + var bytes = builder.Build(); + + WriteFile(nameof(CanCreateSingleCustomPageSize), bytes); + + using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff)) + { + Assert.Equal(1, document.NumberOfPages); + + var page1 = document.GetPage(1); + + Assert.Equal(120, page1.Width); + Assert.Equal(250, page1.Height); + + Assert.Equal("Small page.", page1.Text); + } + } + + [Fact] + public void CanFastAddPageAndInheritProps() + { + var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf"); + var contents = File.ReadAllBytes(first); + + + byte[] results = null; + using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) + using (var output = new PdfDocumentBuilder()) { output.AddPage(existing, 1); - results = output.Build(); + results = output.Build(); } using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff)) @@ -75,22 +76,22 @@ var pg = rewritted.GetPage(1); Assert.Equal(200, pg.MediaBox.Bounds.Width); Assert.Equal(100, pg.MediaBox.Bounds.Height); - } + } } - [Fact] - public void CanFastAddPageWithStreamSubtype() - { - var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf"); - var contents = File.ReadAllBytes(first); - + [Fact] + public void CanFastAddPageWithStreamSubtype() + { + var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf"); + var contents = File.ReadAllBytes(first); - byte[] results = null; - using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) - using (var output = new PdfDocumentBuilder()) + + byte[] results = null; + using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) + using (var output = new PdfDocumentBuilder()) { output.AddPage(existing, 1); - results = output.Build(); + results = output.Build(); } using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff)) @@ -98,26 +99,26 @@ // really just checking for no exception... var pg = rewritted.GetPage(1); Assert.NotNull(pg.Content); - } + } } - [Fact] - public void CanFastAddPageAndStripLinkAnnots() - { - var first = IntegrationHelpers.GetDocumentPath("outline.pdf"); - var contents = File.ReadAllBytes(first); - + [Fact] + public void CanFastAddPageAndStripLinkAnnots() + { + var first = IntegrationHelpers.GetDocumentPath("outline.pdf"); + var contents = File.ReadAllBytes(first); + var annotCount = 0; - byte[] results = null; - using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) - using (var output = new PdfDocumentBuilder()) + byte[] results = null; + using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff)) + using (var output = new PdfDocumentBuilder()) { output.AddPage(existing, 1); results = output.Build(); var pg = existing.GetPage(1); var annots = pg.ExperimentalAccess.GetAnnotations().ToList(); annotCount = annots.Count; - Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link); + Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link); } using (var rewritten = PdfDocument.Open(results, ParsingOptions.LenientParsingOff)) @@ -126,7 +127,7 @@ var annots = pg.ExperimentalAccess.GetAnnotations().ToList(); Assert.Equal(annotCount - 1, annots.Count); Assert.DoesNotContain(annots, x => x.Type == Annotations.AnnotationType.Link); - } + } } [Fact] @@ -1114,6 +1115,27 @@ } } + [Fact] + public void CanUseCustomTokenWriter() + { + var docPath = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf"); + var tw = new TestTokenWriter(); + + using (var doc = PdfDocument.Open(docPath)) + using (var ms = new MemoryStream()) + using (var builder = new PdfDocumentBuilder(ms, tokenWriter: tw)) + { + for (var i = 1; i <= doc.NumberOfPages; i++) + { + builder.AddPage(doc, i); + } + builder.Build(); + } + Assert.Equal(tw.Objects, 0); // No objects in sample file + Assert.True(tw.Tokens > 1000); // Roughly 1065 + Assert.True(tw.WroteCrossReferenceTable); + } + private static void WriteFile(string name, byte[] bytes, string extension = "pdf") { try @@ -1133,4 +1155,29 @@ } } } + + public class TestTokenWriter : ITokenWriter + { + public int Tokens { get; private set; } + public int Objects { get; private set; } + public bool WroteCrossReferenceTable { get; private set; } + + public void WriteToken(IToken token, Stream outputStream) + { + Tokens++; + } + + public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) + { + Objects++; + } + + public void WriteCrossReferenceTable(IReadOnlyDictionary objectOffsets, + IndirectReference catalogToken, + Stream outputStream, + IndirectReference? documentInformationReference) + { + WroteCrossReferenceTable = true; + } + } } diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs new file mode 100644 index 00000000..00814bfc --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs @@ -0,0 +1,35 @@ +using UglyToad.PdfPig.Tests.Integration; +using UglyToad.PdfPig.Writer; +using System.IO; +using Xunit; + +namespace UglyToad.PdfPig.Tests.Writer +{ + public class PdfTextRemoverTests + { + [Theory] + [InlineData("Two Page Text Only - from libre office.pdf")] + [InlineData("cat-genetics.pdf")] + [InlineData("Motor Insurance claim form.pdf")] + [InlineData("Single Page Images - from libre office.pdf")] + public void TextRemoverRemovesText(string file) + { + var filePath = IntegrationHelpers.GetDocumentPath(file); + using (var document = PdfDocument.Open(filePath)) + { + var withoutText = PdfTextRemover.RemoveText(filePath); + File.WriteAllBytes(@"C:\temp\_tx.pdf", withoutText); + using (var documentWithoutText = PdfDocument.Open(withoutText)) + { + Assert.Equal(document.NumberOfPages, documentWithoutText.NumberOfPages); + for (var i = 1; i <= documentWithoutText.NumberOfPages; i++) + { + Assert.NotEqual(document.GetPage(i).Text, string.Empty); + Assert.Equal(documentWithoutText.GetPage(i).Text, string.Empty); + } + + } + } + } + } +} diff --git a/src/UglyToad.PdfPig.Tests/Writer/TokenWriterTests.cs b/src/UglyToad.PdfPig.Tests/Writer/TokenWriterTests.cs index 017a9b80..18bc7d95 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/TokenWriterTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/TokenWriterTests.cs @@ -12,10 +12,11 @@ [Fact] public void EscapeSpecialCharacter() { + var writer = new TokenWriter(); using (var memStream = new MemoryStream()) { - TokenWriter.WriteToken(new StringToken("\\"), memStream); - TokenWriter.WriteToken(new StringToken("(Hello)"), memStream); + writer.WriteToken(new StringToken("\\"), memStream); + writer.WriteToken(new StringToken("(Hello)"), memStream); // Read Test memStream.Position = 0; diff --git a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContent.cs b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContent.cs index 17121168..b81c9759 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContent.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContent.cs @@ -10,6 +10,8 @@ /// public class BeginMarkedContent : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContentWithProperties.cs b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContentWithProperties.cs index 73e7cdbe..75063865 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContentWithProperties.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/BeginMarkedContentWithProperties.cs @@ -11,6 +11,8 @@ /// public class BeginMarkedContentWithProperties : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPoint.cs b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPoint.cs index a78d2b91..c8aeb3c5 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPoint.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPoint.cs @@ -10,6 +10,8 @@ /// public class DesignateMarkedContentPoint : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPointWithProperties.cs b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPointWithProperties.cs index 53fe8aeb..0f282fc0 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPointWithProperties.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/MarkedContent/DesignateMarkedContentPointWithProperties.cs @@ -11,6 +11,8 @@ /// public class DesignateMarkedContentPointWithProperties : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorAdvanced.cs b/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorAdvanced.cs index e1be2adb..42652862 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorAdvanced.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorAdvanced.cs @@ -13,6 +13,8 @@ /// public class SetNonStrokeColorAdvanced : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorSpace.cs b/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorSpace.cs index 7715e82d..5c06d741 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorSpace.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/SetNonStrokeColorSpace.cs @@ -11,6 +11,8 @@ /// public class SetNonStrokeColorSpace : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorAdvanced.cs b/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorAdvanced.cs index 852f4f89..825ece3d 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorAdvanced.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorAdvanced.cs @@ -13,6 +13,8 @@ /// public class SetStrokeColorAdvanced : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorSpace.cs b/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorSpace.cs index 8c03e731..4ae622da 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorSpace.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/SetStrokeColorSpace.cs @@ -11,6 +11,8 @@ /// public class SetStrokeColorSpace : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowTextsWithPositioning.cs b/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowTextsWithPositioning.cs index 78e2d4b1..08695238 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowTextsWithPositioning.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowTextsWithPositioning.cs @@ -15,6 +15,8 @@ /// public class ShowTextsWithPositioning : IGraphicsStateOperation { + private static readonly TokenWriter TokenWriter = new TokenWriter(); + /// /// The symbol for this operation in a stream. /// diff --git a/src/UglyToad.PdfPig/PdfFonts/Cmap/CharacterMapBuilder.cs b/src/UglyToad.PdfPig/PdfFonts/Cmap/CharacterMapBuilder.cs index 388967aa..bf254fa4 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Cmap/CharacterMapBuilder.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Cmap/CharacterMapBuilder.cs @@ -96,8 +96,6 @@ } return CharacterIdentifierSystemInfo; - - throw new InvalidOperationException("The Character Identifier System Information was never set."); } public void UseCMap(CMap other) diff --git a/src/UglyToad.PdfPig/Writer/Fonts/ToUnicodeCMapBuilder.cs b/src/UglyToad.PdfPig/Writer/Fonts/ToUnicodeCMapBuilder.cs index 6015bf0d..41768266 100644 --- a/src/UglyToad.PdfPig/Writer/Fonts/ToUnicodeCMapBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/Fonts/ToUnicodeCMapBuilder.cs @@ -14,6 +14,8 @@ private const string DictToken = "dict"; private const string FindResourceToken = "findresource"; + private static readonly TokenWriter TokenWriter = new TokenWriter(); + public static IReadOnlyList ConvertToCMapStream(IReadOnlyDictionary unicodeToCharacterCode) { using (var memoryStream = new MemoryStream()) diff --git a/src/UglyToad.PdfPig/Writer/ITokenWriter.cs b/src/UglyToad.PdfPig/Writer/ITokenWriter.cs new file mode 100644 index 00000000..1445f1e4 --- /dev/null +++ b/src/UglyToad.PdfPig/Writer/ITokenWriter.cs @@ -0,0 +1,37 @@ +namespace UglyToad.PdfPig.Writer; + +using Core; +using System.Collections.Generic; +using System.IO; +using Tokens; + +/// +/// Writes any type of to the corresponding PDF document format output. +/// +public interface ITokenWriter +{ + /// + /// Writes the given input token to the output stream with the correct PDF format and encoding including whitespace and line breaks as applicable. + /// + /// The token to write to the stream. + /// The stream to write the token to. + void WriteToken(IToken token, Stream outputStream); + + /// + /// Writes pre-serialized token as an object token to the output stream. + /// + /// Object number of the indirect object. + /// Generation of the indirect object. + /// Pre-serialized object contents. + /// The stream to write the token to. + void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream); + + /// + /// Writes a valid single section cross-reference (xref) table plus trailer dictionary to the output for the set of object offsets. + /// + /// The byte offset from the start of the document for each object in the document. + /// The object representing the catalog dictionary which is referenced from the trailer dictionary. + /// The output stream to write to. + /// The object reference for the document information dictionary if present. + void WriteCrossReferenceTable(IReadOnlyDictionary objectOffsets, IndirectReference catalogToken, Stream outputStream, IndirectReference? documentInformationReference); +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs new file mode 100644 index 00000000..fe77829a --- /dev/null +++ b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs @@ -0,0 +1,97 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using UglyToad.PdfPig.Core; +using UglyToad.PdfPig.Filters; +using UglyToad.PdfPig.Graphics.Operations.TextShowing; +using UglyToad.PdfPig.Graphics.Operations; +using UglyToad.PdfPig.Graphics; +using UglyToad.PdfPig.Logging; +using UglyToad.PdfPig.Parser; +using UglyToad.PdfPig.Tokens; + +namespace UglyToad.PdfPig.Writer +{ + /// + /// Derived class of that does not write or operations in streams + /// + internal class NoTextTokenWriter : TokenWriter + { + /// + /// Write stream without or operations + /// + /// + /// + protected override void WriteStream(StreamToken streamToken, Stream outputStream) + { + if (!TryGetStreamWithoutText(streamToken, out var outputStreamToken)) + { + outputStreamToken = streamToken; + } + WriteDictionary(outputStreamToken.StreamDictionary, outputStream); + WriteLineBreak(outputStream); + outputStream.Write(StreamStart, 0, StreamStart.Length); + WriteLineBreak(outputStream); + outputStream.Write(outputStreamToken.Data.ToArray(), 0, outputStreamToken.Data.Count); + WriteLineBreak(outputStream); + outputStream.Write(StreamEnd, 0, StreamEnd.Length); + } + + /// + /// Try get a stream without or operations. + /// + /// + /// + /// true if any text operation found (and we have a valid without the text operations), + /// false if no text operation found (in which case is null) + private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken outputStreamToken) + { + var filterProvider = new FilterProviderWithLookup(DefaultFilterProvider.Instance); + IReadOnlyList bytes; + try + { + bytes = streamToken.Decode(filterProvider); + } + catch + { + outputStreamToken = null; + return false; + } + + var pageContentParser = new PageContentParser(new ReflectionGraphicsStateOperationFactory()); + IReadOnlyList operations; + try + { + operations = pageContentParser.Parse(1, new ByteArrayInputBytes(bytes), new NoOpLog()); + } + catch (Exception) + { + outputStreamToken = null; + return false; + } + + using (var outputStreamT = new MemoryStream()) + { + var haveText = false; + foreach (var op in operations) + { + if (op.Operator == ShowText.Symbol || op.Operator == ShowTextsWithPositioning.Symbol) + { + haveText = true; + continue; + } + op.Write(outputStreamT); + } + if (!haveText) + { + outputStreamToken = null; + return false; + } + outputStreamT.Seek(0, SeekOrigin.Begin); + outputStreamToken = DataCompresser.CompressToStream(outputStreamT.ToArray()); + return true; + } + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs index 0d1cb1af..bd547b89 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs @@ -8,7 +8,7 @@ { private readonly Dictionary hashes = new Dictionary(new FNVByteComparison()); - public PdfDedupStreamWriter(Stream stream, bool dispose) : base(stream, dispose) + public PdfDedupStreamWriter(Stream stream, bool dispose, ITokenWriter tokenWriter = null) : base(stream, dispose, tokenWriter) { } diff --git a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs index 26c110e3..a13ea14d 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs @@ -26,7 +26,7 @@ namespace UglyToad.PdfPig.Writer private readonly Dictionary pages = new Dictionary(); private readonly Dictionary fonts = new Dictionary(); private bool completed = false; - internal int fontId = 0; + private int fontId = 0; private readonly static ArrayToken DefaultProcSet = new ArrayToken(new List { @@ -89,20 +89,21 @@ namespace UglyToad.PdfPig.Writer /// If stream should be disposed when builder is. /// Type of pdf stream writer to use /// Pdf version to use in header. - public PdfDocumentBuilder(Stream stream, bool disposeStream = false, PdfWriterType type = PdfWriterType.Default, decimal version = 1.7m) + /// Token writer to use + public PdfDocumentBuilder(Stream stream, bool disposeStream = false, PdfWriterType type = PdfWriterType.Default, decimal version = 1.7m, ITokenWriter tokenWriter = null) { switch (type) { case PdfWriterType.ObjectInMemoryDedup: - context = new PdfDedupStreamWriter(stream, disposeStream); + context = new PdfDedupStreamWriter(stream, disposeStream, tokenWriter); break; default: - context = new PdfStreamWriter(stream, disposeStream); + context = new PdfStreamWriter(stream, disposeStream, tokenWriter); break; } context.InitializePdf(version); } - + /// /// Determines whether the bytes of the TrueType font file provided can be used in a PDF document. /// diff --git a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs index 65d7ec23..0f0e6f7f 100644 --- a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs @@ -18,16 +18,17 @@ protected bool DisposeStream { get; set; } protected bool Initialized { get; set; } protected int CurrentNumber { get; set; } = 1; + protected readonly ITokenWriter TokenWriter; - internal PdfStreamWriter(Stream baseStream, bool disposeStream = true) + internal PdfStreamWriter(Stream baseStream, bool disposeStream = true, ITokenWriter tokenWriter = null) { Stream = baseStream ?? throw new ArgumentNullException(nameof(baseStream)); if (!baseStream.CanWrite) { throw new ArgumentException("Output stream must be writable"); } - DisposeStream = disposeStream; + TokenWriter = tokenWriter ?? new TokenWriter(); } public Stream Stream { get; protected set; } diff --git a/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs b/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs new file mode 100644 index 00000000..369c55b3 --- /dev/null +++ b/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs @@ -0,0 +1,100 @@ +using System; +using System.Collections.Generic; +using System.IO; + +namespace UglyToad.PdfPig.Writer +{ + /// + /// Class to remove text from PDFs, useful as a preprocessing step for Optical Character Recognition (OCR) + /// + public static class PdfTextRemover + { + /// + /// Return PDF without text as bytes + /// Path to PDF + /// List of pages to emit; if null all pages are emitted + /// + public static byte[] RemoveText(string filePath, IReadOnlyList pagesBundle = null) + { + using (var output = new MemoryStream()) + { + RemoveText(output, filePath, pagesBundle); + return output.ToArray(); + } + } + + /// + /// Write PDF without text to the output stream. The caller must manage disposing the output stream. + /// Must be writable + /// Path to PDF + /// List of pages to emit; if null all pages are emitted + /// + public static void RemoveText(Stream output, string filePath, IReadOnlyList pagesBundle = null) + { + using (var stream = File.OpenRead(filePath)) + { + RemoveText(stream, output, pagesBundle); + } + } + + /// + /// Remove text from the PDF (passed in as a byte array) and return it as a new byte array + /// PDF document (as byte array) + /// List of pages to emit; if null all pages are emitted + /// PDF without text (as a byte array) + /// + public static byte[] RemoveText(byte[] file, IReadOnlyList pagesBundle = null) + { + _ = file ?? throw new ArgumentNullException(nameof(file)); + + using (var output = new MemoryStream()) + { + RemoveText(PdfDocument.Open(file), output, pagesBundle); + return output.ToArray(); + } + } + + /// + /// Remove text from the PDF in the input stream and write it to the output stream. + /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream. + /// Streams for the file contents, this must support reading and seeking. + /// Must be writable + /// List of pages to emit; if null all pages are emitted + /// + public static void RemoveText(Stream stream, Stream output, IReadOnlyList pagesBundle = null) + { + _ = stream ?? throw new ArgumentNullException(nameof(stream)); + _ = output ?? throw new ArgumentNullException(nameof(output)); + + RemoveText(PdfDocument.Open(stream), output, pagesBundle); + } + + /// + /// Remove text from the PDF and write it to the output stream. + /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream. + /// PDF document + /// Must be writable + /// List of pages to emit; if null all pages are emitted + /// + public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList pagesBundle = null) + { + using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter())) + { + if (pagesBundle == null) + { + for (var i = 1; i <= file.NumberOfPages; i++) + { + document.AddPage(file, i); + } + } + else + { + foreach (var i in pagesBundle) + { + document.AddPage(file, i); + } + } + } + } + } +} diff --git a/src/UglyToad.PdfPig/Writer/TokenWriter.cs b/src/UglyToad.PdfPig/Writer/TokenWriter.cs index 88202ad6..05cac5b2 100644 --- a/src/UglyToad.PdfPig/Writer/TokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/TokenWriter.cs @@ -14,10 +14,8 @@ /// /// Writes any type of to the corresponding PDF document format output. /// - public class TokenWriter + public class TokenWriter : ITokenWriter { - private static readonly byte Backslash = GetByte("\\"); - private static readonly byte ArrayStart = GetByte("["); private static readonly byte ArrayEnd = GetByte("]"); @@ -46,10 +44,18 @@ private static readonly byte[] StartXref = OtherEncodings.StringAsLatin1Bytes("startxref"); - private static readonly byte[] StreamStart = OtherEncodings.StringAsLatin1Bytes("stream"); - private static readonly byte[] StreamEnd = OtherEncodings.StringAsLatin1Bytes("endstream"); + /// + /// Bytes that indicate start of stream + /// + protected static readonly byte[] StreamStart = OtherEncodings.StringAsLatin1Bytes("stream"); + + /// + /// Bytes that indicate end start of stream + /// + protected static readonly byte[] StreamEnd = OtherEncodings.StringAsLatin1Bytes("endstream"); private static readonly byte StringStart = GetByte("("); + private static readonly byte StringEnd = GetByte(")"); private static readonly byte[] Trailer = OtherEncodings.StringAsLatin1Bytes("trailer"); @@ -79,7 +85,7 @@ /// /// The token to write to the stream. /// The stream to write the token to. - public static void WriteToken(IToken token, Stream outputStream) + public void WriteToken(IToken token, Stream outputStream) { if (token == null) { @@ -129,14 +135,8 @@ } } - /// - /// Writes a valid single section cross-reference (xref) table plus trailer dictionary to the output for the set of object offsets. - /// - /// The byte offset from the start of the document for each object in the document. - /// The object representing the catalog dictionary which is referenced from the trailer dictionary. - /// The output stream to write to. - /// The object reference for the document information dictionary if present. - internal static void WriteCrossReferenceTable(IReadOnlyDictionary objectOffsets, + /// + public void WriteCrossReferenceTable(IReadOnlyDictionary objectOffsets, IndirectReference catalogToken, Stream outputStream, IndirectReference? documentInformationReference) @@ -271,14 +271,8 @@ outputStream.Write(Eof, 0, Eof.Length); } - /// - /// Writes pre-serialized token as an object token to the output stream. - /// - /// Object number of the indirect object. - /// Generation of the indirect object. - /// Pre-serialized object contents. - /// The stream to write the token to. - internal static void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) + /// + public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) { WriteLong(objectNumber, outputStream); WriteWhitespace(outputStream); @@ -297,14 +291,24 @@ WriteLineBreak(outputStream); } - private static void WriteHex(HexToken hex, Stream stream) + /// + /// Write a hex value to the output stream + /// + /// + /// + protected void WriteHex(HexToken hex, Stream stream) { stream.WriteByte(HexStart); stream.WriteText(hex.GetHexString()); stream.WriteByte(HexEnd); } - private static void WriteArray(ArrayToken array, Stream outputStream) + /// + /// Write an array to the output stream, with whitespace at the end. + /// + /// + /// + protected void WriteArray(ArrayToken array, Stream outputStream) { outputStream.WriteByte(ArrayStart); WriteWhitespace(outputStream); @@ -319,14 +323,24 @@ WriteWhitespace(outputStream); } - private static void WriteBoolean(BooleanToken boolean, Stream outputStream) + /// + /// Write a boolean "true" or "false" to the output stream, with whitespace at the end. + /// + /// + /// + protected void WriteBoolean(BooleanToken boolean, Stream outputStream) { var bytes = boolean.Data ? TrueBytes : FalseBytes; outputStream.Write(bytes, 0, bytes.Length); WriteWhitespace(outputStream); } - private static void WriteComment(CommentToken comment, Stream outputStream) + /// + /// Write a "%comment" in the output stream, with a line break at the end. + /// + /// + /// + protected void WriteComment(CommentToken comment, Stream outputStream) { var bytes = OtherEncodings.StringAsLatin1Bytes(comment.Data); outputStream.WriteByte(Comment); @@ -334,7 +348,12 @@ WriteLineBreak(outputStream); } - private static void WriteDictionary(DictionaryToken dictionary, Stream outputStream) + /// + /// Writes dictionary key/value pairs to output stream as Name/Token pairs. + /// + /// + /// + protected void WriteDictionary(DictionaryToken dictionary, Stream outputStream) { outputStream.Write(DictionaryStart, 0, DictionaryStart.Length); @@ -356,7 +375,12 @@ outputStream.Write(DictionaryEnd, 0, DictionaryEnd.Length); } - private static void WriteIndirectReference(IndirectReferenceToken reference, Stream outputStream) + /// + /// Write an indirect reference to the stream, with whitespace at the end. + /// + /// + /// + protected virtual void WriteIndirectReference(IndirectReferenceToken reference, Stream outputStream) { WriteLong(reference.Data.ObjectNumber, outputStream); WriteWhitespace(outputStream); @@ -368,12 +392,17 @@ WriteWhitespace(outputStream); } - private static void WriteName(NameToken name, Stream outputStream) + /// + /// Write a name to the stream, with whitespace at the end. + /// + /// + /// + protected virtual void WriteName(NameToken name, Stream outputStream) { WriteName(name.Data, outputStream); } - private static void WriteName(string name, Stream outputStream) + private void WriteName(string name, Stream outputStream) { /* * Beginning with PDF 1.2, any character except null (character code 0) may be @@ -404,7 +433,12 @@ WriteWhitespace(outputStream); } - private static void WriteNumber(NumericToken number, Stream outputStream) + /// + /// Write a number to the stream, with whitespace at the end. + /// + /// + /// + protected virtual void WriteNumber(NumericToken number, Stream outputStream) { if (!number.HasDecimalPlaces) { @@ -419,7 +453,15 @@ WriteWhitespace(outputStream); } - private static void WriteObject(ObjectToken objectToken, Stream outputStream) + /// + /// Write an object to the stream, with a line break at the end. It writes the following contents: + /// - "[ObjectNumber] [Generation] obj" + /// - Object data + /// - "endobj" + /// + /// + /// + protected virtual void WriteObject(ObjectToken objectToken, Stream outputStream) { WriteLong(objectToken.Number.ObjectNumber, outputStream); WriteWhitespace(outputStream); @@ -438,7 +480,16 @@ WriteLineBreak(outputStream); } - private static void WriteStream(StreamToken streamToken, Stream outputStream) + /// + /// Write a stream token to the output stream, with the following contents: + /// - Dictionary specifying the length of the stream, any applied compression filters and additional information. + /// - Stream start indicator + /// - Bytes in the StreamToken data + /// - Stream end indicator + /// + /// + /// + protected virtual void WriteStream(StreamToken streamToken, Stream outputStream) { WriteDictionary(streamToken.StreamDictionary, outputStream); WriteLineBreak(outputStream); @@ -449,15 +500,22 @@ outputStream.Write(StreamEnd, 0, StreamEnd.Length); } - private static int[] EscapeNeeded = new int[] + private static readonly int[] EscapeNeeded = new int[] { '\r', '\n', '\t', '\b', '\f', '\\' }; - private static int[] Escaped = new int[] + + private static readonly int[] Escaped = new int[] { 'r', 'n', 't', 'b', 'f', '\\' }; - private static void WriteString(StringToken stringToken, Stream outputStream) + + /// + /// Write string to the stream, with whitespace at the end + /// + /// + /// + protected virtual void WriteString(StringToken stringToken, Stream outputStream) { outputStream.WriteByte(StringStart); @@ -515,29 +573,47 @@ WriteWhitespace(outputStream); } - private static void WriteInt(int value, Stream outputStream) + /// + /// Write an integer to the stream + /// + /// + /// + protected virtual void WriteInt(int value, Stream outputStream) { var bytes = OtherEncodings.StringAsLatin1Bytes(value.ToString("G", CultureInfo.InvariantCulture)); outputStream.Write(bytes, 0, bytes.Length); } - private static void WriteLineBreak(Stream outputStream) + /// + /// Write a line break to the output stream + /// + /// + protected virtual void WriteLineBreak(Stream outputStream) { outputStream.WriteNewLine(); } - private static void WriteLong(long value, Stream outputStream) + /// + /// Write a long to the stream + /// + /// + /// + protected virtual void WriteLong(long value, Stream outputStream) { var bytes = OtherEncodings.StringAsLatin1Bytes(value.ToString("G", CultureInfo.InvariantCulture)); outputStream.Write(bytes, 0, bytes.Length); } - private static void WriteWhitespace(Stream outputStream) + /// + /// Write a space to the output stream + /// + /// + protected virtual void WriteWhitespace(Stream outputStream) { outputStream.WriteByte(Whitespace); } - private static void WriteFirstXrefEmptyEntry(Stream outputStream) + private void WriteFirstXrefEmptyEntry(Stream outputStream) { /* * The first entry in the table (object number 0) is always free and has a generation number of 65,535; @@ -591,5 +667,4 @@ } } } -} - +} \ No newline at end of file