Added a PdfTextRemover utility that uses a NoTextTokenWriter, to output PDFs without text contents.

Also added unit tests that check: (1) that a custom ITokenWriter can be used with PdfDocumentBuilder; (2) that removing text works.
2026-01-18 19:51:24 +08:00 · 2022-12-20 21:31:15 +01:00
parent 9273a43965
commit 6ef6c4d780
4 changed files with 361 additions and 82 deletions
--- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs
@@ -1,73 +1,74 @@
 namespace UglyToad.PdfPig.Tests.Writer
 {
    using System.IO;
-    using System.Linq;
-    using Content;
-    using Integration;
-    using PdfPig.Core;
-    using PdfPig.Fonts.Standard14Fonts;
-    using PdfPig.Tokens;
-    using PdfPig.Writer;
+    using System.Linq;
+    using Content;
+    using Integration;
+    using PdfPig.Core;
+    using PdfPig.Fonts.Standard14Fonts;
+    using PdfPig.Tokens;
+    using PdfPig.Writer;
+    using System.Collections.Generic;
    using Tests.Fonts.TrueType;
    using Xunit;

-    public class PdfDocumentBuilderTests
-    {
-        [Fact]
-        public void CanWriteSingleBlankPage()
-        {
-            var result = CreateSingleBlankPage();
-
-            WriteFile(nameof(CanWriteSinglePageHelloWorld), result);
-
-            Assert.NotEmpty(result);
-
-            var str = OtherEncodings.BytesAsLatin1String(result);
-            Assert.StartsWith("%PDF", str);
-            Assert.EndsWith("%%EOF", str);
-        }
-
-        [Fact]
-        public void CanCreateSingleCustomPageSize()
-        {
-            var builder = new PdfDocumentBuilder();
-
-            var page = builder.AddPage(120, 250);
-
-            var font = builder.AddStandard14Font(Standard14Font.Helvetica);
-
-            page.AddText("Small page.", 12, new PdfPoint(25, 200), font);
-
-            var bytes = builder.Build();
-
-            WriteFile(nameof(CanCreateSingleCustomPageSize), bytes);
-
-            using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff))
-            {
-                Assert.Equal(1, document.NumberOfPages);
-
-                var page1 = document.GetPage(1);
-
-                Assert.Equal(120, page1.Width);
-                Assert.Equal(250, page1.Height);
-
-                Assert.Equal("Small page.", page1.Text);
-            }
+    public class PdfDocumentBuilderTests
+    {
+        [Fact]
+        public void CanWriteSingleBlankPage()
+        {
+            var result = CreateSingleBlankPage();
+
+            // Save under this test's own name rather than another test's.
+            WriteFile(nameof(CanWriteSingleBlankPage), result);
+            Assert.NotEmpty(result);
+
+            var str = OtherEncodings.BytesAsLatin1String(result);
+            Assert.StartsWith("%PDF", str);
+            Assert.EndsWith("%%EOF", str);
        }

-        [Fact]
-        public void CanFastAddPageAndInheritProps()
-        {
-            var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf");
-            var contents = File.ReadAllBytes(first);
-
+        [Fact]
+        public void CanCreateSingleCustomPageSize()
+        {
+            var builder = new PdfDocumentBuilder();

-            byte[] results = null;
-            using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
-            using (var output = new PdfDocumentBuilder())
+            var page = builder.AddPage(120, 250);
+
+            var font = builder.AddStandard14Font(Standard14Font.Helvetica);
+
+            page.AddText("Small page.", 12, new PdfPoint(25, 200), font);
+
+            var bytes = builder.Build();
+
+            WriteFile(nameof(CanCreateSingleCustomPageSize), bytes);
+
+            using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff))
+            {
+                Assert.Equal(1, document.NumberOfPages);
+
+                var page1 = document.GetPage(1);
+
+                Assert.Equal(120, page1.Width);
+                Assert.Equal(250, page1.Height);
+
+                Assert.Equal("Small page.", page1.Text);
+            }
+        }
+
+        [Fact]
+        public void CanFastAddPageAndInheritProps()
+        {
+            var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf");
+            var contents = File.ReadAllBytes(first);
+
+
+            byte[] results = null;
+            using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
+            using (var output = new PdfDocumentBuilder())
            {
                output.AddPage(existing, 1);
-                results = output.Build();
+                results = output.Build();
            }

            using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
@@ -75,22 +76,22 @@
                var pg = rewritted.GetPage(1);
                Assert.Equal(200, pg.MediaBox.Bounds.Width);
                Assert.Equal(100, pg.MediaBox.Bounds.Height);
-            }
+            }
        }

-        [Fact]
-        public void CanFastAddPageWithStreamSubtype()
-        {
-            var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf");
-            var contents = File.ReadAllBytes(first);
-
+        [Fact]
+        public void CanFastAddPageWithStreamSubtype()
+        {
+            var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf");
+            var contents = File.ReadAllBytes(first);

-            byte[] results = null;
-            using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
-            using (var output = new PdfDocumentBuilder())
+
+            byte[] results = null;
+            using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
+            using (var output = new PdfDocumentBuilder())
            {
                output.AddPage(existing, 1);
-                results = output.Build();
+                results = output.Build();
            }

            using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
@@ -98,26 +99,26 @@
                // really just checking for no exception...
                var pg = rewritted.GetPage(1);
                Assert.NotNull(pg.Content);
-            }
+            }
        }

-        [Fact]
-        public void CanFastAddPageAndStripLinkAnnots()
-        {
-            var first = IntegrationHelpers.GetDocumentPath("outline.pdf");
-            var contents = File.ReadAllBytes(first);
-
+        [Fact]
+        public void CanFastAddPageAndStripLinkAnnots()
+        {
+            var first = IntegrationHelpers.GetDocumentPath("outline.pdf");
+            var contents = File.ReadAllBytes(first);
+
            var annotCount = 0;
-            byte[] results = null;
-            using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
-            using (var output = new PdfDocumentBuilder())
+            byte[] results = null;
+            using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
+            using (var output = new PdfDocumentBuilder())
            {
                output.AddPage(existing, 1);
                results = output.Build();
                var pg = existing.GetPage(1);
                var annots = pg.ExperimentalAccess.GetAnnotations().ToList();
                annotCount = annots.Count;
-                Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link);
+                Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link);
            }

            using (var rewritten = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
@@ -126,7 +127,7 @@
                var annots = pg.ExperimentalAccess.GetAnnotations().ToList();
                Assert.Equal(annotCount - 1, annots.Count);
                Assert.DoesNotContain(annots, x => x.Type == Annotations.AnnotationType.Link);
-            }
+            }
        }

        [Fact]
@@ -1114,6 +1115,27 @@
            }
        }

+        [Fact]
+        public void CanUseCustomTokenWriter()
+        {
+            var docPath = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf");
+            var tw = new TestTokenWriter();
+            // Copy every page with the counting writer plugged into the builder.
+            using (var doc = PdfDocument.Open(docPath))
+            using (var ms = new MemoryStream())
+            using (var builder = new PdfDocumentBuilder(ms, tokenWriter: tw))
+            {
+                for (var i = 1; i <= doc.NumberOfPages; i++)
+                {
+                    builder.AddPage(doc, i);
+                }
+                builder.Build();
+            }
+            Assert.Equal(0, tw.Objects); // No objects in sample file (expected value goes first)
+            Assert.True(tw.Tokens > 1000); // Roughly 1065
+            Assert.True(tw.WroteCrossReferenceTable);
+        }
+
        private static void WriteFile(string name, byte[] bytes, string extension = "pdf")
        {
            try
@@ -1133,4 +1155,29 @@
            }
        }
    }
+
+    /// <summary>Test double for <see cref="ITokenWriter"/> that only counts the calls it receives.</summary>
+    /// <remarks>None of the members write anything to the output stream they are given.</remarks>
+    public class TestTokenWriter : ITokenWriter
+    {
+        /// <summary>Number of <see cref="WriteToken"/> calls received.</summary>
+        public int Tokens { get; private set; }
+        /// <summary>Number of <see cref="WriteObject"/> calls received.</summary>
+        public int Objects { get; private set; }
+        /// <summary>True once <see cref="WriteCrossReferenceTable"/> has been called.</summary>
+        public bool WroteCrossReferenceTable { get; private set; }
+
+        /// <summary>Counts the call; <paramref name="token"/> is not serialized.</summary>
+        public void WriteToken(IToken token, Stream outputStream) => Tokens++;
+
+        /// <summary>Counts the call; <paramref name="data"/> is discarded.</summary>
+        public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) => Objects++;
+
+        /// <summary>Records that a cross reference table was requested; no table is written.</summary>
+        public void WriteCrossReferenceTable(IReadOnlyDictionary<IndirectReference, long> objectOffsets,
+            IndirectReference catalogToken,
+            Stream outputStream,
+            IndirectReference? documentInformationReference)
+            => WroteCrossReferenceTable = true;
+    }
 }
--- a/src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs
@@ -0,0 +1,42 @@
+using UglyToad.PdfPig.Tests.Integration;
+using UglyToad.PdfPig.Writer;
+using System.IO;
+using Xunit;
+
+namespace UglyToad.PdfPig.Tests.Writer
+{
+    /// <summary>
+    /// Tests for <see cref="PdfTextRemover"/>.
+    /// </summary>
+    public class PdfTextRemoverTests
+    {
+        /// <summary>
+        /// Removing text must keep the page count and leave every page without extractable text.
+        /// </summary>
+        /// <param name="file">File name of the sample document, resolved through IntegrationHelpers.GetDocumentPath</param>
+        [Theory]
+        [InlineData("Two Page Text Only - from libre office.pdf")]
+        [InlineData("cat-genetics.pdf")]
+        [InlineData("Motor Insurance claim form.pdf")]
+        [InlineData("Single Page Images - from libre office.pdf")]
+        public void TextRemoverRemovesText(string file)
+        {
+            var filePath = IntegrationHelpers.GetDocumentPath(file);
+            using (var document = PdfDocument.Open(filePath))
+            {
+                // The result is checked in memory only; nothing is written to a machine-specific path.
+                var withoutText = PdfTextRemover.RemoveText(filePath);
+                using (var documentWithoutText = PdfDocument.Open(withoutText))
+                {
+                    Assert.Equal(document.NumberOfPages, documentWithoutText.NumberOfPages);
+                    for (var i = 1; i <= documentWithoutText.NumberOfPages; i++)
+                    {
+                        // Guard against a vacuous pass: the source page must contain text to begin with.
+                        Assert.NotEqual(string.Empty, document.GetPage(i).Text);
+                        Assert.Equal(string.Empty, documentWithoutText.GetPage(i).Text);
+                    }
+                }
+            }
+        }
+    }
+}
--- a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
+++ b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
@@ -0,0 +1,115 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using UglyToad.PdfPig.Core;
+using UglyToad.PdfPig.Filters;
+using UglyToad.PdfPig.Graphics.Operations.TextShowing;
+using UglyToad.PdfPig.Graphics.Operations;
+using UglyToad.PdfPig.Graphics;
+using UglyToad.PdfPig.Logging;
+using UglyToad.PdfPig.Parser;
+using UglyToad.PdfPig.Tokens;
+
+namespace UglyToad.PdfPig.Writer
+{
+    /// <summary>
+    /// Derived class of <see cref="TokenWriter"/> that does not write <see cref="ShowText"/> or <see cref="ShowTextsWithPositioning"/> operations in streams
+    /// </summary>
+    internal class NoTextTokenWriter : TokenWriter
+    {
+        /// <summary>
+        /// Write stream without <see cref="ShowText"/> or <see cref="ShowTextsWithPositioning"/> operations.
+        /// A stream is written unchanged when it cannot be decoded or parsed, or when it holds no such operation.
+        /// </summary>
+        /// <param name="streamToken">Stream to write</param>
+        /// <param name="outputStream">Destination the serialized stream is written to</param>
+        protected override void WriteStream(StreamToken streamToken, Stream outputStream)
+        {
+            if (!TryGetStreamWithoutText(streamToken, out var outputStreamToken))
+            {
+                outputStreamToken = streamToken;
+            }
+
+            // NOTE(review): for a rewritten stream this dictionary is the one produced by DataCompresser.CompressToStream;
+            // nothing here copies entries from the original stream dictionary - confirm none of them are needed.
+            WriteDictionary(outputStreamToken.StreamDictionary, outputStream);
+            WriteLineBreak(outputStream);
+            outputStream.Write(StreamStart, 0, StreamStart.Length);
+            WriteLineBreak(outputStream);
+
+            // Copy the payload once so the written bytes and their length come from the same buffer.
+            var data = outputStreamToken.Data.ToArray();
+            outputStream.Write(data, 0, data.Length);
+
+            WriteLineBreak(outputStream);
+            outputStream.Write(StreamEnd, 0, StreamEnd.Length);
+        }
+
+        /// <summary>
+        /// Try get a stream without <see cref="ShowText"/> or <see cref="ShowTextsWithPositioning"/> operations.
+        /// </summary>
+        /// <param name="streamToken">Stream to filter</param>
+        /// <param name="outputStreamToken">Compressed stream without the text operations, or null when false is returned</param>
+        /// <returns>true if any text operation found (and we have a valid <paramref name="outputStreamToken"/> without the text operations),
+        /// false if the stream could not be decoded or parsed, or no text operation was found (in which case <paramref name="outputStreamToken"/> is null)</returns>
+        private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken outputStreamToken)
+        {
+            // NOTE(review): nothing here checks that the stream is a page content stream, so any stream passed in is
+            // decoded and parsed as one - confirm that binary payloads cannot be misread as content operations.
+            var filterProvider = new FilterProviderWithLookup(DefaultFilterProvider.Instance);
+            IReadOnlyList<byte> bytes;
+            try
+            {
+                bytes = streamToken.Decode(filterProvider);
+            }
+            catch (Exception)
+            {
+                // Deliberate best effort: a stream that cannot be decoded is reported as "no text" so the caller writes it unchanged.
+                outputStreamToken = null;
+                return false;
+            }
+
+            var pageContentParser = new PageContentParser(new ReflectionGraphicsStateOperationFactory());
+            IReadOnlyList<IGraphicsStateOperation> operations;
+            try
+            {
+                operations = pageContentParser.Parse(1, new ByteArrayInputBytes(bytes), new NoOpLog());
+            }
+            catch (Exception)
+            {
+                // Same best-effort fallback when the decoded bytes do not parse as content operations.
+                outputStreamToken = null;
+                return false;
+            }
+
+            using (var outputStreamT = new MemoryStream())
+            {
+                var haveText = false;
+                foreach (var op in operations)
+                {
+                    // NOTE(review): only these two operators are dropped; the PDF text-showing operators
+                    // ' and " are not matched here - confirm whether they should be removed too.
+                    if (op.Operator == ShowText.Symbol || op.Operator == ShowTextsWithPositioning.Symbol)
+                    {
+                        haveText = true;
+                        continue;
+                    }
+
+                    op.Write(outputStreamT);
+                }
+
+                if (!haveText)
+                {
+                    // Nothing was removed, so the caller keeps the original stream instead of a re-encoded copy.
+                    outputStreamToken = null;
+                    return false;
+                }
+
+                // ToArray returns the whole buffer regardless of the stream position, so no rewind is needed.
+                outputStreamToken = DataCompresser.CompressToStream(outputStreamT.ToArray());
+                return true;
+            }
+        }
+    }
+}
--- a/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs
+++ b/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs
@@ -0,0 +1,113 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace UglyToad.PdfPig.Writer
+{
+    /// <summary>
+    /// Class to remove text from PDFs, useful as a preprocessing step for Optical Character Recognition (OCR)
+    /// </summary>
+    public static class PdfTextRemover
+    {
+        /// <summary>
+        /// Return PDF without text as bytes
+        /// </summary>
+        /// <param name="filePath">Path to PDF</param>
+        /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
+        /// <returns>PDF without text (as a byte array)</returns>
+        public static byte[] RemoveText(string filePath, IReadOnlyList<int> pagesBundle = null)
+        {
+            using (var output = new MemoryStream())
+            {
+                RemoveText(output, filePath, pagesBundle);
+                return output.ToArray();
+            }
+        }
+
+        /// <summary>
+        /// Write PDF without text to the output stream. The caller must manage disposing the output stream.
+        /// </summary>
+        /// <param name="output">Must be writable</param>
+        /// <param name="filePath">Path to PDF</param>
+        /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
+        public static void RemoveText(Stream output, string filePath, IReadOnlyList<int> pagesBundle = null)
+        {
+            using (var stream = File.OpenRead(filePath))
+            {
+                RemoveText(stream, output, pagesBundle);
+            }
+        }
+
+        /// <summary>
+        /// Remove text from the PDF (passed in as a byte array) and return it as a new byte array
+        /// </summary>
+        /// <param name="file">PDF document (as byte array)</param>
+        /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
+        /// <returns>PDF without text (as a byte array)</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null</exception>
+        public static byte[] RemoveText(byte[] file, IReadOnlyList<int> pagesBundle = null)
+        {
+            _ = file ?? throw new ArgumentNullException(nameof(file));
+
+            using (var output = new MemoryStream())
+            using (var document = PdfDocument.Open(file))
+            {
+                RemoveText(document, output, pagesBundle);
+                return output.ToArray();
+            }
+        }
+
+        /// <summary>
+        /// Remove text from the PDF in the input stream and write it to the output stream.
+        /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
+        /// </summary>
+        /// <param name="stream">Streams for the file contents, this must support reading and seeking.</param>
+        /// <param name="output">Must be writable</param>
+        /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
+        /// <exception cref="ArgumentNullException"><paramref name="stream"/> or <paramref name="output"/> is null</exception>
+        public static void RemoveText(Stream stream, Stream output, IReadOnlyList<int> pagesBundle = null)
+        {
+            _ = stream ?? throw new ArgumentNullException(nameof(stream));
+            _ = output ?? throw new ArgumentNullException(nameof(output));
+
+            // The document opened here is owned by this method, so it is released before returning.
+            using (var document = PdfDocument.Open(stream))
+            {
+                RemoveText(document, output, pagesBundle);
+            }
+        }
+
+        /// <summary>
+        /// Remove text from the PDF and write it to the output stream.
+        /// The caller keeps ownership of <paramref name="file"/>; this method does not dispose it.
+        /// </summary>
+        /// <param name="file">PDF document</param>
+        /// <param name="output">Must be writable</param>
+        /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
+        /// <exception cref="ArgumentNullException"><paramref name="file"/> or <paramref name="output"/> is null</exception>
+        public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList<int> pagesBundle = null)
+        {
+            _ = file ?? throw new ArgumentNullException(nameof(file));
+            _ = output ?? throw new ArgumentNullException(nameof(output));
+
+            // NOTE(review): there is no explicit Build call; presumably the builder finishes writing to output when disposed - confirm.
+            using (var builder = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter()))
+            {
+                if (pagesBundle == null)
+                {
+                    for (var i = 1; i <= file.NumberOfPages; i++)
+                    {
+                        builder.AddPage(file, i);
+                    }
+                }
+                else
+                {
+                    foreach (var i in pagesBundle)
+                    {
+                        builder.AddPage(file, i);
+                    }
+                }
+            }
+        }
+    }
+}