Added a PdfTextRemover utility that uses a NoTextTokenWriter, to output PDFs without text contents.

Also added unit tests that check (1) that a custom ITokenWriter can be used with PdfDocumentBuilder, and (2) that removing text works.
2026-03-10 00:23:29 +08:00 · 2022-12-20 21:31:15 +01:00
parent 9273a43965
commit 6ef6c4d780
4 changed files with 361 additions and 82 deletions
--- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs
@@ -8,6 +8,7 @@
    using PdfPig.Fonts.Standard14Fonts;
    using PdfPig.Tokens;
    using PdfPig.Writer;
+    using System.Collections.Generic;
    using Tests.Fonts.TrueType;
    using Xunit;

@@ -1114,6 +1115,27 @@
            }
        }

+        /// <summary>
+        /// Copies every page of a sample document through a <see cref="PdfDocumentBuilder"/> that was
+        /// constructed with a counting <see cref="TestTokenWriter"/>, then checks that the builder
+        /// routed its output through that custom writer.
+        /// </summary>
+        [Fact]
+        public void CanUseCustomTokenWriter()
+        {
+            var docPath = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf");
+            var tw = new TestTokenWriter();
+
+            using (var doc = PdfDocument.Open(docPath))
+            using (var ms = new MemoryStream())
+            using (var builder = new PdfDocumentBuilder(ms, tokenWriter: tw))
+            {
+                for (var i = 1; i <= doc.NumberOfPages; i++)
+                {
+                    builder.AddPage(doc, i);
+                }
+                builder.Build();
+            }
+
+            // xUnit's Assert.Equal takes the expected value first, so a failure reports the right "Expected".
+            Assert.Equal(0, tw.Objects); // No objects in sample file
+            Assert.True(tw.Tokens > 1000); // Roughly 1065
+            Assert.True(tw.WroteCrossReferenceTable);
+        }
+
        private static void WriteFile(string name, byte[] bytes, string extension = "pdf")
        {
            try
@@ -1133,4 +1155,29 @@
            }
        }
    }
+
+    /// <summary>
+    /// <see cref="ITokenWriter"/> test double: it writes nothing to the output stream and only
+    /// records how often, and in which way, it was called.
+    /// </summary>
+    public class TestTokenWriter : ITokenWriter
+    {
+        /// <summary>Number of <c>WriteToken</c> calls received so far.</summary>
+        public int Tokens { get; private set; }
+
+        /// <summary>Number of <c>WriteObject</c> calls received so far.</summary>
+        public int Objects { get; private set; }
+
+        /// <summary>True once <c>WriteCrossReferenceTable</c> has been called at least once.</summary>
+        public bool WroteCrossReferenceTable { get; private set; }
+
+        /// <summary>Counts the call; the token and the stream are ignored.</summary>
+        public void WriteToken(IToken token, Stream outputStream) => Tokens += 1;
+
+        /// <summary>Counts the call; the object data and the stream are ignored.</summary>
+        public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) => Objects += 1;
+
+        /// <summary>Records that a cross reference table was requested; all arguments are ignored.</summary>
+        public void WriteCrossReferenceTable(
+            IReadOnlyDictionary<IndirectReference, long> objectOffsets,
+            IndirectReference catalogToken,
+            Stream outputStream,
+            IndirectReference? documentInformationReference) => WroteCrossReferenceTable = true;
+    }
 }
--- a/src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs
@@ -0,0 +1,35 @@
+using UglyToad.PdfPig.Tests.Integration;
+using UglyToad.PdfPig.Writer;
+using System.IO;
+using Xunit;
+
+namespace UglyToad.PdfPig.Tests.Writer
+{
+    /// <summary>
+    /// Tests for <see cref="PdfTextRemover"/>.
+    /// </summary>
+    public class PdfTextRemoverTests
+    {
+        /// <summary>
+        /// Runs <see cref="PdfTextRemover.RemoveText(string, System.Collections.Generic.IReadOnlyList{int})"/>
+        /// on a sample document and checks that the result has the same number of pages as the source,
+        /// that every source page had text, and that no page of the result has any text left.
+        /// </summary>
+        /// <param name="file">File name of the sample document, resolved through <c>IntegrationHelpers.GetDocumentPath</c>.</param>
+        [Theory]
+        [InlineData("Two Page Text Only - from libre office.pdf")]
+        [InlineData("cat-genetics.pdf")]
+        [InlineData("Motor Insurance claim form.pdf")]
+        [InlineData("Single Page Images - from libre office.pdf")]
+        public void TextRemoverRemovesText(string file)
+        {
+            var filePath = IntegrationHelpers.GetDocumentPath(file);
+            using (var document = PdfDocument.Open(filePath))
+            {
+                // The result is kept in memory only: the test must not depend on a machine-specific
+                // directory existing, nor leave files behind.
+                var withoutText = PdfTextRemover.RemoveText(filePath);
+                using (var documentWithoutText = PdfDocument.Open(withoutText))
+                {
+                    Assert.Equal(document.NumberOfPages, documentWithoutText.NumberOfPages);
+                    for (var i = 1; i <= documentWithoutText.NumberOfPages; i++)
+                    {
+                        // Expected value goes first so xUnit failure messages read correctly.
+                        Assert.NotEqual(string.Empty, document.GetPage(i).Text);
+                        Assert.Equal(string.Empty, documentWithoutText.GetPage(i).Text);
+                    }
+                }
+            }
+        }
+    }
+}
--- a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
+++ b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
@@ -0,0 +1,97 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using UglyToad.PdfPig.Core;
+using UglyToad.PdfPig.Filters;
+using UglyToad.PdfPig.Graphics.Operations.TextShowing;
+using UglyToad.PdfPig.Graphics.Operations;
+using UglyToad.PdfPig.Graphics;
+using UglyToad.PdfPig.Logging;
+using UglyToad.PdfPig.Parser;
+using UglyToad.PdfPig.Tokens;
+
+namespace UglyToad.PdfPig.Writer
+{
+    /// <summary>
+    /// Derived class of <see cref="TokenWriter"/> that does not write text-showing operations
+    /// (<see cref="ShowText"/>, <see cref="ShowTextsWithPositioning"/> and the <c>'</c> and <c>"</c>
+    /// operators) in streams.
+    /// </summary>
+    internal class NoTextTokenWriter : TokenWriter
+    {
+        // The PDF specification defines four text-showing operators: Tj, TJ, ' and ".
+        // Tj and TJ are matched through their operation classes; the two "move to next line and
+        // show text" operators are matched by their operator symbol.
+        private const string MoveToNextLineShowTextSymbol = "'";
+        private const string MoveToNextLineShowTextWithSpacingSymbol = "\"";
+
+        /// <summary>
+        /// Write the stream without text-showing operations. When the stream contains no such
+        /// operation, or cannot be decoded/parsed as content, it is written unchanged.
+        /// </summary>
+        /// <param name="streamToken">The stream to write.</param>
+        /// <param name="outputStream">The stream to write to.</param>
+        protected override void WriteStream(StreamToken streamToken, Stream outputStream)
+        {
+            if (!TryGetStreamWithoutText(streamToken, out var outputStreamToken))
+            {
+                // Nothing was stripped: fall back to the original stream.
+                outputStreamToken = streamToken;
+            }
+
+            // Materialise the data once so the buffer and the length written always agree.
+            var data = outputStreamToken.Data.ToArray();
+
+            WriteDictionary(outputStreamToken.StreamDictionary, outputStream);
+            WriteLineBreak(outputStream);
+            outputStream.Write(StreamStart, 0, StreamStart.Length);
+            WriteLineBreak(outputStream);
+            outputStream.Write(data, 0, data.Length);
+            WriteLineBreak(outputStream);
+            outputStream.Write(StreamEnd, 0, StreamEnd.Length);
+        }
+
+        /// <summary>
+        /// Try get a stream without text-showing operations.
+        /// </summary>
+        /// <param name="streamToken">The stream to inspect.</param>
+        /// <param name="outputStreamToken">The re-written stream when the method returns true, otherwise null.</param>
+        /// <returns>true if any text operation found (and we have a valid <paramref name="outputStreamToken"/> without the text operations),
+        /// false if no text operation found or the stream could not be decoded or parsed
+        /// (in which case <paramref name="outputStreamToken"/> is null)</returns>
+        private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken outputStreamToken)
+        {
+            outputStreamToken = null;
+
+            var filterProvider = new FilterProviderWithLookup(DefaultFilterProvider.Instance);
+            IReadOnlyList<byte> bytes;
+            try
+            {
+                bytes = streamToken.Decode(filterProvider);
+            }
+            catch (Exception)
+            {
+                // Best effort: a stream that cannot be decoded is left for the caller to write as-is.
+                return false;
+            }
+
+            var pageContentParser = new PageContentParser(new ReflectionGraphicsStateOperationFactory());
+            IReadOnlyList<IGraphicsStateOperation> operations;
+            try
+            {
+                operations = pageContentParser.Parse(1, new ByteArrayInputBytes(bytes), new NoOpLog());
+            }
+            catch (Exception)
+            {
+                // Best effort: every stream passes through here, and one that does not parse as
+                // page content is left for the caller to write as-is.
+                return false;
+            }
+
+            using (var outputStreamT = new MemoryStream())
+            {
+                var haveText = false;
+                foreach (var op in operations)
+                {
+                    if (IsTextShowingOperator(op.Operator))
+                    {
+                        haveText = true;
+                        continue;
+                    }
+                    op.Write(outputStreamT);
+                }
+
+                if (!haveText)
+                {
+                    return false;
+                }
+
+                // MemoryStream.ToArray copies the whole buffer regardless of Position, so no rewind is needed.
+                // NOTE(review): the replacement token carries whatever dictionary DataCompresser.CompressToStream
+                // builds, not streamToken.StreamDictionary - confirm no entries of the original dictionary
+                // need to be preserved.
+                outputStreamToken = DataCompresser.CompressToStream(outputStreamT.ToArray());
+                return true;
+            }
+        }
+
+        /// <summary>
+        /// Whether <paramref name="operatorSymbol"/> is one of the four PDF text-showing operators.
+        /// </summary>
+        private static bool IsTextShowingOperator(string operatorSymbol)
+        {
+            return operatorSymbol == ShowText.Symbol
+                   || operatorSymbol == ShowTextsWithPositioning.Symbol
+                   || operatorSymbol == MoveToNextLineShowTextSymbol
+                   || operatorSymbol == MoveToNextLineShowTextWithSpacingSymbol;
+        }
+    }
+}
--- a/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs
+++ b/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs
@@ -0,0 +1,100 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace UglyToad.PdfPig.Writer
+{
+    /// <summary>
+    /// Class to remove text from PDFs, useful as a preprocessing step for Optical Character Recognition (OCR)
+    /// </summary>
+    public static class PdfTextRemover
+    {
+        /// <summary>
+        /// Return PDF without text as bytes
+        /// </summary>
+        /// <param name="filePath">Path to PDF</param>
+        /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
+        /// <returns>PDF without text (as a byte array)</returns>
+        public static byte[] RemoveText(string filePath, IReadOnlyList<int> pagesBundle = null)
+        {
+            using (var output = new MemoryStream())
+            {
+                RemoveText(output, filePath, pagesBundle);
+                return output.ToArray();
+            }
+        }
+
+        /// <summary>
+        /// Write PDF without text to the output stream. The caller must manage disposing the output stream.
+        /// </summary>
+        /// <param name="output">Must be writable</param>
+        /// <param name="filePath">Path to PDF</param>
+        /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
+        public static void RemoveText(Stream output, string filePath, IReadOnlyList<int> pagesBundle = null)
+        {
+            using (var stream = File.OpenRead(filePath))
+            {
+                RemoveText(stream, output, pagesBundle);
+            }
+        }
+
+        /// <summary>
+        /// Remove text from the PDF (passed in as a byte array) and return it as a new byte array
+        /// </summary>
+        /// <param name="file">PDF document (as byte array)</param>
+        /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
+        /// <returns>PDF without text (as a byte array)</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
+        public static byte[] RemoveText(byte[] file, IReadOnlyList<int> pagesBundle = null)
+        {
+            _ = file ?? throw new ArgumentNullException(nameof(file));
+
+            using (var output = new MemoryStream())
+            {
+                // The document is opened here, so it is disposed here as well.
+                using (var document = PdfDocument.Open(file))
+                {
+                    RemoveText(document, output, pagesBundle);
+                }
+
+                return output.ToArray();
+            }
+        }
+
+        /// <summary>
+        /// Remove text from the PDF in the input stream and write it to the output stream.
+        /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
+        /// </summary>
+        /// <param name="stream">Streams for the file contents, this must support reading and seeking.</param>
+        /// <param name="output">Must be writable</param>
+        /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
+        /// <exception cref="ArgumentNullException"><paramref name="stream"/> or <paramref name="output"/> is null.</exception>
+        public static void RemoveText(Stream stream, Stream output, IReadOnlyList<int> pagesBundle = null)
+        {
+            _ = stream ?? throw new ArgumentNullException(nameof(stream));
+            _ = output ?? throw new ArgumentNullException(nameof(output));
+
+            // The document is opened here, so it is disposed here as well; the input stream itself
+            // stays under the caller's control.
+            using (var document = PdfDocument.Open(stream))
+            {
+                RemoveText(document, output, pagesBundle);
+            }
+        }
+
+        /// <summary>
+        /// Remove text from the PDF and write it to the output stream.
+        /// The caller must manage disposing the document and the output stream.
+        /// </summary>
+        /// <param name="file">PDF document</param>
+        /// <param name="output">Must be writable</param>
+        /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
+        /// <exception cref="ArgumentNullException"><paramref name="file"/> or <paramref name="output"/> is null.</exception>
+        public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList<int> pagesBundle = null)
+        {
+            _ = file ?? throw new ArgumentNullException(nameof(file));
+            _ = output ?? throw new ArgumentNullException(nameof(output));
+
+            // NOTE(review): Build() is never called here, so this presumably relies on disposing the
+            // builder to finish writing the document to the output stream - confirm.
+            using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter()))
+            {
+                if (pagesBundle == null)
+                {
+                    // No selection given: copy every page, numbered from 1 to NumberOfPages.
+                    for (var i = 1; i <= file.NumberOfPages; i++)
+                    {
+                        document.AddPage(file, i);
+                    }
+                }
+                else
+                {
+                    // Pages are emitted in the order listed by the caller.
+                    foreach (var i in pagesBundle)
+                    {
+                        document.AddPage(file, i);
+                    }
+                }
+            }
+        }
+    }
+}