using System;
using System.Collections.Generic;
using System.IO;

namespace UglyToad.PdfPig.Writer
{
    /// <summary>
    /// Class to remove text from PDFs, useful as a preprocessing step for Optical Character Recognition (OCR).
    /// Note that this should not be used to redact content from PDFs; it is not a secure or reliable way to redact text.
    /// </summary>
    public static class PdfTextRemover
    {
        /// <summary>
        /// Read the PDF at <paramref name="filePath"/> and return a copy without text as a byte array.
        /// </summary>
        /// <param name="filePath">Path to the PDF file.</param>
        /// <param name="pagesBundle">List of page numbers to emit; if null all pages are emitted.</param>
        /// <returns>The PDF without text, as bytes.</returns>
        public static byte[] RemoveText(string filePath, IReadOnlyList<int> pagesBundle = null)
        {
            using (var output = new MemoryStream())
            {
                RemoveText(output, filePath, pagesBundle);
                return output.ToArray();
            }
        }

        /// <summary>
        /// Read the PDF at <paramref name="filePath"/> and write a copy without text to the output stream.
        /// The caller must manage disposing the output stream.
        /// </summary>
        /// <param name="output">Destination stream. Must be writable.</param>
        /// <param name="filePath">Path to the PDF file.</param>
        /// <param name="pagesBundle">List of page numbers to emit; if null all pages are emitted.</param>
        public static void RemoveText(Stream output, string filePath, IReadOnlyList<int> pagesBundle = null)
        {
            // The file stream is opened here, so it is owned (and closed) here.
            using (var stream = File.OpenRead(filePath))
            {
                RemoveText(stream, output, pagesBundle);
            }
        }

        /// <summary>
        /// Remove text from the PDF (passed in as a byte array) and return it as a new byte array.
        /// </summary>
        /// <param name="file">PDF document (as byte array).</param>
        /// <param name="pagesBundle">List of page numbers to emit; if null all pages are emitted.</param>
        /// <returns>PDF without text (as a byte array).</returns>
        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
        public static byte[] RemoveText(byte[] file, IReadOnlyList<int> pagesBundle = null)
        {
            _ = file ?? throw new ArgumentNullException(nameof(file));

            using (var output = new MemoryStream())
            // The document is created by this method, so it must be disposed by this method.
            using (var document = PdfDocument.Open(file))
            {
                RemoveText(document, output, pagesBundle);
                return output.ToArray();
            }
        }

        /// <summary>
        /// Remove text from the PDF in the input stream and write it to the output stream.
        /// The caller must manage disposing both streams. The created PdfDocument will not dispose the input stream.
        /// </summary>
        /// <param name="stream">Stream for the file contents; this must support reading and seeking.</param>
        /// <param name="output">Destination stream. Must be writable.</param>
        /// <param name="pagesBundle">List of page numbers to emit; if null all pages are emitted.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="stream"/> or <paramref name="output"/> is null.
        /// </exception>
        public static void RemoveText(Stream stream, Stream output, IReadOnlyList<int> pagesBundle = null)
        {
            _ = stream ?? throw new ArgumentNullException(nameof(stream));
            _ = output ?? throw new ArgumentNullException(nameof(output));

            // Dispose the document we opened; per the contract above this leaves
            // the caller's input stream open.
            using (var document = PdfDocument.Open(stream))
            {
                RemoveText(document, output, pagesBundle);
            }
        }

        /// <summary>
        /// Remove text from the PDF and write it to the output stream.
        /// The caller must manage disposing the document and the output stream.
        /// </summary>
        /// <param name="file">PDF document.</param>
        /// <param name="output">Destination stream. Must be writable.</param>
        /// <param name="pagesBundle">
        /// List of page numbers to emit, in the order given; if null all pages
        /// (1 to <c>file.NumberOfPages</c>) are emitted.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="file"/> or <paramref name="output"/> is null.
        /// </exception>
        public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList<int> pagesBundle = null)
        {
            // Validate up front, like the other overloads, instead of failing on file.Version below.
            _ = file ?? throw new ArgumentNullException(nameof(file));
            _ = output ?? throw new ArgumentNullException(nameof(output));

            // Pages are copied through a builder configured with NoTextTokenWriter.
            // NOTE(review): the 'false' argument is presumably the builder's dispose-stream
            // flag, matching the "caller owns output" contract - confirm against PdfDocumentBuilder.
            using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter()))
            {
                if (pagesBundle == null)
                {
                    for (var i = 1; i <= file.NumberOfPages; i++)
                    {
                        document.AddPage(file, i);
                    }
                }
                else
                {
                    foreach (var i in pagesBundle)
                    {
                        document.AddPage(file, i);
                    }
                }
            }
        }
    }
}