using System;
using System.Collections.Generic;
using System.IO;
namespace UglyToad.PdfPig.Writer
{
    /// <summary>
    /// Class to remove text from PDFs, useful as a preprocessing step for Optical Character Recognition (OCR).
    /// Note that this should not be used to redact content from PDFs, this is not a secure or reliable way to redact text.
    /// </summary>
    public static class PdfTextRemover
    {
        /// <summary>
        /// Read the PDF at <paramref name="filePath"/> and return a copy of it without text as bytes.
        /// </summary>
        /// <param name="filePath">Path to PDF.</param>
        /// <param name="pagesBundle">
        /// 1-based page numbers to emit, in the order they should be written; if null all pages are emitted.
        /// </param>
        /// <returns>PDF without text (as a byte array).</returns>
        public static byte[] RemoveText(string filePath, IReadOnlyList<int> pagesBundle = null)
        {
            using (var output = new MemoryStream())
            {
                RemoveText(output, filePath, pagesBundle);
                return output.ToArray();
            }
        }

        /// <summary>
        /// Write PDF without text to the output stream. The caller must manage disposing the output stream.
        /// </summary>
        /// <param name="output">Must be writable.</param>
        /// <param name="filePath">Path to PDF.</param>
        /// <param name="pagesBundle">
        /// 1-based page numbers to emit, in the order they should be written; if null all pages are emitted.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="output"/> is null.</exception>
        public static void RemoveText(Stream output, string filePath, IReadOnlyList<int> pagesBundle = null)
        {
            // Fail before touching the file system when there is nowhere to write the result.
            _ = output ?? throw new ArgumentNullException(nameof(output));

            using (var stream = File.OpenRead(filePath))
            {
                RemoveText(stream, output, pagesBundle);
            }
        }

        /// <summary>
        /// Remove text from the PDF (passed in as a byte array) and return it as a new byte array.
        /// </summary>
        /// <param name="file">PDF document (as byte array).</param>
        /// <param name="pagesBundle">
        /// 1-based page numbers to emit, in the order they should be written; if null all pages are emitted.
        /// </param>
        /// <returns>PDF without text (as a byte array).</returns>
        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
        public static byte[] RemoveText(byte[] file, IReadOnlyList<int> pagesBundle = null)
        {
            _ = file ?? throw new ArgumentNullException(nameof(file));

            // The document is created here, so it is owned (and must be disposed) here.
            using (var document = PdfDocument.Open(file))
            using (var output = new MemoryStream())
            {
                RemoveText(document, output, pagesBundle);
                return output.ToArray();
            }
        }

        /// <summary>
        /// Remove text from the PDF in the input stream and write it to the output stream.
        /// The caller must manage disposing both streams. The PdfDocument created internally
        /// is disposed before this method returns and will not dispose the input stream.
        /// </summary>
        /// <param name="stream">Stream for the file contents, this must support reading and seeking.</param>
        /// <param name="output">Must be writable.</param>
        /// <param name="pagesBundle">
        /// 1-based page numbers to emit, in the order they should be written; if null all pages are emitted.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="stream"/> or <paramref name="output"/> is null.
        /// </exception>
        public static void RemoveText(Stream stream, Stream output, IReadOnlyList<int> pagesBundle = null)
        {
            _ = stream ?? throw new ArgumentNullException(nameof(stream));
            _ = output ?? throw new ArgumentNullException(nameof(output));

            // The document is created here, so it is owned (and must be disposed) here.
            using (var document = PdfDocument.Open(stream))
            {
                RemoveText(document, output, pagesBundle);
            }
        }

        /// <summary>
        /// Remove text from the PDF and write it to the output stream.
        /// The caller keeps ownership of both arguments: neither the document nor the
        /// output stream is disposed by this method.
        /// </summary>
        /// <param name="file">PDF document.</param>
        /// <param name="output">Must be writable.</param>
        /// <param name="pagesBundle">
        /// 1-based page numbers to emit, in the order they should be written; if null all pages are emitted.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="file"/> or <paramref name="output"/> is null.
        /// </exception>
        public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList<int> pagesBundle = null)
        {
            _ = file ?? throw new ArgumentNullException(nameof(file));
            _ = output ?? throw new ArgumentNullException(nameof(output));

            // The builder copies pages from the source document using a NoTextTokenWriter
            // and is disposed before this method returns.
            using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter()))
            {
                if (pagesBundle == null)
                {
                    // Page numbers are 1-based.
                    for (var i = 1; i <= file.NumberOfPages; i++)
                    {
                        document.AddPage(file, i);
                    }
                }
                else
                {
                    foreach (var i in pagesBundle)
                    {
                        document.AddPage(file, i);
                    }
                }
            }
        }
    }
}