mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
Added a PdfTextRemover utility that uses a NoTextTokenWriter, to output PDFs without text contents.
Also added unit tests to test: - If we can use a custom ITokenWriter with PdfDocumentBuilder - If removing text works.
This commit is contained in:
@@ -8,6 +8,7 @@
|
||||
using PdfPig.Fonts.Standard14Fonts;
|
||||
using PdfPig.Tokens;
|
||||
using PdfPig.Writer;
|
||||
using System.Collections.Generic;
|
||||
using Tests.Fonts.TrueType;
|
||||
using Xunit;
|
||||
|
||||
@@ -1114,6 +1115,27 @@
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Copies every page of a sample document through a <see cref="PdfDocumentBuilder"/> that was handed a
/// counting <see cref="TestTokenWriter"/>, then checks the counters the writer recorded.
/// </summary>
[Fact]
public void CanUseCustomTokenWriter()
{
    var docPath = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf");
    var tw = new TestTokenWriter();

    using (var doc = PdfDocument.Open(docPath))
    using (var ms = new MemoryStream())
    using (var builder = new PdfDocumentBuilder(ms, tokenWriter: tw))
    {
        for (var i = 1; i <= doc.NumberOfPages; i++)
        {
            builder.AddPage(doc, i);
        }

        builder.Build();
    }

    // xUnit's Assert.Equal is (expected, actual); keeping that order makes a failure message
    // report the expected and actual values the right way round.
    Assert.Equal(0, tw.Objects); // No objects in sample file
    Assert.True(tw.Tokens > 1000); // Roughly 1065
    Assert.True(tw.WroteCrossReferenceTable);
}
|
||||
|
||||
private static void WriteFile(string name, byte[] bytes, string extension = "pdf")
|
||||
{
|
||||
try
|
||||
@@ -1133,4 +1155,29 @@
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Test double for <see cref="ITokenWriter"/>: nothing is written to the output stream, the
/// instance only records which of its methods were called and how often.
/// </summary>
public class TestTokenWriter : ITokenWriter
{
    private int tokenCount;
    private int objectCount;
    private bool crossReferenceTableWritten;

    /// <summary>
    /// Number of times <see cref="WriteToken"/> has been called.
    /// </summary>
    public int Tokens => tokenCount;

    /// <summary>
    /// Number of times <see cref="WriteObject"/> has been called.
    /// </summary>
    public int Objects => objectCount;

    /// <summary>
    /// True once <see cref="WriteCrossReferenceTable"/> has been called at least once.
    /// </summary>
    public bool WroteCrossReferenceTable => crossReferenceTableWritten;

    /// <summary>
    /// Counts the call; the token and the stream are ignored.
    /// </summary>
    public void WriteToken(IToken token, Stream outputStream) => tokenCount += 1;

    /// <summary>
    /// Counts the call; the object data and the stream are ignored.
    /// </summary>
    public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) => objectCount += 1;

    /// <summary>
    /// Records that a cross reference table was requested; all arguments are ignored.
    /// </summary>
    public void WriteCrossReferenceTable(
        IReadOnlyDictionary<IndirectReference, long> objectOffsets,
        IndirectReference catalogToken,
        Stream outputStream,
        IndirectReference? documentInformationReference) => crossReferenceTableWritten = true;
}
|
||||
}
|
||||
|
||||
35
src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs
Normal file
35
src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs
Normal file
@@ -0,0 +1,35 @@
|
||||
using UglyToad.PdfPig.Tests.Integration;
|
||||
using UglyToad.PdfPig.Writer;
|
||||
using System.IO;
|
||||
using Xunit;
|
||||
|
||||
namespace UglyToad.PdfPig.Tests.Writer
|
||||
{
|
||||
/// <summary>
/// Integration tests for <see cref="PdfTextRemover"/>.
/// </summary>
public class PdfTextRemoverTests
{
    /// <summary>
    /// Runs <c>PdfTextRemover.RemoveText</c> on a sample document and checks that the result keeps
    /// the page count, that every page of the input has text, and that no page of the result does.
    /// </summary>
    /// <param name="file">File name of the sample document, resolved through <c>IntegrationHelpers.GetDocumentPath</c>.</param>
    [Theory]
    [InlineData("Two Page Text Only - from libre office.pdf")]
    [InlineData("cat-genetics.pdf")]
    [InlineData("Motor Insurance claim form.pdf")]
    [InlineData("Single Page Images - from libre office.pdf")]
    public void TextRemoverRemovesText(string file)
    {
        var filePath = IntegrationHelpers.GetDocumentPath(file);
        using (var document = PdfDocument.Open(filePath))
        {
            // The result is kept in memory only. Writing a copy to a fixed location such as
            // C:\temp makes the test depend on the machine it runs on.
            var withoutText = PdfTextRemover.RemoveText(filePath);

            using (var documentWithoutText = PdfDocument.Open(withoutText))
            {
                Assert.Equal(document.NumberOfPages, documentWithoutText.NumberOfPages);

                for (var i = 1; i <= documentWithoutText.NumberOfPages; i++)
                {
                    // xUnit assertions take (expected, actual).
                    Assert.NotEqual(string.Empty, document.GetPage(i).Text);
                    Assert.Equal(string.Empty, documentWithoutText.GetPage(i).Text);
                }
            }
        }
    }
}
|
||||
}
|
||||
97
src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
Normal file
97
src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
Normal file
@@ -0,0 +1,97 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.Core;
|
||||
using UglyToad.PdfPig.Filters;
|
||||
using UglyToad.PdfPig.Graphics.Operations.TextShowing;
|
||||
using UglyToad.PdfPig.Graphics.Operations;
|
||||
using UglyToad.PdfPig.Graphics;
|
||||
using UglyToad.PdfPig.Logging;
|
||||
using UglyToad.PdfPig.Parser;
|
||||
using UglyToad.PdfPig.Tokens;
|
||||
|
||||
namespace UglyToad.PdfPig.Writer
|
||||
{
|
||||
/// <summary>
/// A <see cref="TokenWriter"/> that leaves <see cref="ShowText"/> and <see cref="ShowTextsWithPositioning"/>
/// operations out of the streams it writes.
/// </summary>
internal class NoTextTokenWriter : TokenWriter
{
    /// <summary>
    /// Writes a stream, substituting a copy without <see cref="ShowText"/> or
    /// <see cref="ShowTextsWithPositioning"/> operations whenever such a copy could be produced.
    /// </summary>
    /// <param name="streamToken">The stream to write.</param>
    /// <param name="outputStream">The stream the output is written to.</param>
    protected override void WriteStream(StreamToken streamToken, Stream outputStream)
    {
        // Fall back to the untouched stream when nothing was stripped
        // (decoding failed, parsing failed, or no text operation was present).
        var toWrite = TryGetStreamWithoutText(streamToken, out var stripped) ? stripped : streamToken;

        WriteDictionary(toWrite.StreamDictionary, outputStream);
        WriteLineBreak(outputStream);

        outputStream.Write(StreamStart, 0, StreamStart.Length);
        WriteLineBreak(outputStream);

        var payload = toWrite.Data.ToArray();
        outputStream.Write(payload, 0, toWrite.Data.Count);
        WriteLineBreak(outputStream);

        outputStream.Write(StreamEnd, 0, StreamEnd.Length);
    }

    /// <summary>
    /// Tries to build a copy of a stream with the <see cref="ShowText"/> and
    /// <see cref="ShowTextsWithPositioning"/> operations left out.
    /// </summary>
    /// <param name="streamToken">The stream to decode and parse.</param>
    /// <param name="outputStreamToken">
    /// The rebuilt stream when the method returns true, otherwise null.
    /// </param>
    /// <returns>
    /// true if at least one text operation was found and removed; false if the stream could not be
    /// decoded, could not be parsed, or contained no text operation.
    /// </returns>
    private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken outputStreamToken)
    {
        outputStreamToken = null;

        var filters = new FilterProviderWithLookup(DefaultFilterProvider.Instance);
        IReadOnlyList<byte> decoded;
        try
        {
            decoded = streamToken.Decode(filters);
        }
        catch
        {
            // Best effort: a stream that cannot be decoded is reported as "nothing removed".
            return false;
        }

        var parser = new PageContentParser(new ReflectionGraphicsStateOperationFactory());
        IReadOnlyList<IGraphicsStateOperation> operations;
        try
        {
            operations = parser.Parse(1, new ByteArrayInputBytes(decoded), new NoOpLog());
        }
        catch (Exception)
        {
            // Best effort: a stream that cannot be parsed is reported as "nothing removed".
            return false;
        }

        using (var buffer = new MemoryStream())
        {
            var removedAny = false;

            foreach (var operation in operations)
            {
                var showsText = operation.Operator == ShowText.Symbol
                    || operation.Operator == ShowTextsWithPositioning.Symbol;

                if (showsText)
                {
                    removedAny = true;
                }
                else
                {
                    operation.Write(buffer);
                }
            }

            if (removedAny)
            {
                buffer.Seek(0, SeekOrigin.Begin);

                // NOTE(review): the replacement token comes solely from DataCompresser.CompressToStream;
                // confirm whether entries of the original stream dictionary need to be carried over.
                outputStreamToken = DataCompresser.CompressToStream(buffer.ToArray());
            }

            return removedAny;
        }
    }
}
|
||||
}
|
||||
100
src/UglyToad.PdfPig/Writer/PdfTextRemover.cs
Normal file
100
src/UglyToad.PdfPig/Writer/PdfTextRemover.cs
Normal file
@@ -0,0 +1,100 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
|
||||
namespace UglyToad.PdfPig.Writer
|
||||
{
|
||||
/// <summary>
/// Class to remove text from PDFs, useful as a preprocessing step for Optical Character Recognition (OCR)
/// </summary>
public static class PdfTextRemover
{
    /// <summary>
    /// Return PDF without text as bytes
    /// </summary>
    /// <param name="filePath">Path to PDF</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <returns>PDF without text (as a byte array)</returns>
    public static byte[] RemoveText(string filePath, IReadOnlyList<int> pagesBundle = null)
    {
        using (var output = new MemoryStream())
        {
            RemoveText(output, filePath, pagesBundle);
            return output.ToArray();
        }
    }

    /// <summary>
    /// Write PDF without text to the output stream. The caller must manage disposing the output stream.
    /// </summary>
    /// <param name="output">Must be writable</param>
    /// <param name="filePath">Path to PDF</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    public static void RemoveText(Stream output, string filePath, IReadOnlyList<int> pagesBundle = null)
    {
        using (var stream = File.OpenRead(filePath))
        {
            RemoveText(stream, output, pagesBundle);
        }
    }

    /// <summary>
    /// Remove text from the PDF (passed in as a byte array) and return it as a new byte array
    /// </summary>
    /// <param name="file">PDF document (as byte array)</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <returns>PDF without text (as a byte array)</returns>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
    public static byte[] RemoveText(byte[] file, IReadOnlyList<int> pagesBundle = null)
    {
        _ = file ?? throw new ArgumentNullException(nameof(file));

        // The document opened here is owned by this method, so it is disposed here as well.
        using (var document = PdfDocument.Open(file))
        using (var output = new MemoryStream())
        {
            RemoveText(document, output, pagesBundle);
            return output.ToArray();
        }
    }

    /// <summary>
    /// Remove text from the PDF in the input stream and write it to the output stream.
    /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
    /// </summary>
    /// <param name="stream">Streams for the file contents, this must support reading and seeking.</param>
    /// <param name="output">Must be writable</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> or <paramref name="output"/> is null.</exception>
    public static void RemoveText(Stream stream, Stream output, IReadOnlyList<int> pagesBundle = null)
    {
        _ = stream ?? throw new ArgumentNullException(nameof(stream));
        _ = output ?? throw new ArgumentNullException(nameof(output));

        // The document opened here is owned by this method, so it is disposed here;
        // the input stream itself stays the caller's responsibility.
        using (var document = PdfDocument.Open(stream))
        {
            RemoveText(document, output, pagesBundle);
        }
    }

    /// <summary>
    /// Remove text from the PDF and write it to the output stream.
    /// The caller must manage disposing the document and the output stream.
    /// </summary>
    /// <param name="file">PDF document</param>
    /// <param name="output">Must be writable</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> or <paramref name="output"/> is null.</exception>
    public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList<int> pagesBundle = null)
    {
        _ = file ?? throw new ArgumentNullException(nameof(file));
        _ = output ?? throw new ArgumentNullException(nameof(output));

        // NOTE(review): there is no explicit Build() call; presumably disposing the builder
        // completes the document on the output stream - confirm against PdfDocumentBuilder.
        // NOTE(review): the second argument (false) presumably tells the builder not to dispose
        // the output stream, matching the "caller must manage disposing" contract - confirm.
        using (var builder = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter()))
        {
            if (pagesBundle == null)
            {
                // Page numbers are 1-based here: pages 1..NumberOfPages are copied.
                for (var i = 1; i <= file.NumberOfPages; i++)
                {
                    builder.AddPage(file, i);
                }
            }
            else
            {
                // Requested page numbers are passed to AddPage exactly as given, in the given order.
                foreach (var i in pagesBundle)
                {
                    builder.AddPage(file, i);
                }
            }
        }
    }
}
|
||||
}
|
||||
Reference in New Issue
Block a user