Added a PdfTextRemover utility that uses a NoTextTokenWriter to output PDFs without their text content.

Also added unit tests that check:
- that a custom ITokenWriter can be used with PdfDocumentBuilder
- that removing text works.
This commit is contained in:
mvantzet
2022-12-20 21:31:15 +01:00
parent 9273a43965
commit 6ef6c4d780
4 changed files with 361 additions and 82 deletions

View File

@@ -1,73 +1,74 @@
namespace UglyToad.PdfPig.Tests.Writer
{
using System.IO;
using System.Linq;
using Content;
using Integration;
using PdfPig.Core;
using PdfPig.Fonts.Standard14Fonts;
using PdfPig.Tokens;
using PdfPig.Writer;
using System.Linq;
using Content;
using Integration;
using PdfPig.Core;
using PdfPig.Fonts.Standard14Fonts;
using PdfPig.Tokens;
using PdfPig.Writer;
using System.Collections.Generic;
using Tests.Fonts.TrueType;
using Xunit;
public class PdfDocumentBuilderTests
{
/// <summary>
/// Builds a single blank page and checks that the produced bytes start and end like a PDF file.
/// </summary>
[Fact]
public void CanWriteSingleBlankPage()
{
    var result = CreateSingleBlankPage();

    // Label the output with this test's own name; it previously reused
    // CanWriteSinglePageHelloWorld, which belongs to a different test.
    WriteFile(nameof(CanWriteSingleBlankPage), result);

    Assert.NotEmpty(result);

    var str = OtherEncodings.BytesAsLatin1String(result);
    Assert.StartsWith("%PDF", str);
    Assert.EndsWith("%%EOF", str);
}
[Fact]
public void CanCreateSingleCustomPageSize()
{
var builder = new PdfDocumentBuilder();
var page = builder.AddPage(120, 250);
var font = builder.AddStandard14Font(Standard14Font.Helvetica);
page.AddText("Small page.", 12, new PdfPoint(25, 200), font);
var bytes = builder.Build();
WriteFile(nameof(CanCreateSingleCustomPageSize), bytes);
using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff))
{
Assert.Equal(1, document.NumberOfPages);
var page1 = document.GetPage(1);
Assert.Equal(120, page1.Width);
Assert.Equal(250, page1.Height);
Assert.Equal("Small page.", page1.Text);
}
public class PdfDocumentBuilderTests
{
/// <summary>
/// Builds a single blank page and checks that the produced bytes start and end like a PDF file.
/// </summary>
[Fact]
public void CanWriteSingleBlankPage()
{
    var result = CreateSingleBlankPage();

    // Label the output with this test's own name; it previously reused
    // CanWriteSinglePageHelloWorld, which belongs to a different test.
    WriteFile(nameof(CanWriteSingleBlankPage), result);

    Assert.NotEmpty(result);

    var str = OtherEncodings.BytesAsLatin1String(result);
    Assert.StartsWith("%PDF", str);
    Assert.EndsWith("%%EOF", str);
}
[Fact]
public void CanFastAddPageAndInheritProps()
{
var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf");
var contents = File.ReadAllBytes(first);
[Fact]
public void CanCreateSingleCustomPageSize()
{
var builder = new PdfDocumentBuilder();
byte[] results = null;
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
using (var output = new PdfDocumentBuilder())
var page = builder.AddPage(120, 250);
var font = builder.AddStandard14Font(Standard14Font.Helvetica);
page.AddText("Small page.", 12, new PdfPoint(25, 200), font);
var bytes = builder.Build();
WriteFile(nameof(CanCreateSingleCustomPageSize), bytes);
using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff))
{
Assert.Equal(1, document.NumberOfPages);
var page1 = document.GetPage(1);
Assert.Equal(120, page1.Width);
Assert.Equal(250, page1.Height);
Assert.Equal("Small page.", page1.Text);
}
}
[Fact]
public void CanFastAddPageAndInheritProps()
{
var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf");
var contents = File.ReadAllBytes(first);
byte[] results = null;
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
using (var output = new PdfDocumentBuilder())
{
output.AddPage(existing, 1);
results = output.Build();
results = output.Build();
}
using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
@@ -75,22 +76,22 @@
var pg = rewritted.GetPage(1);
Assert.Equal(200, pg.MediaBox.Bounds.Width);
Assert.Equal(100, pg.MediaBox.Bounds.Height);
}
}
}
[Fact]
public void CanFastAddPageWithStreamSubtype()
{
var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf");
var contents = File.ReadAllBytes(first);
[Fact]
public void CanFastAddPageWithStreamSubtype()
{
var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf");
var contents = File.ReadAllBytes(first);
byte[] results = null;
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
using (var output = new PdfDocumentBuilder())
byte[] results = null;
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
using (var output = new PdfDocumentBuilder())
{
output.AddPage(existing, 1);
results = output.Build();
results = output.Build();
}
using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
@@ -98,26 +99,26 @@
// really just checking for no exception...
var pg = rewritted.GetPage(1);
Assert.NotNull(pg.Content);
}
}
}
[Fact]
public void CanFastAddPageAndStripLinkAnnots()
{
var first = IntegrationHelpers.GetDocumentPath("outline.pdf");
var contents = File.ReadAllBytes(first);
[Fact]
public void CanFastAddPageAndStripLinkAnnots()
{
var first = IntegrationHelpers.GetDocumentPath("outline.pdf");
var contents = File.ReadAllBytes(first);
var annotCount = 0;
byte[] results = null;
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
using (var output = new PdfDocumentBuilder())
byte[] results = null;
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
using (var output = new PdfDocumentBuilder())
{
output.AddPage(existing, 1);
results = output.Build();
var pg = existing.GetPage(1);
var annots = pg.ExperimentalAccess.GetAnnotations().ToList();
annotCount = annots.Count;
Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link);
Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link);
}
using (var rewritten = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
@@ -126,7 +127,7 @@
var annots = pg.ExperimentalAccess.GetAnnotations().ToList();
Assert.Equal(annotCount - 1, annots.Count);
Assert.DoesNotContain(annots, x => x.Type == Annotations.AnnotationType.Link);
}
}
}
[Fact]
@@ -1114,6 +1115,27 @@
}
}
/// <summary>
/// Copies every page of a sample document through a <see cref="PdfDocumentBuilder"/> that was given a
/// counting <see cref="ITokenWriter"/>, then checks that the builder called that writer.
/// </summary>
[Fact]
public void CanUseCustomTokenWriter()
{
    var docPath = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf");
    var tw = new TestTokenWriter();

    using (var doc = PdfDocument.Open(docPath))
    using (var ms = new MemoryStream())
    using (var builder = new PdfDocumentBuilder(ms, tokenWriter: tw))
    {
        for (var i = 1; i <= doc.NumberOfPages; i++)
        {
            builder.AddPage(doc, i);
        }

        builder.Build();
    }

    // xUnit's Assert.Equal takes (expected, actual); with the arguments reversed a failure
    // would report the two values the wrong way round.
    Assert.Equal(0, tw.Objects); // No objects in sample file
    Assert.True(tw.Tokens > 1000); // Roughly 1065
    Assert.True(tw.WroteCrossReferenceTable);
}
private static void WriteFile(string name, byte[] bytes, string extension = "pdf")
{
try
@@ -1133,4 +1155,29 @@
}
}
}
/// <summary>
/// <see cref="ITokenWriter"/> test double: it never touches the output stream and only records
/// how often, and in which way, it was called.
/// </summary>
public class TestTokenWriter : ITokenWriter
{
    /// <summary>Number of <see cref="WriteToken"/> calls received so far.</summary>
    public int Tokens { get; private set; }

    /// <summary>Number of <see cref="WriteObject"/> calls received so far.</summary>
    public int Objects { get; private set; }

    /// <summary>Set once <see cref="WriteCrossReferenceTable"/> has been called at least once.</summary>
    public bool WroteCrossReferenceTable { get; private set; }

    /// <summary>Counts the call; nothing is written to <paramref name="outputStream"/>.</summary>
    public void WriteToken(IToken token, Stream outputStream) => Tokens += 1;

    /// <summary>Counts the call; nothing is written to <paramref name="outputStream"/>.</summary>
    public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) => Objects += 1;

    /// <summary>Records that a cross reference table was requested; nothing is written.</summary>
    public void WriteCrossReferenceTable(
        IReadOnlyDictionary<IndirectReference, long> objectOffsets,
        IndirectReference catalogToken,
        Stream outputStream,
        IndirectReference? documentInformationReference) => WroteCrossReferenceTable = true;
}
}

View File

@@ -0,0 +1,35 @@
using UglyToad.PdfPig.Tests.Integration;
using UglyToad.PdfPig.Writer;
using System.IO;
using Xunit;
namespace UglyToad.PdfPig.Tests.Writer
{
/// <summary>
/// Tests for <see cref="PdfTextRemover"/>.
/// </summary>
public class PdfTextRemoverTests
{
    /// <summary>
    /// Removes the text from a sample document and checks that the page count is unchanged,
    /// that every source page had text, and that every output page has none.
    /// </summary>
    /// <param name="file">File name of the sample document, resolved via IntegrationHelpers.GetDocumentPath.</param>
    [Theory]
    [InlineData("Two Page Text Only - from libre office.pdf")]
    [InlineData("cat-genetics.pdf")]
    [InlineData("Motor Insurance claim form.pdf")]
    [InlineData("Single Page Images - from libre office.pdf")]
    public void TextRemoverRemovesText(string file)
    {
        var filePath = IntegrationHelpers.GetDocumentPath(file);

        using (var document = PdfDocument.Open(filePath))
        {
            // The result is kept in memory only. A leftover debug write to a fixed local path
            // made the test fail wherever that directory does not exist.
            var withoutText = PdfTextRemover.RemoveText(filePath);

            using (var documentWithoutText = PdfDocument.Open(withoutText))
            {
                Assert.Equal(document.NumberOfPages, documentWithoutText.NumberOfPages);

                for (var i = 1; i <= documentWithoutText.NumberOfPages; i++)
                {
                    // xUnit assertions take the expected value first.
                    Assert.NotEqual(string.Empty, document.GetPage(i).Text);
                    Assert.Equal(string.Empty, documentWithoutText.GetPage(i).Text);
                }
            }
        }
    }
}
}

View File

@@ -0,0 +1,97 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using UglyToad.PdfPig.Core;
using UglyToad.PdfPig.Filters;
using UglyToad.PdfPig.Graphics.Operations.TextShowing;
using UglyToad.PdfPig.Graphics.Operations;
using UglyToad.PdfPig.Graphics;
using UglyToad.PdfPig.Logging;
using UglyToad.PdfPig.Parser;
using UglyToad.PdfPig.Tokens;
namespace UglyToad.PdfPig.Writer
{
/// <summary>
/// Derived class of <see cref="TokenWriter"/> that does not write text-showing operations in streams:
/// <see cref="ShowText"/>, <see cref="ShowTextsWithPositioning"/> and the <c>'</c> and <c>"</c> operators.
/// </summary>
internal class NoTextTokenWriter : TokenWriter
{
    /// <summary>
    /// Operator symbol of "move to next line and show text". It shows text just like
    /// <see cref="ShowText"/>, so it has to be dropped as well.
    /// </summary>
    private const string MoveToNextLineShowTextSymbol = "'";

    /// <summary>
    /// Operator symbol of "set spacing, move to next line and show text".
    /// </summary>
    private const string MoveToNextLineShowTextWithSpacingSymbol = "\"";

    /// <summary>
    /// Write stream without text-showing operations.
    /// </summary>
    /// <param name="streamToken">Stream to write; written unchanged when no text operation could be removed from it.</param>
    /// <param name="outputStream">Stream that receives the dictionary, the stream keywords and the data.</param>
    protected override void WriteStream(StreamToken streamToken, Stream outputStream)
    {
        if (!TryGetStreamWithoutText(streamToken, out var outputStreamToken))
        {
            // Nothing was filtered (no text found, or decoding/parsing failed): keep the original stream.
            outputStreamToken = streamToken;
        }

        WriteDictionary(outputStreamToken.StreamDictionary, outputStream);
        WriteLineBreak(outputStream);
        outputStream.Write(StreamStart, 0, StreamStart.Length);
        WriteLineBreak(outputStream);

        // Materialise the data once so the written length always matches the copied array.
        var data = outputStreamToken.Data.ToArray();
        outputStream.Write(data, 0, data.Length);

        WriteLineBreak(outputStream);
        outputStream.Write(StreamEnd, 0, StreamEnd.Length);
    }

    /// <summary>
    /// Try get a stream without text-showing operations.
    /// </summary>
    /// <param name="streamToken">Stream to decode and parse as content operations.</param>
    /// <param name="outputStreamToken">Re-compressed stream without the text operations, or null when this method returns false.</param>
    /// <returns>true if any text operation found (and we have a valid <paramref name="outputStreamToken"/> without the text operations),
    /// false if no text operation found or the stream could not be decoded or parsed
    /// (in which case <paramref name="outputStreamToken"/> is null)</returns>
    private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken outputStreamToken)
    {
        outputStreamToken = null;

        var filterProvider = new FilterProviderWithLookup(DefaultFilterProvider.Instance);
        IReadOnlyList<byte> bytes;
        try
        {
            bytes = streamToken.Decode(filterProvider);
        }
        catch (Exception)
        {
            // Deliberate best effort: a stream that cannot be decoded is written unchanged by the caller.
            return false;
        }

        var pageContentParser = new PageContentParser(new ReflectionGraphicsStateOperationFactory());
        IReadOnlyList<IGraphicsStateOperation> operations;
        try
        {
            operations = pageContentParser.Parse(1, new ByteArrayInputBytes(bytes), new NoOpLog());
        }
        catch (Exception)
        {
            // Deliberate best effort: a stream that does not parse as content operations is written unchanged.
            return false;
        }

        using (var outputStreamT = new MemoryStream())
        {
            var haveText = false;
            foreach (var op in operations)
            {
                if (IsTextShowingOperator(op.Operator))
                {
                    haveText = true;
                    continue;
                }

                op.Write(outputStreamT);
            }

            if (!haveText)
            {
                return false;
            }

            // MemoryStream.ToArray copies the whole buffer regardless of the current position,
            // so no rewind is needed first.
            outputStreamToken = DataCompresser.CompressToStream(outputStreamT.ToArray());
            return true;
        }
    }

    /// <summary>
    /// Whether the operator symbol is one of the four PDF text-showing operators (Tj, TJ, ' and ").
    /// </summary>
    private static bool IsTextShowingOperator(string operatorSymbol)
    {
        return operatorSymbol == ShowText.Symbol
            || operatorSymbol == ShowTextsWithPositioning.Symbol
            || operatorSymbol == MoveToNextLineShowTextSymbol
            || operatorSymbol == MoveToNextLineShowTextWithSpacingSymbol;
    }
}
}

View File

@@ -0,0 +1,100 @@
using System;
using System.Collections.Generic;
using System.IO;
namespace UglyToad.PdfPig.Writer
{
/// <summary>
/// Class to remove text from PDFs, useful as a preprocessing step for Optical Character Recognition (OCR)
/// </summary>
public static class PdfTextRemover
{
    /// <summary>
    /// Return PDF without text as bytes
    /// </summary>
    /// <param name="filePath">Path to PDF</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <returns>PDF without text (as a byte array)</returns>
    public static byte[] RemoveText(string filePath, IReadOnlyList<int> pagesBundle = null)
    {
        using (var output = new MemoryStream())
        {
            RemoveText(output, filePath, pagesBundle);
            return output.ToArray();
        }
    }

    /// <summary>
    /// Write PDF without text to the output stream. The caller must manage disposing the output stream.
    /// </summary>
    /// <param name="output">Must be writable</param>
    /// <param name="filePath">Path to PDF</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    public static void RemoveText(Stream output, string filePath, IReadOnlyList<int> pagesBundle = null)
    {
        using (var stream = File.OpenRead(filePath))
        {
            RemoveText(stream, output, pagesBundle);
        }
    }

    /// <summary>
    /// Remove text from the PDF (passed in as a byte array) and return it as a new byte array
    /// </summary>
    /// <param name="file">PDF document (as byte array)</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <returns>PDF without text (as a byte array)</returns>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
    public static byte[] RemoveText(byte[] file, IReadOnlyList<int> pagesBundle = null)
    {
        _ = file ?? throw new ArgumentNullException(nameof(file));

        using (var output = new MemoryStream())
        using (var document = PdfDocument.Open(file))
        {
            // The document is opened here, so it is disposed here as well.
            RemoveText(document, output, pagesBundle);
            return output.ToArray();
        }
    }

    /// <summary>
    /// Remove text from the PDF in the input stream and write it to the output stream.
    /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
    /// </summary>
    /// <param name="stream">Streams for the file contents, this must support reading and seeking.</param>
    /// <param name="output">Must be writable</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> or <paramref name="output"/> is null.</exception>
    public static void RemoveText(Stream stream, Stream output, IReadOnlyList<int> pagesBundle = null)
    {
        _ = stream ?? throw new ArgumentNullException(nameof(stream));
        _ = output ?? throw new ArgumentNullException(nameof(output));

        // Dispose the document opened here; per the contract above this leaves the caller's stream open.
        using (var document = PdfDocument.Open(stream))
        {
            RemoveText(document, output, pagesBundle);
        }
    }

    /// <summary>
    /// Remove text from the PDF and write it to the output stream.
    /// The caller must manage disposing the output stream and the document.
    /// </summary>
    /// <param name="file">PDF document</param>
    /// <param name="output">Must be writable</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> or <paramref name="output"/> is null.</exception>
    public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList<int> pagesBundle = null)
    {
        _ = file ?? throw new ArgumentNullException(nameof(file));
        _ = output ?? throw new ArgumentNullException(nameof(output));

        // NOTE(review): Build() is never called; this appears to rely on disposing the builder to
        // finish writing the document to the output stream. Confirm against PdfDocumentBuilder.
        using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter()))
        {
            if (pagesBundle == null)
            {
                for (var i = 1; i <= file.NumberOfPages; i++)
                {
                    document.AddPage(file, i);
                }
            }
            else
            {
                foreach (var i in pagesBundle)
                {
                    document.AddPage(file, i);
                }
            }
        }
    }
}
}