mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-01-18 19:51:24 +08:00
Added a PdfTextRemover utility that uses a NoTextTokenWriter, to output PDFs without text contents.
Also added unit tests that check (1) that a custom ITokenWriter can be used with PdfDocumentBuilder, and (2) that text removal works.
This commit is contained in:
@@ -1,73 +1,74 @@
|
||||
namespace UglyToad.PdfPig.Tests.Writer
|
||||
{
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Integration;
|
||||
using PdfPig.Core;
|
||||
using PdfPig.Fonts.Standard14Fonts;
|
||||
using PdfPig.Tokens;
|
||||
using PdfPig.Writer;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Integration;
|
||||
using PdfPig.Core;
|
||||
using PdfPig.Fonts.Standard14Fonts;
|
||||
using PdfPig.Tokens;
|
||||
using PdfPig.Writer;
|
||||
using System.Collections.Generic;
|
||||
using Tests.Fonts.TrueType;
|
||||
using Xunit;
|
||||
|
||||
public class PdfDocumentBuilderTests
|
||||
{
|
||||
[Fact]
|
||||
public void CanWriteSingleBlankPage()
|
||||
{
|
||||
var result = CreateSingleBlankPage();
|
||||
|
||||
WriteFile(nameof(CanWriteSinglePageHelloWorld), result);
|
||||
|
||||
Assert.NotEmpty(result);
|
||||
|
||||
var str = OtherEncodings.BytesAsLatin1String(result);
|
||||
Assert.StartsWith("%PDF", str);
|
||||
Assert.EndsWith("%%EOF", str);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CanCreateSingleCustomPageSize()
|
||||
{
|
||||
var builder = new PdfDocumentBuilder();
|
||||
|
||||
var page = builder.AddPage(120, 250);
|
||||
|
||||
var font = builder.AddStandard14Font(Standard14Font.Helvetica);
|
||||
|
||||
page.AddText("Small page.", 12, new PdfPoint(25, 200), font);
|
||||
|
||||
var bytes = builder.Build();
|
||||
|
||||
WriteFile(nameof(CanCreateSingleCustomPageSize), bytes);
|
||||
|
||||
using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff))
|
||||
{
|
||||
Assert.Equal(1, document.NumberOfPages);
|
||||
|
||||
var page1 = document.GetPage(1);
|
||||
|
||||
Assert.Equal(120, page1.Width);
|
||||
Assert.Equal(250, page1.Height);
|
||||
|
||||
Assert.Equal("Small page.", page1.Text);
|
||||
}
|
||||
public class PdfDocumentBuilderTests
|
||||
{
|
||||
[Fact]
|
||||
public void CanWriteSingleBlankPage()
|
||||
{
|
||||
var result = CreateSingleBlankPage();
|
||||
|
||||
WriteFile(nameof(CanWriteSinglePageHelloWorld), result);
|
||||
|
||||
Assert.NotEmpty(result);
|
||||
|
||||
var str = OtherEncodings.BytesAsLatin1String(result);
|
||||
Assert.StartsWith("%PDF", str);
|
||||
Assert.EndsWith("%%EOF", str);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageAndInheritProps()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
[Fact]
|
||||
public void CanCreateSingleCustomPageSize()
|
||||
{
|
||||
var builder = new PdfDocumentBuilder();
|
||||
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
var page = builder.AddPage(120, 250);
|
||||
|
||||
var font = builder.AddStandard14Font(Standard14Font.Helvetica);
|
||||
|
||||
page.AddText("Small page.", 12, new PdfPoint(25, 200), font);
|
||||
|
||||
var bytes = builder.Build();
|
||||
|
||||
WriteFile(nameof(CanCreateSingleCustomPageSize), bytes);
|
||||
|
||||
using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff))
|
||||
{
|
||||
Assert.Equal(1, document.NumberOfPages);
|
||||
|
||||
var page1 = document.GetPage(1);
|
||||
|
||||
Assert.Equal(120, page1.Width);
|
||||
Assert.Equal(250, page1.Height);
|
||||
|
||||
Assert.Equal("Small page.", page1.Text);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageAndInheritProps()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
{
|
||||
output.AddPage(existing, 1);
|
||||
results = output.Build();
|
||||
results = output.Build();
|
||||
}
|
||||
|
||||
using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
|
||||
@@ -75,22 +76,22 @@
|
||||
var pg = rewritted.GetPage(1);
|
||||
Assert.Equal(200, pg.MediaBox.Bounds.Width);
|
||||
Assert.Equal(100, pg.MediaBox.Bounds.Height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageWithStreamSubtype()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageWithStreamSubtype()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
{
|
||||
output.AddPage(existing, 1);
|
||||
results = output.Build();
|
||||
results = output.Build();
|
||||
}
|
||||
|
||||
using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
|
||||
@@ -98,26 +99,26 @@
|
||||
// really just checking for no exception...
|
||||
var pg = rewritted.GetPage(1);
|
||||
Assert.NotNull(pg.Content);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageAndStripLinkAnnots()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("outline.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageAndStripLinkAnnots()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("outline.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
var annotCount = 0;
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
{
|
||||
output.AddPage(existing, 1);
|
||||
results = output.Build();
|
||||
var pg = existing.GetPage(1);
|
||||
var annots = pg.ExperimentalAccess.GetAnnotations().ToList();
|
||||
annotCount = annots.Count;
|
||||
Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link);
|
||||
Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link);
|
||||
}
|
||||
|
||||
using (var rewritten = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
|
||||
@@ -126,7 +127,7 @@
|
||||
var annots = pg.ExperimentalAccess.GetAnnotations().ToList();
|
||||
Assert.Equal(annotCount - 1, annots.Count);
|
||||
Assert.DoesNotContain(annots, x => x.Type == Annotations.AnnotationType.Link);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -1114,6 +1115,27 @@
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Checks that a caller-supplied <see cref="ITokenWriter"/> is the one the
/// <see cref="PdfDocumentBuilder"/> uses when copying pages and building the document.
/// </summary>
[Fact]
public void CanUseCustomTokenWriter()
{
    var docPath = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf");
    var tw = new TestTokenWriter();

    using (var doc = PdfDocument.Open(docPath))
    using (var ms = new MemoryStream())
    using (var builder = new PdfDocumentBuilder(ms, tokenWriter: tw))
    {
        for (var i = 1; i <= doc.NumberOfPages; i++)
        {
            builder.AddPage(doc, i);
        }

        builder.Build();
    }

    // xUnit's Assert.Equal takes (expected, actual); keeping that order makes failure messages read correctly.
    Assert.Equal(0, tw.Objects); // No objects in sample file
    Assert.True(tw.Tokens > 1000); // Roughly 1065
    Assert.True(tw.WroteCrossReferenceTable);
}
|
||||
|
||||
private static void WriteFile(string name, byte[] bytes, string extension = "pdf")
|
||||
{
|
||||
try
|
||||
@@ -1133,4 +1155,29 @@
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Test double for <see cref="ITokenWriter"/> that writes nothing to the output stream
/// and only records which writer methods were invoked.
/// </summary>
public class TestTokenWriter : ITokenWriter
{
    /// <summary>Number of calls made to <see cref="WriteToken"/>.</summary>
    public int Tokens { get; private set; }

    /// <summary>Number of calls made to <see cref="WriteObject"/>.</summary>
    public int Objects { get; private set; }

    /// <summary>True once <see cref="WriteCrossReferenceTable"/> has been called at least once.</summary>
    public bool WroteCrossReferenceTable { get; private set; }

    /// <summary>Counts the call; the token and stream are ignored.</summary>
    public void WriteToken(IToken token, Stream outputStream) => Tokens++;

    /// <summary>Counts the call; the object data and stream are ignored.</summary>
    public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) => Objects++;

    /// <summary>Records that the cross reference table was requested; all arguments are ignored.</summary>
    public void WriteCrossReferenceTable(
        IReadOnlyDictionary<IndirectReference, long> objectOffsets,
        IndirectReference catalogToken,
        Stream outputStream,
        IndirectReference? documentInformationReference) => WroteCrossReferenceTable = true;
}
|
||||
}
|
||||
|
||||
35
src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs
Normal file
35
src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs
Normal file
@@ -0,0 +1,35 @@
|
||||
using UglyToad.PdfPig.Tests.Integration;
|
||||
using UglyToad.PdfPig.Writer;
|
||||
using System.IO;
|
||||
using Xunit;
|
||||
|
||||
namespace UglyToad.PdfPig.Tests.Writer
|
||||
{
|
||||
/// <summary>
/// Tests for <see cref="PdfTextRemover"/>.
/// </summary>
public class PdfTextRemoverTests
{
    /// <summary>
    /// For each sample document, checks that the output of <see cref="PdfTextRemover.RemoveText(string, System.Collections.Generic.IReadOnlyList{int})"/>
    /// has the same number of pages as the input, that every input page has text, and that every output page has none.
    /// </summary>
    /// <param name="file">File name of the sample document, resolved through <see cref="IntegrationHelpers.GetDocumentPath"/>.</param>
    [Theory]
    [InlineData("Two Page Text Only - from libre office.pdf")]
    [InlineData("cat-genetics.pdf")]
    [InlineData("Motor Insurance claim form.pdf")]
    [InlineData("Single Page Images - from libre office.pdf")]
    public void TextRemoverRemovesText(string file)
    {
        var filePath = IntegrationHelpers.GetDocumentPath(file);

        using (var document = PdfDocument.Open(filePath))
        {
            // The result is kept in memory only: the test must not depend on a
            // machine-specific directory existing (or on the host OS path format).
            var withoutText = PdfTextRemover.RemoveText(filePath);

            using (var documentWithoutText = PdfDocument.Open(withoutText))
            {
                Assert.Equal(document.NumberOfPages, documentWithoutText.NumberOfPages);

                for (var i = 1; i <= documentWithoutText.NumberOfPages; i++)
                {
                    // xUnit argument order is (expected, actual).
                    Assert.NotEqual(string.Empty, document.GetPage(i).Text);
                    Assert.Equal(string.Empty, documentWithoutText.GetPage(i).Text);
                }
            }
        }
    }
}
|
||||
}
|
||||
97
src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
Normal file
97
src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
Normal file
@@ -0,0 +1,97 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.Core;
|
||||
using UglyToad.PdfPig.Filters;
|
||||
using UglyToad.PdfPig.Graphics.Operations.TextShowing;
|
||||
using UglyToad.PdfPig.Graphics.Operations;
|
||||
using UglyToad.PdfPig.Graphics;
|
||||
using UglyToad.PdfPig.Logging;
|
||||
using UglyToad.PdfPig.Parser;
|
||||
using UglyToad.PdfPig.Tokens;
|
||||
|
||||
namespace UglyToad.PdfPig.Writer
|
||||
{
|
||||
/// <summary>
/// A <see cref="TokenWriter"/> that leaves out <see cref="ShowText"/> and <see cref="ShowTextsWithPositioning"/>
/// operations when writing streams.
/// </summary>
internal class NoTextTokenWriter : TokenWriter
{
    /// <summary>
    /// Writes the stream with any <see cref="ShowText"/> or <see cref="ShowTextsWithPositioning"/> operations removed.
    /// If no rewritten stream can be produced (see <see cref="TryGetStreamWithoutText"/>), the original stream is written as-is.
    /// </summary>
    /// <param name="streamToken">The stream to write.</param>
    /// <param name="outputStream">The stream the serialized token is written to.</param>
    protected override void WriteStream(StreamToken streamToken, Stream outputStream)
    {
        // NOTE(review): when a rewritten token is used, the dictionary written below is the one on the token
        // returned by DataCompresser.CompressToStream, not the original stream's dictionary - confirm no
        // required entries of the original are lost.
        var tokenToWrite = TryGetStreamWithoutText(streamToken, out var stripped)
            ? stripped
            : streamToken;

        WriteDictionary(tokenToWrite.StreamDictionary, outputStream);
        WriteLineBreak(outputStream);

        outputStream.Write(StreamStart, 0, StreamStart.Length);
        WriteLineBreak(outputStream);

        var payload = tokenToWrite.Data.ToArray();
        outputStream.Write(payload, 0, payload.Length);
        WriteLineBreak(outputStream);

        outputStream.Write(StreamEnd, 0, StreamEnd.Length);
    }

    /// <summary>
    /// Attempts to build a copy of the stream that contains no <see cref="ShowText"/> or
    /// <see cref="ShowTextsWithPositioning"/> operations.
    /// </summary>
    /// <param name="streamToken">The stream to decode and parse as content operations.</param>
    /// <param name="outputStreamToken">The rewritten stream when the method returns true; otherwise null.</param>
    /// <returns>
    /// true if at least one text showing operation was found and removed;
    /// false if none was found, or if the stream could not be decoded or parsed.
    /// </returns>
    private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken outputStreamToken)
    {
        outputStreamToken = null;

        var filters = new FilterProviderWithLookup(DefaultFilterProvider.Instance);
        IReadOnlyList<byte> decoded;
        try
        {
            decoded = streamToken.Decode(filters);
        }
        catch
        {
            // Best effort: a stream that cannot be decoded is reported as "nothing to remove".
            return false;
        }

        var parser = new PageContentParser(new ReflectionGraphicsStateOperationFactory());
        IReadOnlyList<IGraphicsStateOperation> parsedOperations;
        try
        {
            // NOTE(review): the literal 1 is presumably a page number placeholder - confirm against PageContentParser.Parse.
            parsedOperations = parser.Parse(1, new ByteArrayInputBytes(decoded), new NoOpLog());
        }
        catch (Exception)
        {
            // Best effort: a stream that does not parse as content operations is reported as "nothing to remove".
            return false;
        }

        using (var buffer = new MemoryStream())
        {
            var removedAny = false;

            foreach (var operation in parsedOperations)
            {
                var isTextShowing = operation.Operator == ShowText.Symbol
                                    || operation.Operator == ShowTextsWithPositioning.Symbol;

                if (isTextShowing)
                {
                    removedAny = true;
                }
                else
                {
                    operation.Write(buffer);
                }
            }

            if (removedAny)
            {
                buffer.Seek(0, SeekOrigin.Begin);
                outputStreamToken = DataCompresser.CompressToStream(buffer.ToArray());
            }

            return removedAny;
        }
    }
}
|
||||
}
|
||||
100
src/UglyToad.PdfPig/Writer/PdfTextRemover.cs
Normal file
100
src/UglyToad.PdfPig/Writer/PdfTextRemover.cs
Normal file
@@ -0,0 +1,100 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
|
||||
namespace UglyToad.PdfPig.Writer
|
||||
{
|
||||
/// <summary>
/// Class to remove text from PDFs, useful as a preprocessing step for Optical Character Recognition (OCR)
/// </summary>
public static class PdfTextRemover
{
    /// <summary>
    /// Return PDF without text as bytes
    /// </summary>
    /// <param name="filePath">Path to PDF</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <returns>PDF without text (as a byte array)</returns>
    public static byte[] RemoveText(string filePath, IReadOnlyList<int> pagesBundle = null)
    {
        using (var output = new MemoryStream())
        {
            RemoveText(output, filePath, pagesBundle);
            return output.ToArray();
        }
    }

    /// <summary>
    /// Write PDF without text to the output stream. The caller must manage disposing the output stream.
    /// </summary>
    /// <param name="output">Must be writable</param>
    /// <param name="filePath">Path to PDF</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    public static void RemoveText(Stream output, string filePath, IReadOnlyList<int> pagesBundle = null)
    {
        using (var stream = File.OpenRead(filePath))
        {
            RemoveText(stream, output, pagesBundle);
        }
    }

    /// <summary>
    /// Remove text from the PDF (passed in as a byte array) and return it as a new byte array
    /// </summary>
    /// <param name="file">PDF document (as byte array)</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <returns>PDF without text (as a byte array)</returns>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
    public static byte[] RemoveText(byte[] file, IReadOnlyList<int> pagesBundle = null)
    {
        _ = file ?? throw new ArgumentNullException(nameof(file));

        // The document opened here is owned by this method, so it is disposed here.
        using (var document = PdfDocument.Open(file))
        using (var output = new MemoryStream())
        {
            RemoveText(document, output, pagesBundle);
            return output.ToArray();
        }
    }

    /// <summary>
    /// Remove text from the PDF in the input stream and write it to the output stream.
    /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
    /// </summary>
    /// <param name="stream">Streams for the file contents, this must support reading and seeking.</param>
    /// <param name="output">Must be writable</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> or <paramref name="output"/> is null.</exception>
    public static void RemoveText(Stream stream, Stream output, IReadOnlyList<int> pagesBundle = null)
    {
        _ = stream ?? throw new ArgumentNullException(nameof(stream));
        _ = output ?? throw new ArgumentNullException(nameof(output));

        // The document opened here is owned by this method, so it is disposed here;
        // the input stream itself stays under the caller's control.
        using (var document = PdfDocument.Open(stream))
        {
            RemoveText(document, output, pagesBundle);
        }
    }

    /// <summary>
    /// Remove text from the PDF and write it to the output stream.
    /// The caller must manage disposing the document and the output stream.
    /// </summary>
    /// <param name="file">PDF document</param>
    /// <param name="output">Must be writable</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> or <paramref name="output"/> is null.</exception>
    public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList<int> pagesBundle = null)
    {
        _ = file ?? throw new ArgumentNullException(nameof(file));
        _ = output ?? throw new ArgumentNullException(nameof(output));

        // The builder is created with disposeStream: false, so the output stream is left open for the caller.
        using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter()))
        {
            if (pagesBundle == null)
            {
                // Page numbers are 1-based.
                for (var i = 1; i <= file.NumberOfPages; i++)
                {
                    document.AddPage(file, i);
                }
            }
            else
            {
                foreach (var i in pagesBundle)
                {
                    document.AddPage(file, i);
                }
            }
        }
    }
}
|
||||
}
|
||||
Reference in New Issue
Block a user