mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-01-18 19:51:24 +08:00
Merge pull request #525 from mvantzet/ITokenWriter
Make TokenWriter non-static, implement ITokenWriter, injection in PdfDocumentBuilder, add PdfTextRemover
This commit is contained in:
@@ -217,9 +217,11 @@
|
||||
"UglyToad.PdfPig.Util.DefaultWordExtractor",
|
||||
"UglyToad.PdfPig.Util.DateFormatHelper",
|
||||
"UglyToad.PdfPig.Util.WhitespaceSizeStatistics",
|
||||
"UglyToad.PdfPig.Writer.ITokenWriter",
|
||||
"UglyToad.PdfPig.Writer.PdfAStandard",
|
||||
"UglyToad.PdfPig.Writer.PdfDocumentBuilder",
|
||||
"UglyToad.PdfPig.Writer.PdfMerger",
|
||||
"UglyToad.PdfPig.Writer.PdfTextRemover",
|
||||
"UglyToad.PdfPig.Writer.PdfWriterType",
|
||||
"UglyToad.PdfPig.Writer.PdfPageBuilder",
|
||||
"UglyToad.PdfPig.Writer.TokenWriter",
|
||||
|
||||
@@ -1,73 +1,74 @@
|
||||
namespace UglyToad.PdfPig.Tests.Writer
|
||||
{
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Integration;
|
||||
using PdfPig.Core;
|
||||
using PdfPig.Fonts.Standard14Fonts;
|
||||
using PdfPig.Tokens;
|
||||
using PdfPig.Writer;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Integration;
|
||||
using PdfPig.Core;
|
||||
using PdfPig.Fonts.Standard14Fonts;
|
||||
using PdfPig.Tokens;
|
||||
using PdfPig.Writer;
|
||||
using System.Collections.Generic;
|
||||
using Tests.Fonts.TrueType;
|
||||
using Xunit;
|
||||
|
||||
public class PdfDocumentBuilderTests
|
||||
{
|
||||
[Fact]
|
||||
public void CanWriteSingleBlankPage()
|
||||
{
|
||||
var result = CreateSingleBlankPage();
|
||||
|
||||
WriteFile(nameof(CanWriteSinglePageHelloWorld), result);
|
||||
|
||||
Assert.NotEmpty(result);
|
||||
|
||||
var str = OtherEncodings.BytesAsLatin1String(result);
|
||||
Assert.StartsWith("%PDF", str);
|
||||
Assert.EndsWith("%%EOF", str);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CanCreateSingleCustomPageSize()
|
||||
{
|
||||
var builder = new PdfDocumentBuilder();
|
||||
|
||||
var page = builder.AddPage(120, 250);
|
||||
|
||||
var font = builder.AddStandard14Font(Standard14Font.Helvetica);
|
||||
|
||||
page.AddText("Small page.", 12, new PdfPoint(25, 200), font);
|
||||
|
||||
var bytes = builder.Build();
|
||||
|
||||
WriteFile(nameof(CanCreateSingleCustomPageSize), bytes);
|
||||
|
||||
using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff))
|
||||
{
|
||||
Assert.Equal(1, document.NumberOfPages);
|
||||
|
||||
var page1 = document.GetPage(1);
|
||||
|
||||
Assert.Equal(120, page1.Width);
|
||||
Assert.Equal(250, page1.Height);
|
||||
|
||||
Assert.Equal("Small page.", page1.Text);
|
||||
}
|
||||
public class PdfDocumentBuilderTests
|
||||
{
|
||||
[Fact]
|
||||
public void CanWriteSingleBlankPage()
|
||||
{
|
||||
var result = CreateSingleBlankPage();
|
||||
|
||||
WriteFile(nameof(CanWriteSinglePageHelloWorld), result);
|
||||
|
||||
Assert.NotEmpty(result);
|
||||
|
||||
var str = OtherEncodings.BytesAsLatin1String(result);
|
||||
Assert.StartsWith("%PDF", str);
|
||||
Assert.EndsWith("%%EOF", str);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageAndInheritProps()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
[Fact]
|
||||
public void CanCreateSingleCustomPageSize()
|
||||
{
|
||||
var builder = new PdfDocumentBuilder();
|
||||
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
var page = builder.AddPage(120, 250);
|
||||
|
||||
var font = builder.AddStandard14Font(Standard14Font.Helvetica);
|
||||
|
||||
page.AddText("Small page.", 12, new PdfPoint(25, 200), font);
|
||||
|
||||
var bytes = builder.Build();
|
||||
|
||||
WriteFile(nameof(CanCreateSingleCustomPageSize), bytes);
|
||||
|
||||
using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff))
|
||||
{
|
||||
Assert.Equal(1, document.NumberOfPages);
|
||||
|
||||
var page1 = document.GetPage(1);
|
||||
|
||||
Assert.Equal(120, page1.Width);
|
||||
Assert.Equal(250, page1.Height);
|
||||
|
||||
Assert.Equal("Small page.", page1.Text);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageAndInheritProps()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("inherited_mediabox.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
{
|
||||
output.AddPage(existing, 1);
|
||||
results = output.Build();
|
||||
results = output.Build();
|
||||
}
|
||||
|
||||
using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
|
||||
@@ -75,22 +76,22 @@
|
||||
var pg = rewritted.GetPage(1);
|
||||
Assert.Equal(200, pg.MediaBox.Bounds.Width);
|
||||
Assert.Equal(100, pg.MediaBox.Bounds.Height);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageWithStreamSubtype()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageWithStreamSubtype()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("steam_in_page_dict.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
{
|
||||
output.AddPage(existing, 1);
|
||||
results = output.Build();
|
||||
results = output.Build();
|
||||
}
|
||||
|
||||
using (var rewritted = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
|
||||
@@ -98,26 +99,26 @@
|
||||
// really just checking for no exception...
|
||||
var pg = rewritted.GetPage(1);
|
||||
Assert.NotNull(pg.Content);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageAndStripLinkAnnots()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("outline.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
[Fact]
|
||||
public void CanFastAddPageAndStripLinkAnnots()
|
||||
{
|
||||
var first = IntegrationHelpers.GetDocumentPath("outline.pdf");
|
||||
var contents = File.ReadAllBytes(first);
|
||||
|
||||
var annotCount = 0;
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
byte[] results = null;
|
||||
using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
|
||||
using (var output = new PdfDocumentBuilder())
|
||||
{
|
||||
output.AddPage(existing, 1);
|
||||
results = output.Build();
|
||||
var pg = existing.GetPage(1);
|
||||
var annots = pg.ExperimentalAccess.GetAnnotations().ToList();
|
||||
annotCount = annots.Count;
|
||||
Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link);
|
||||
Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link);
|
||||
}
|
||||
|
||||
using (var rewritten = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
|
||||
@@ -126,7 +127,7 @@
|
||||
var annots = pg.ExperimentalAccess.GetAnnotations().ToList();
|
||||
Assert.Equal(annotCount - 1, annots.Count);
|
||||
Assert.DoesNotContain(annots, x => x.Type == Annotations.AnnotationType.Link);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -1114,6 +1115,27 @@
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Copies every page of a sample document through a <see cref="PdfDocumentBuilder"/> that was given a
/// custom <see cref="ITokenWriter"/> and checks the calls the custom writer recorded.
/// </summary>
[Fact]
public void CanUseCustomTokenWriter()
{
    var docPath = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf");
    var tw = new TestTokenWriter();

    using (var doc = PdfDocument.Open(docPath))
    using (var ms = new MemoryStream())
    using (var builder = new PdfDocumentBuilder(ms, tokenWriter: tw))
    {
        for (var i = 1; i <= doc.NumberOfPages; i++)
        {
            builder.AddPage(doc, i);
        }

        builder.Build();
    }

    // xUnit expects (expected, actual); the original order produced a misleading failure message.
    Assert.Equal(0, tw.Objects); // No objects in sample file
    Assert.True(tw.Tokens > 1000); // Roughly 1065
    Assert.True(tw.WroteCrossReferenceTable);
}
|
||||
|
||||
private static void WriteFile(string name, byte[] bytes, string extension = "pdf")
|
||||
{
|
||||
try
|
||||
@@ -1133,4 +1155,29 @@
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// <see cref="ITokenWriter"/> stub for tests: it writes nothing to the output stream and only
/// records how often each member was invoked.
/// </summary>
public class TestTokenWriter : ITokenWriter
{
    /// <summary>Number of <see cref="WriteToken"/> calls received so far.</summary>
    public int Tokens { get; private set; }

    /// <summary>Number of <see cref="WriteObject"/> calls received so far.</summary>
    public int Objects { get; private set; }

    /// <summary>Set once <see cref="WriteCrossReferenceTable"/> has been called.</summary>
    public bool WroteCrossReferenceTable { get; private set; }

    /// <summary>Counts the call; the token and stream are ignored.</summary>
    public void WriteToken(IToken token, Stream outputStream) => Tokens += 1;

    /// <summary>Counts the call; all arguments are ignored.</summary>
    public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) => Objects += 1;

    /// <summary>Flags that the cross-reference table was requested; all arguments are ignored.</summary>
    public void WriteCrossReferenceTable(
        IReadOnlyDictionary<IndirectReference, long> objectOffsets,
        IndirectReference catalogToken,
        Stream outputStream,
        IndirectReference? documentInformationReference) => WroteCrossReferenceTable = true;
}
|
||||
}
|
||||
|
||||
35
src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs
Normal file
35
src/UglyToad.PdfPig.Tests/Writer/PdfTextRemoverTests.cs
Normal file
@@ -0,0 +1,35 @@
|
||||
using UglyToad.PdfPig.Tests.Integration;
|
||||
using UglyToad.PdfPig.Writer;
|
||||
using System.IO;
|
||||
using Xunit;
|
||||
|
||||
namespace UglyToad.PdfPig.Tests.Writer
|
||||
{
|
||||
/// <summary>
/// Tests for <see cref="PdfTextRemover"/>.
/// </summary>
public class PdfTextRemoverTests
{
    /// <summary>
    /// Checks that the text-removed copy keeps the page count of the source document and that every
    /// page which had text in the source has none in the copy.
    /// </summary>
    /// <param name="file">Name of the integration document, resolved through <see cref="IntegrationHelpers"/>.</param>
    [Theory]
    [InlineData("Two Page Text Only - from libre office.pdf")]
    [InlineData("cat-genetics.pdf")]
    [InlineData("Motor Insurance claim form.pdf")]
    [InlineData("Single Page Images - from libre office.pdf")]
    public void TextRemoverRemovesText(string file)
    {
        var filePath = IntegrationHelpers.GetDocumentPath(file);
        using (var document = PdfDocument.Open(filePath))
        {
            // The result is checked in memory only: the former debug dump to a hard-coded
            // C:\temp path made the test fail wherever that directory does not exist.
            var withoutText = PdfTextRemover.RemoveText(filePath);

            using (var documentWithoutText = PdfDocument.Open(withoutText))
            {
                Assert.Equal(document.NumberOfPages, documentWithoutText.NumberOfPages);

                for (var i = 1; i <= documentWithoutText.NumberOfPages; i++)
                {
                    // xUnit argument order is (expected, actual).
                    Assert.NotEqual(string.Empty, document.GetPage(i).Text);
                    Assert.Equal(string.Empty, documentWithoutText.GetPage(i).Text);
                }
            }
        }
    }
}
|
||||
}
|
||||
@@ -12,10 +12,11 @@
|
||||
[Fact]
|
||||
public void EscapeSpecialCharacter()
|
||||
{
|
||||
var writer = new TokenWriter();
|
||||
using (var memStream = new MemoryStream())
|
||||
{
|
||||
TokenWriter.WriteToken(new StringToken("\\"), memStream);
|
||||
TokenWriter.WriteToken(new StringToken("(Hello)"), memStream);
|
||||
writer.WriteToken(new StringToken("\\"), memStream);
|
||||
writer.WriteToken(new StringToken("(Hello)"), memStream);
|
||||
|
||||
// Read Test
|
||||
memStream.Position = 0;
|
||||
|
||||
@@ -10,6 +10,8 @@
|
||||
/// </summary>
|
||||
public class BeginMarkedContent : IGraphicsStateOperation
|
||||
{
|
||||
private static readonly TokenWriter TokenWriter = new TokenWriter();
|
||||
|
||||
/// <summary>
|
||||
/// The symbol for this operation in a stream.
|
||||
/// </summary>
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
/// </summary>
|
||||
public class BeginMarkedContentWithProperties : IGraphicsStateOperation
|
||||
{
|
||||
private static readonly TokenWriter TokenWriter = new TokenWriter();
|
||||
|
||||
/// <summary>
|
||||
/// The symbol for this operation in a stream.
|
||||
/// </summary>
|
||||
|
||||
@@ -10,6 +10,8 @@
|
||||
/// </summary>
|
||||
public class DesignateMarkedContentPoint : IGraphicsStateOperation
|
||||
{
|
||||
private static readonly TokenWriter TokenWriter = new TokenWriter();
|
||||
|
||||
/// <summary>
|
||||
/// The symbol for this operation in a stream.
|
||||
/// </summary>
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
/// </summary>
|
||||
public class DesignateMarkedContentPointWithProperties : IGraphicsStateOperation
|
||||
{
|
||||
private static readonly TokenWriter TokenWriter = new TokenWriter();
|
||||
|
||||
/// <summary>
|
||||
/// The symbol for this operation in a stream.
|
||||
/// </summary>
|
||||
|
||||
@@ -13,6 +13,8 @@
|
||||
/// </summary>
|
||||
public class SetNonStrokeColorAdvanced : IGraphicsStateOperation
|
||||
{
|
||||
private static readonly TokenWriter TokenWriter = new TokenWriter();
|
||||
|
||||
/// <summary>
|
||||
/// The symbol for this operation in a stream.
|
||||
/// </summary>
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
/// </summary>
|
||||
public class SetNonStrokeColorSpace : IGraphicsStateOperation
|
||||
{
|
||||
private static readonly TokenWriter TokenWriter = new TokenWriter();
|
||||
|
||||
/// <summary>
|
||||
/// The symbol for this operation in a stream.
|
||||
/// </summary>
|
||||
|
||||
@@ -13,6 +13,8 @@
|
||||
/// </summary>
|
||||
public class SetStrokeColorAdvanced : IGraphicsStateOperation
|
||||
{
|
||||
private static readonly TokenWriter TokenWriter = new TokenWriter();
|
||||
|
||||
/// <summary>
|
||||
/// The symbol for this operation in a stream.
|
||||
/// </summary>
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
/// </summary>
|
||||
public class SetStrokeColorSpace : IGraphicsStateOperation
|
||||
{
|
||||
private static readonly TokenWriter TokenWriter = new TokenWriter();
|
||||
|
||||
/// <summary>
|
||||
/// The symbol for this operation in a stream.
|
||||
/// </summary>
|
||||
|
||||
@@ -15,6 +15,8 @@
|
||||
/// </summary>
|
||||
public class ShowTextsWithPositioning : IGraphicsStateOperation
|
||||
{
|
||||
private static readonly TokenWriter TokenWriter = new TokenWriter();
|
||||
|
||||
/// <summary>
|
||||
/// The symbol for this operation in a stream.
|
||||
/// </summary>
|
||||
|
||||
@@ -96,8 +96,6 @@
|
||||
}
|
||||
|
||||
return CharacterIdentifierSystemInfo;
|
||||
|
||||
throw new InvalidOperationException("The Character Identifier System Information was never set.");
|
||||
}
|
||||
|
||||
public void UseCMap(CMap other)
|
||||
|
||||
@@ -14,6 +14,8 @@
|
||||
private const string DictToken = "dict";
|
||||
private const string FindResourceToken = "findresource";
|
||||
|
||||
private static readonly TokenWriter TokenWriter = new TokenWriter();
|
||||
|
||||
public static IReadOnlyList<byte> ConvertToCMapStream(IReadOnlyDictionary<char, byte> unicodeToCharacterCode)
|
||||
{
|
||||
using (var memoryStream = new MemoryStream())
|
||||
|
||||
37
src/UglyToad.PdfPig/Writer/ITokenWriter.cs
Normal file
37
src/UglyToad.PdfPig/Writer/ITokenWriter.cs
Normal file
@@ -0,0 +1,37 @@
|
||||
namespace UglyToad.PdfPig.Writer;
|
||||
|
||||
using Core;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using Tokens;
|
||||
|
||||
/// <summary>
/// Serializes <see cref="IToken"/> instances, pre-serialized objects and the cross-reference table
/// into PDF document format on an output stream.
/// </summary>
public interface ITokenWriter
{
    /// <summary>
    /// Writes a single token to the stream in the correct PDF format and encoding,
    /// including whitespace and line breaks as applicable.
    /// </summary>
    /// <param name="token">The token to write to the stream.</param>
    /// <param name="outputStream">The stream to write the token to.</param>
    void WriteToken(IToken token, Stream outputStream);

    /// <summary>
    /// Writes an already serialized token to the stream as an object token.
    /// </summary>
    /// <param name="objectNumber">Object number of the indirect object.</param>
    /// <param name="generation">Generation of the indirect object.</param>
    /// <param name="data">Pre-serialized object contents.</param>
    /// <param name="outputStream">The stream to write the token to.</param>
    void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream);

    /// <summary>
    /// Writes a valid single-section cross-reference (xref) table followed by the trailer dictionary
    /// for the given set of object offsets.
    /// </summary>
    /// <param name="objectOffsets">The byte offset from the start of the document for each object in the document.</param>
    /// <param name="catalogToken">The object representing the catalog dictionary which is referenced from the trailer dictionary.</param>
    /// <param name="outputStream">The output stream to write to.</param>
    /// <param name="documentInformationReference">The object reference for the document information dictionary if present.</param>
    void WriteCrossReferenceTable(
        IReadOnlyDictionary<IndirectReference, long> objectOffsets,
        IndirectReference catalogToken,
        Stream outputStream,
        IndirectReference? documentInformationReference);
}
|
||||
97
src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
Normal file
97
src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
Normal file
@@ -0,0 +1,97 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.Core;
|
||||
using UglyToad.PdfPig.Filters;
|
||||
using UglyToad.PdfPig.Graphics.Operations.TextShowing;
|
||||
using UglyToad.PdfPig.Graphics.Operations;
|
||||
using UglyToad.PdfPig.Graphics;
|
||||
using UglyToad.PdfPig.Logging;
|
||||
using UglyToad.PdfPig.Parser;
|
||||
using UglyToad.PdfPig.Tokens;
|
||||
|
||||
namespace UglyToad.PdfPig.Writer
|
||||
{
|
||||
/// <summary>
/// A <see cref="TokenWriter"/> that leaves <see cref="ShowText"/> and <see cref="ShowTextsWithPositioning"/>
/// operations out of the streams it writes.
/// </summary>
internal class NoTextTokenWriter : TokenWriter
{
    /// <summary>
    /// Writes the stream dictionary and data. When a text-free copy of the stream can be produced it is
    /// written in place of the original; otherwise the original stream is written unchanged.
    /// </summary>
    /// <param name="streamToken">The stream to write.</param>
    /// <param name="outputStream">The stream to write to.</param>
    protected override void WriteStream(StreamToken streamToken, Stream outputStream)
    {
        var tokenToWrite = TryGetStreamWithoutText(streamToken, out var textFree)
            ? textFree
            : streamToken;

        WriteDictionary(tokenToWrite.StreamDictionary, outputStream);
        WriteLineBreak(outputStream);

        outputStream.Write(StreamStart, 0, StreamStart.Length);
        WriteLineBreak(outputStream);

        var payload = tokenToWrite.Data.ToArray();
        outputStream.Write(payload, 0, tokenToWrite.Data.Count);
        WriteLineBreak(outputStream);

        outputStream.Write(StreamEnd, 0, StreamEnd.Length);
    }

    /// <summary>
    /// Attempts to build a copy of the stream with the <see cref="ShowText"/> and
    /// <see cref="ShowTextsWithPositioning"/> operations removed.
    /// </summary>
    /// <param name="streamToken">The stream to inspect.</param>
    /// <param name="outputStreamToken">The text-free stream on success, otherwise null.</param>
    /// <returns>
    /// true if at least one text operation was found and removed; false if none was found or the
    /// stream could not be decoded or parsed as page content.
    /// </returns>
    private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken outputStreamToken)
    {
        outputStreamToken = null;

        var filters = new FilterProviderWithLookup(DefaultFilterProvider.Instance);
        IReadOnlyList<byte> decoded;
        try
        {
            decoded = streamToken.Decode(filters);
        }
        catch
        {
            // Stream cannot be decoded: report "no text" so the caller writes it untouched.
            return false;
        }

        var parser = new PageContentParser(new ReflectionGraphicsStateOperationFactory());
        IReadOnlyList<IGraphicsStateOperation> operations;
        try
        {
            operations = parser.Parse(1, new ByteArrayInputBytes(decoded), new NoOpLog());
        }
        catch (Exception)
        {
            // Stream does not parse as page content: same fallback as above.
            return false;
        }

        using (var buffer = new MemoryStream())
        {
            var removedText = false;
            foreach (var operation in operations)
            {
                var isTextOperation = operation.Operator == ShowText.Symbol
                    || operation.Operator == ShowTextsWithPositioning.Symbol;

                if (isTextOperation)
                {
                    removedText = true;
                }
                else
                {
                    operation.Write(buffer);
                }
            }

            if (!removedText)
            {
                return false;
            }

            buffer.Seek(0, SeekOrigin.Begin);
            outputStreamToken = DataCompresser.CompressToStream(buffer.ToArray());
            return true;
        }
    }
}
|
||||
}
|
||||
@@ -8,7 +8,7 @@
|
||||
{
|
||||
private readonly Dictionary<byte[], IndirectReferenceToken> hashes = new Dictionary<byte[], IndirectReferenceToken>(new FNVByteComparison());
|
||||
|
||||
public PdfDedupStreamWriter(Stream stream, bool dispose) : base(stream, dispose)
|
||||
public PdfDedupStreamWriter(Stream stream, bool dispose, ITokenWriter tokenWriter = null) : base(stream, dispose, tokenWriter)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ namespace UglyToad.PdfPig.Writer
|
||||
private readonly Dictionary<int, PdfPageBuilder> pages = new Dictionary<int, PdfPageBuilder>();
|
||||
private readonly Dictionary<Guid, FontStored> fonts = new Dictionary<Guid, FontStored>();
|
||||
private bool completed = false;
|
||||
internal int fontId = 0;
|
||||
private int fontId = 0;
|
||||
|
||||
private readonly static ArrayToken DefaultProcSet = new ArrayToken(new List<NameToken>
|
||||
{
|
||||
@@ -89,20 +89,21 @@ namespace UglyToad.PdfPig.Writer
|
||||
/// <param name="disposeStream">If stream should be disposed when builder is.</param>
|
||||
/// <param name="type">Type of pdf stream writer to use</param>
|
||||
/// <param name="version">Pdf version to use in header.</param>
|
||||
public PdfDocumentBuilder(Stream stream, bool disposeStream = false, PdfWriterType type = PdfWriterType.Default, decimal version = 1.7m)
|
||||
/// <param name="tokenWriter">Token writer to use</param>
|
||||
public PdfDocumentBuilder(Stream stream, bool disposeStream = false, PdfWriterType type = PdfWriterType.Default, decimal version = 1.7m, ITokenWriter tokenWriter = null)
|
||||
{
|
||||
switch (type)
|
||||
{
|
||||
case PdfWriterType.ObjectInMemoryDedup:
|
||||
context = new PdfDedupStreamWriter(stream, disposeStream);
|
||||
context = new PdfDedupStreamWriter(stream, disposeStream, tokenWriter);
|
||||
break;
|
||||
default:
|
||||
context = new PdfStreamWriter(stream, disposeStream);
|
||||
context = new PdfStreamWriter(stream, disposeStream, tokenWriter);
|
||||
break;
|
||||
}
|
||||
context.InitializePdf(version);
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Determines whether the bytes of the TrueType font file provided can be used in a PDF document.
|
||||
/// </summary>
|
||||
|
||||
@@ -18,16 +18,17 @@
|
||||
protected bool DisposeStream { get; set; }
|
||||
protected bool Initialized { get; set; }
|
||||
protected int CurrentNumber { get; set; } = 1;
|
||||
protected readonly ITokenWriter TokenWriter;
|
||||
|
||||
internal PdfStreamWriter(Stream baseStream, bool disposeStream = true)
|
||||
internal PdfStreamWriter(Stream baseStream, bool disposeStream = true, ITokenWriter tokenWriter = null)
|
||||
{
|
||||
Stream = baseStream ?? throw new ArgumentNullException(nameof(baseStream));
|
||||
if (!baseStream.CanWrite)
|
||||
{
|
||||
throw new ArgumentException("Output stream must be writable");
|
||||
}
|
||||
|
||||
DisposeStream = disposeStream;
|
||||
TokenWriter = tokenWriter ?? new TokenWriter();
|
||||
}
|
||||
|
||||
public Stream Stream { get; protected set; }
|
||||
|
||||
100
src/UglyToad.PdfPig/Writer/PdfTextRemover.cs
Normal file
100
src/UglyToad.PdfPig/Writer/PdfTextRemover.cs
Normal file
@@ -0,0 +1,100 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
|
||||
namespace UglyToad.PdfPig.Writer
|
||||
{
|
||||
/// <summary>
/// Class to remove text from PDFs, useful as a preprocessing step for Optical Character Recognition (OCR)
/// </summary>
public static class PdfTextRemover
{
    /// <summary>
    /// Return PDF without text as bytes
    /// </summary>
    /// <param name="filePath">Path to PDF</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <returns>PDF without text (as a byte array)</returns>
    public static byte[] RemoveText(string filePath, IReadOnlyList<int> pagesBundle = null)
    {
        using (var output = new MemoryStream())
        {
            RemoveText(output, filePath, pagesBundle);
            return output.ToArray();
        }
    }

    /// <summary>
    /// Write PDF without text to the output stream. The caller must manage disposing the output stream.
    /// </summary>
    /// <param name="output">Must be writable</param>
    /// <param name="filePath">Path to PDF</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    public static void RemoveText(Stream output, string filePath, IReadOnlyList<int> pagesBundle = null)
    {
        using (var stream = File.OpenRead(filePath))
        {
            RemoveText(stream, output, pagesBundle);
        }
    }

    /// <summary>
    /// Remove text from the PDF (passed in as a byte array) and return it as a new byte array
    /// </summary>
    /// <param name="file">PDF document (as byte array)</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <returns>PDF without text (as a byte array)</returns>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
    public static byte[] RemoveText(byte[] file, IReadOnlyList<int> pagesBundle = null)
    {
        _ = file ?? throw new ArgumentNullException(nameof(file));

        // The document opened here is owned by this method, so it is disposed here.
        using (var document = PdfDocument.Open(file))
        using (var output = new MemoryStream())
        {
            RemoveText(document, output, pagesBundle);
            return output.ToArray();
        }
    }

    /// <summary>
    /// Remove text from the PDF in the input stream and write it to the output stream.
    /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
    /// </summary>
    /// <param name="stream">Streams for the file contents, this must support reading and seeking.</param>
    /// <param name="output">Must be writable</param>
    /// <param name="pagesBundle">List of pages to emit; if null all pages are emitted</param>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> or <paramref name="output"/> is null.</exception>
    public static void RemoveText(Stream stream, Stream output, IReadOnlyList<int> pagesBundle = null)
    {
        _ = stream ?? throw new ArgumentNullException(nameof(stream));
        _ = output ?? throw new ArgumentNullException(nameof(output));

        // Dispose the document created here; per the contract above this leaves the caller's stream open.
        using (var document = PdfDocument.Open(stream))
        {
            RemoveText(document, output, pagesBundle);
        }
    }

    /// <summary>
    /// Remove text from the PDF and write it to the output stream.
    /// The caller must manage disposing the document and the output stream.
    /// </summary>
    /// <param name="file">PDF document</param>
    /// <param name="output">Must be writable</param>
    /// <param name="pagesBundle">
    /// List of pages to emit, passed to the builder's AddPage as given; if null all pages
    /// (1 to <see cref="PdfDocument.NumberOfPages"/>) are emitted
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="file"/> or <paramref name="output"/> is null.</exception>
    public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList<int> pagesBundle = null)
    {
        _ = file ?? throw new ArgumentNullException(nameof(file));
        _ = output ?? throw new ArgumentNullException(nameof(output));

        using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter()))
        {
            if (pagesBundle == null)
            {
                for (var i = 1; i <= file.NumberOfPages; i++)
                {
                    document.AddPage(file, i);
                }
            }
            else
            {
                foreach (var i in pagesBundle)
                {
                    document.AddPage(file, i);
                }
            }
        }
    }
}
|
||||
}
|
||||
@@ -14,10 +14,8 @@
|
||||
/// <summary>
|
||||
/// Writes any type of <see cref="IToken"/> to the corresponding PDF document format output.
|
||||
/// </summary>
|
||||
public class TokenWriter
|
||||
public class TokenWriter : ITokenWriter
|
||||
{
|
||||
private static readonly byte Backslash = GetByte("\\");
|
||||
|
||||
private static readonly byte ArrayStart = GetByte("[");
|
||||
private static readonly byte ArrayEnd = GetByte("]");
|
||||
|
||||
@@ -46,10 +44,18 @@
|
||||
|
||||
private static readonly byte[] StartXref = OtherEncodings.StringAsLatin1Bytes("startxref");
|
||||
|
||||
private static readonly byte[] StreamStart = OtherEncodings.StringAsLatin1Bytes("stream");
|
||||
private static readonly byte[] StreamEnd = OtherEncodings.StringAsLatin1Bytes("endstream");
|
||||
/// <summary>
|
||||
/// Bytes that indicate start of stream
|
||||
/// </summary>
|
||||
protected static readonly byte[] StreamStart = OtherEncodings.StringAsLatin1Bytes("stream");
|
||||
|
||||
/// <summary>
|
||||
/// Bytes that indicate end of stream
|
||||
/// </summary>
|
||||
protected static readonly byte[] StreamEnd = OtherEncodings.StringAsLatin1Bytes("endstream");
|
||||
|
||||
private static readonly byte StringStart = GetByte("(");
|
||||
|
||||
private static readonly byte StringEnd = GetByte(")");
|
||||
|
||||
private static readonly byte[] Trailer = OtherEncodings.StringAsLatin1Bytes("trailer");
|
||||
@@ -79,7 +85,7 @@
|
||||
/// </summary>
|
||||
/// <param name="token">The token to write to the stream.</param>
|
||||
/// <param name="outputStream">The stream to write the token to.</param>
|
||||
public static void WriteToken(IToken token, Stream outputStream)
|
||||
public void WriteToken(IToken token, Stream outputStream)
|
||||
{
|
||||
if (token == null)
|
||||
{
|
||||
@@ -129,14 +135,8 @@
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes a valid single section cross-reference (xref) table plus trailer dictionary to the output for the set of object offsets.
|
||||
/// </summary>
|
||||
/// <param name="objectOffsets">The byte offset from the start of the document for each object in the document.</param>
|
||||
/// <param name="catalogToken">The object representing the catalog dictionary which is referenced from the trailer dictionary.</param>
|
||||
/// <param name="outputStream">The output stream to write to.</param>
|
||||
/// <param name="documentInformationReference">The object reference for the document information dictionary if present.</param>
|
||||
internal static void WriteCrossReferenceTable(IReadOnlyDictionary<IndirectReference, long> objectOffsets,
|
||||
/// <inheritdoc cref="ITokenWriter.WriteCrossReferenceTable" />
|
||||
public void WriteCrossReferenceTable(IReadOnlyDictionary<IndirectReference, long> objectOffsets,
|
||||
IndirectReference catalogToken,
|
||||
Stream outputStream,
|
||||
IndirectReference? documentInformationReference)
|
||||
@@ -271,14 +271,8 @@
|
||||
outputStream.Write(Eof, 0, Eof.Length);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes pre-serialized token as an object token to the output stream.
|
||||
/// </summary>
|
||||
/// <param name="objectNumber">Object number of the indirect object.</param>
|
||||
/// <param name="generation">Generation of the indirect object.</param>
|
||||
/// <param name="data">Pre-serialized object contents.</param>
|
||||
/// <param name="outputStream">The stream to write the token to.</param>
|
||||
internal static void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream)
|
||||
/// <inheritdoc cref="ITokenWriter.WriteObject" />
|
||||
public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream)
|
||||
{
|
||||
WriteLong(objectNumber, outputStream);
|
||||
WriteWhitespace(outputStream);
|
||||
@@ -297,14 +291,24 @@
|
||||
WriteLineBreak(outputStream);
|
||||
}
|
||||
|
||||
private static void WriteHex(HexToken hex, Stream stream)
|
||||
/// <summary>
|
||||
/// Writes a hex token to the output stream: the <c>HexStart</c> byte, the text returned by <c>hex.GetHexString()</c>, then the <c>HexEnd</c> byte. No trailing whitespace is written.
|
||||
/// </summary>
|
||||
/// <param name="hex">The hex token whose hex string is written between the start and end delimiters.</param>
|
||||
/// <param name="stream">The stream to write the token to.</param>
|
||||
protected void WriteHex(HexToken hex, Stream stream)
|
||||
{
|
||||
stream.WriteByte(HexStart);
|
||||
stream.WriteText(hex.GetHexString());
|
||||
stream.WriteByte(HexEnd);
|
||||
}
|
||||
|
||||
private static void WriteArray(ArrayToken array, Stream outputStream)
|
||||
/// <summary>
|
||||
/// Write an array to the output stream, with whitespace at the end.
|
||||
/// </summary>
|
||||
/// <param name="array"></param>
|
||||
/// <param name="outputStream"></param>
|
||||
protected void WriteArray(ArrayToken array, Stream outputStream)
|
||||
{
|
||||
outputStream.WriteByte(ArrayStart);
|
||||
WriteWhitespace(outputStream);
|
||||
@@ -319,14 +323,24 @@
|
||||
WriteWhitespace(outputStream);
|
||||
}
|
||||
|
||||
private static void WriteBoolean(BooleanToken boolean, Stream outputStream)
|
||||
/// <summary>
|
||||
/// Writes a boolean to the output stream as <c>TrueBytes</c> or <c>FalseBytes</c> (selected by the token's <c>Data</c> value), followed by whitespace.
|
||||
/// </summary>
|
||||
/// <param name="boolean">The boolean token whose <c>Data</c> value selects which byte sequence is written.</param>
|
||||
/// <param name="outputStream">The stream to write the token to.</param>
|
||||
protected void WriteBoolean(BooleanToken boolean, Stream outputStream)
|
||||
{
|
||||
var bytes = boolean.Data ? TrueBytes : FalseBytes;
|
||||
outputStream.Write(bytes, 0, bytes.Length);
|
||||
WriteWhitespace(outputStream);
|
||||
}
|
||||
|
||||
private static void WriteComment(CommentToken comment, Stream outputStream)
|
||||
/// <summary>
|
||||
/// Write a "%comment" in the output stream, with a line break at the end.
|
||||
/// </summary>
|
||||
/// <param name="comment"></param>
|
||||
/// <param name="outputStream"></param>
|
||||
protected void WriteComment(CommentToken comment, Stream outputStream)
|
||||
{
|
||||
var bytes = OtherEncodings.StringAsLatin1Bytes(comment.Data);
|
||||
outputStream.WriteByte(Comment);
|
||||
@@ -334,7 +348,12 @@
|
||||
WriteLineBreak(outputStream);
|
||||
}
|
||||
|
||||
private static void WriteDictionary(DictionaryToken dictionary, Stream outputStream)
|
||||
/// <summary>
|
||||
/// Writes dictionary key/value pairs to the output stream as Name/Token pairs.
|
||||
/// </summary>
|
||||
/// <param name="dictionary"></param>
|
||||
/// <param name="outputStream"></param>
|
||||
protected void WriteDictionary(DictionaryToken dictionary, Stream outputStream)
|
||||
{
|
||||
outputStream.Write(DictionaryStart, 0, DictionaryStart.Length);
|
||||
|
||||
@@ -356,7 +375,12 @@
|
||||
outputStream.Write(DictionaryEnd, 0, DictionaryEnd.Length);
|
||||
}
|
||||
|
||||
private static void WriteIndirectReference(IndirectReferenceToken reference, Stream outputStream)
|
||||
/// <summary>
|
||||
/// Write an indirect reference to the stream, with whitespace at the end.
|
||||
/// </summary>
|
||||
/// <param name="reference"></param>
|
||||
/// <param name="outputStream"></param>
|
||||
protected virtual void WriteIndirectReference(IndirectReferenceToken reference, Stream outputStream)
|
||||
{
|
||||
WriteLong(reference.Data.ObjectNumber, outputStream);
|
||||
WriteWhitespace(outputStream);
|
||||
@@ -368,12 +392,17 @@
|
||||
WriteWhitespace(outputStream);
|
||||
}
|
||||
|
||||
private static void WriteName(NameToken name, Stream outputStream)
|
||||
/// <summary>
|
||||
/// Writes a name token to the stream, with whitespace at the end, by passing the token's <c>Data</c> string to the string overload of <c>WriteName</c>.
|
||||
/// </summary>
|
||||
/// <param name="name">The name token whose <c>Data</c> string is written.</param>
|
||||
/// <param name="outputStream">The stream to write the name to.</param>
|
||||
protected virtual void WriteName(NameToken name, Stream outputStream)
|
||||
{
|
||||
WriteName(name.Data, outputStream);
|
||||
}
|
||||
|
||||
private static void WriteName(string name, Stream outputStream)
|
||||
private void WriteName(string name, Stream outputStream)
|
||||
{
|
||||
/*
|
||||
* Beginning with PDF 1.2, any character except null (character code 0) may be
|
||||
@@ -404,7 +433,12 @@
|
||||
WriteWhitespace(outputStream);
|
||||
}
|
||||
|
||||
private static void WriteNumber(NumericToken number, Stream outputStream)
|
||||
/// <summary>
|
||||
/// Write a number to the stream, with whitespace at the end.
|
||||
/// </summary>
|
||||
/// <param name="number"></param>
|
||||
/// <param name="outputStream"></param>
|
||||
protected virtual void WriteNumber(NumericToken number, Stream outputStream)
|
||||
{
|
||||
if (!number.HasDecimalPlaces)
|
||||
{
|
||||
@@ -419,7 +453,15 @@
|
||||
WriteWhitespace(outputStream);
|
||||
}
|
||||
|
||||
private static void WriteObject(ObjectToken objectToken, Stream outputStream)
|
||||
/// <summary>
|
||||
/// Write an object to the stream, with a line break at the end. It writes the following contents:
|
||||
/// - "[ObjectNumber] [Generation] obj"
|
||||
/// - Object data
|
||||
/// - "endobj"
|
||||
/// </summary>
|
||||
/// <param name="objectToken"></param>
|
||||
/// <param name="outputStream"></param>
|
||||
protected virtual void WriteObject(ObjectToken objectToken, Stream outputStream)
|
||||
{
|
||||
WriteLong(objectToken.Number.ObjectNumber, outputStream);
|
||||
WriteWhitespace(outputStream);
|
||||
@@ -438,7 +480,16 @@
|
||||
WriteLineBreak(outputStream);
|
||||
}
|
||||
|
||||
private static void WriteStream(StreamToken streamToken, Stream outputStream)
|
||||
/// <summary>
|
||||
/// Write a stream token to the output stream, with the following contents:
|
||||
/// - Dictionary specifying the length of the stream, any applied compression filters and additional information.
|
||||
/// - Stream start indicator
|
||||
/// - Bytes in the StreamToken data
|
||||
/// - Stream end indicator
|
||||
/// </summary>
|
||||
/// <param name="streamToken"></param>
|
||||
/// <param name="outputStream"></param>
|
||||
protected virtual void WriteStream(StreamToken streamToken, Stream outputStream)
|
||||
{
|
||||
WriteDictionary(streamToken.StreamDictionary, outputStream);
|
||||
WriteLineBreak(outputStream);
|
||||
@@ -449,15 +500,22 @@
|
||||
outputStream.Write(StreamEnd, 0, StreamEnd.Length);
|
||||
}
|
||||
|
||||
private static int[] EscapeNeeded = new int[]
|
||||
// Character codes paired by index with the letters in Escaped ('\r' -> 'r', '\n' -> 'n', ...).
// NOTE(review): the consumer is not visible in this hunk - presumably WriteString's escaping; confirm.
private static readonly int[] EscapeNeeded = new int[]
|
||||
{
|
||||
'\r', '\n', '\t', '\b', '\f', '\\'
|
||||
};
|
||||
private static int[] Escaped = new int[]
|
||||
|
||||
// Replacement characters paired by index with the entries of EscapeNeeded ('r' for '\r', 'n' for '\n', ...).
// NOTE(review): presumably emitted after a backslash when writing strings - the consumer is not visible here; confirm.
private static readonly int[] Escaped = new int[]
|
||||
{
|
||||
'r', 'n', 't', 'b', 'f', '\\'
|
||||
};
|
||||
private static void WriteString(StringToken stringToken, Stream outputStream)
|
||||
|
||||
/// <summary>
|
||||
/// Write a string to the stream, with whitespace at the end.
|
||||
/// </summary>
|
||||
/// <param name="stringToken"></param>
|
||||
/// <param name="outputStream"></param>
|
||||
protected virtual void WriteString(StringToken stringToken, Stream outputStream)
|
||||
{
|
||||
outputStream.WriteByte(StringStart);
|
||||
|
||||
@@ -515,29 +573,47 @@
|
||||
WriteWhitespace(outputStream);
|
||||
}
|
||||
|
||||
private static void WriteInt(int value, Stream outputStream)
|
||||
/// <summary>
|
||||
/// Writes an integer to the stream as text, formatted with the "G" specifier and the invariant culture. No trailing whitespace is written.
|
||||
/// </summary>
|
||||
/// <param name="value">The integer value to write.</param>
|
||||
/// <param name="outputStream">The stream to write the formatted value to.</param>
|
||||
protected virtual void WriteInt(int value, Stream outputStream)
|
||||
{
|
||||
var bytes = OtherEncodings.StringAsLatin1Bytes(value.ToString("G", CultureInfo.InvariantCulture));
|
||||
outputStream.Write(bytes, 0, bytes.Length);
|
||||
}
|
||||
|
||||
private static void WriteLineBreak(Stream outputStream)
|
||||
/// <summary>
|
||||
/// Writes a line break to the output stream by calling <c>WriteNewLine</c> on it.
|
||||
/// </summary>
|
||||
/// <param name="outputStream">The stream to write the line break to.</param>
|
||||
protected virtual void WriteLineBreak(Stream outputStream)
|
||||
{
|
||||
outputStream.WriteNewLine();
|
||||
}
|
||||
|
||||
private static void WriteLong(long value, Stream outputStream)
|
||||
/// <summary>
|
||||
/// Writes a long to the stream as text, formatted with the "G" specifier and the invariant culture. No trailing whitespace is written.
|
||||
/// </summary>
|
||||
/// <param name="value">The long value to write.</param>
|
||||
/// <param name="outputStream">The stream to write the formatted value to.</param>
|
||||
protected virtual void WriteLong(long value, Stream outputStream)
|
||||
{
|
||||
var bytes = OtherEncodings.StringAsLatin1Bytes(value.ToString("G", CultureInfo.InvariantCulture));
|
||||
outputStream.Write(bytes, 0, bytes.Length);
|
||||
}
|
||||
|
||||
private static void WriteWhitespace(Stream outputStream)
|
||||
/// <summary>
|
||||
/// Writes the single <c>Whitespace</c> byte to the output stream.
|
||||
/// </summary>
|
||||
/// <param name="outputStream">The stream to write the whitespace byte to.</param>
|
||||
protected virtual void WriteWhitespace(Stream outputStream)
|
||||
{
|
||||
outputStream.WriteByte(Whitespace);
|
||||
}
|
||||
|
||||
private static void WriteFirstXrefEmptyEntry(Stream outputStream)
|
||||
private void WriteFirstXrefEmptyEntry(Stream outputStream)
|
||||
{
|
||||
/*
|
||||
* The first entry in the table (object number 0) is always free and has a generation number of 65,535;
|
||||
@@ -591,5 +667,4 @@
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user