mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-11-28 09:28:25 +08:00
#441 option to replace tabs and whitespace with space character in generated text
This commit is contained in:
@@ -3,6 +3,7 @@
|
|||||||
using System;
|
using System;
|
||||||
using System.Text;
|
using System.Text;
|
||||||
using Content;
|
using Content;
|
||||||
|
using System.Collections.Generic;
|
||||||
using Util;
|
using Util;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -10,6 +11,13 @@
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public static class ContentOrderTextExtractor
|
public static class ContentOrderTextExtractor
|
||||||
{
|
{
|
||||||
|
/// <summary>
/// Letter values that <see cref="GetText(Page, Options)"/> swaps for a single space " " when
/// <see cref="Options.ReplaceWhitespaceWithSpace"/> is enabled: tab, vertical tab and form feed.
/// </summary>
private static readonly HashSet<string> ReplaceableWhitespace = new HashSet<string>
|
||||||
|
{
|
||||||
|
"\t",
|
||||||
|
"\v",
|
||||||
|
"\f"
|
||||||
|
};
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Gets a human readable representation of the text from the page based on
|
/// Gets a human readable representation of the text from the page based on
|
||||||
/// the letter order of the original PDF document.
|
/// the letter order of the original PDF document.
|
||||||
@@ -17,7 +25,23 @@
|
|||||||
/// <param name="page">A page from the document.</param>
|
/// <param name="page">A page from the document.</param>
|
||||||
/// <param name="addDoubleNewline">Whether to include a double new-line when the text is likely to be a new paragraph.</param>
|
/// <param name="addDoubleNewline">Whether to include a double new-line when the text is likely to be a new paragraph.</param>
|
||||||
public static string GetText(Page page, bool addDoubleNewline = false)
|
public static string GetText(Page page, bool addDoubleNewline = false)
|
||||||
|
=> GetText(
|
||||||
|
page,
|
||||||
|
new Options
|
||||||
{
|
{
|
||||||
|
SeparateParagraphsWithDoubleNewline = addDoubleNewline
|
||||||
|
});
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Gets a human readable representation of the text from the page based on
|
||||||
|
/// the letter order of the original PDF document.
|
||||||
|
/// </summary>
|
||||||
|
/// <param name="page">A page from the document.</param>
|
||||||
|
/// <param name="options">Control various aspects of the generated text.</param>
|
||||||
|
public static string GetText(Page page, Options options)
|
||||||
|
{
|
||||||
|
options ??= new Options();
|
||||||
|
|
||||||
var sb = new StringBuilder();
|
var sb = new StringBuilder();
|
||||||
|
|
||||||
var previous = default(Letter);
|
var previous = default(Letter);
|
||||||
@@ -31,6 +55,21 @@
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (options.ReplaceWhitespaceWithSpace && ReplaceableWhitespace.Contains(letter.Value))
|
||||||
|
{
|
||||||
|
letter = new Letter(
|
||||||
|
" ",
|
||||||
|
letter.GlyphRectangle,
|
||||||
|
letter.StartBaseLine,
|
||||||
|
letter.EndBaseLine,
|
||||||
|
letter.Width,
|
||||||
|
letter.FontSize,
|
||||||
|
letter.Font,
|
||||||
|
letter.Color,
|
||||||
|
letter.PointSize,
|
||||||
|
letter.TextSequence);
|
||||||
|
}
|
||||||
|
|
||||||
if (letter.Value == " " && !hasJustAddedWhitespace)
|
if (letter.Value == " " && !hasJustAddedWhitespace)
|
||||||
{
|
{
|
||||||
if (previous != null && IsNewline(previous, letter, page, out _))
|
if (previous != null && IsNewline(previous, letter, page, out _))
|
||||||
@@ -58,7 +97,7 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
sb.AppendLine();
|
sb.AppendLine();
|
||||||
if (addDoubleNewline && isDoubleNewline)
|
if (options.SeparateParagraphsWithDoubleNewline && isDoubleNewline)
|
||||||
{
|
{
|
||||||
sb.AppendLine();
|
sb.AppendLine();
|
||||||
}
|
}
|
||||||
@@ -120,5 +159,23 @@
|
|||||||
|
|
||||||
return gap > minPtSize * 0.9;
|
return gap > minPtSize * 0.9;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Options controlling the text generation algorithm.
|
||||||
|
/// </summary>
|
||||||
|
public class Options
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// Whether to include a double new-line when the text is likely to be a new paragraph.
|
||||||
|
/// Default <see langword="false"/>.
|
||||||
|
/// </summary>
|
||||||
|
public bool SeparateParagraphsWithDoubleNewline { get; set; }
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Whether to replace tab, vertical tab and form feed characters with a single space ' '
|
||||||
|
/// character. Default <see langword="false"/>.
|
||||||
|
/// </summary>
|
||||||
|
public bool ReplaceWhitespaceWithSpace { get; set; }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user