mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
update RecursiveXYCut to use DlaOptions
This commit is contained in:
@@ -7,8 +7,9 @@
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.Geometry;
|
||||
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
|
||||
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
|
||||
/// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
|
||||
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
|
||||
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
|
||||
@@ -20,81 +21,73 @@
|
||||
/// </summary>
|
||||
public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();
|
||||
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// Get the blocks.
|
||||
/// <para>Uses 'minimumWidth' = 0, 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)</para>
|
||||
/// Get the blocks using default options values.
|
||||
/// </summary>
|
||||
/// <param name="pageWords">The words in the page.</param>
|
||||
/// <returns></returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
|
||||
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
|
||||
{
|
||||
return GetBlocks(pageWords, 0);
|
||||
return GetBlocks(words, new RecursiveXYCutOptions());
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// Get the blocks.
|
||||
/// <para>Uses 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)</para>
|
||||
/// Get the blocks using options values.
|
||||
/// </summary>
|
||||
/// <param name="pageWords">The words in the page.</param>
|
||||
/// <param name="minimumWidth">The minimum width for a block.</param>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth)
|
||||
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <param name="options">The <see cref="RecursiveXYCutOptions"/> to use.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
|
||||
{
|
||||
return GetBlocks(pageWords, minimumWidth,
|
||||
(letters) =>
|
||||
if (options is RecursiveXYCutOptions ryxcOptions)
|
||||
{
|
||||
if (words?.Any() != true)
|
||||
{
|
||||
var widths = letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3)));
|
||||
var mode = widths.Mode();
|
||||
if (double.IsNaN(mode) || mode == 0)
|
||||
{
|
||||
mode = widths.Average();
|
||||
}
|
||||
return mode;
|
||||
},
|
||||
(letters) =>
|
||||
{
|
||||
var heights = letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3));
|
||||
var mode = heights.Mode();
|
||||
if (double.IsNaN(mode) || mode == 0)
|
||||
{
|
||||
mode = heights.Average();
|
||||
}
|
||||
return mode * 1.5;
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
);
|
||||
|
||||
return GetBlocks(words,
|
||||
ryxcOptions.MinimumWidth,
|
||||
ryxcOptions.DominantFontWidthFunc,
|
||||
ryxcOptions.DominantFontHeightFunc,
|
||||
ryxcOptions.WordSeparator,
|
||||
ryxcOptions.LineSeparator);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentException("Options provided must be of type " + nameof(RecursiveXYCutOptions) + ".", nameof(options));
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks.
|
||||
/// </summary>
|
||||
/// <param name="pageWords">The words in the page.</param>
|
||||
/// <param name="minimumWidth">The minimum width for a block.</param>
|
||||
/// <param name="dominantFontWidth">The dominant font width.</param>
|
||||
/// <param name="dominantFontHeight">The dominant font height.</param>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth,
|
||||
double dominantFontWidth, double dominantFontHeight)
|
||||
{
|
||||
return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks.
|
||||
/// </summary>
|
||||
/// <param name="pageWords">The words in the page.</param>
|
||||
/// <param name="words">The words in the page.</param>
|
||||
/// <param name="minimumWidth">The minimum width for a block.</param>
|
||||
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
|
||||
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth,
|
||||
/// <param name="wordSeparator">The separator handed to <c>XYLeaf.GetLines</c> when the lines of each block are built.</param>
|
||||
/// <param name="lineSeparator">The separator handed to the <see cref="TextBlock"/> constructor together with each block's lines.</param>
|
||||
private IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, double minimumWidth,
|
||||
Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
|
||||
Func<IEnumerable<Letter>, double> dominantFontHeightFunc)
|
||||
Func<IEnumerable<Letter>, double> dominantFontHeightFunc,
|
||||
string wordSeparator, string lineSeparator)
|
||||
{
|
||||
if (pageWords.Count() == 0) return EmptyArray<TextBlock>.Instance;
|
||||
// Filter out white spaces
|
||||
words = words.Where(w => !string.IsNullOrWhiteSpace(w.Text));
|
||||
if (!words.Any())
|
||||
{
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
|
||||
XYLeaf root = new XYLeaf(pageWords); // Create a root node.
|
||||
XYLeaf root = new XYLeaf(words); // Create a root node.
|
||||
XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
|
||||
|
||||
if (node.IsLeaf)
|
||||
{
|
||||
return new List<TextBlock> { new TextBlock((node as XYLeaf).GetLines()) };
|
||||
return new List<TextBlock> { new TextBlock((node as XYLeaf).GetLines(wordSeparator), lineSeparator) };
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -102,7 +95,7 @@
|
||||
|
||||
if (leaves.Count > 0)
|
||||
{
|
||||
return leaves.Select(l => new TextBlock(l.GetLines())).ToList();
|
||||
return leaves.Select(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator)).ToList();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -114,7 +107,7 @@
|
||||
Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
|
||||
{
|
||||
// Order words left to right
|
||||
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();
|
||||
var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();
|
||||
|
||||
if (words.Length == 0)
|
||||
{
|
||||
@@ -123,7 +116,7 @@
|
||||
|
||||
// Create new leaf with non-whitespace words.
|
||||
leaf = new XYLeaf(words);
|
||||
|
||||
|
||||
if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
|
||||
{
|
||||
// We stop cutting if
|
||||
@@ -195,7 +188,7 @@
|
||||
}
|
||||
if (i == wordsCount - 1) projectionProfile.Add(currentProjection);
|
||||
}
|
||||
|
||||
|
||||
var newLeavesEnums = projectionProfile.Select(p => leaf.Words.Where(w =>
|
||||
{
|
||||
// Get words that are contained in each projection profiles
|
||||
@@ -203,7 +196,7 @@
|
||||
return normalisedBB.Left >= p.LowerBound && normalisedBB.Right <= p.UpperBound;
|
||||
}));
|
||||
|
||||
var newLeaves = newLeavesEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e));
|
||||
var newLeaves = newLeavesEnums.Where(e => e.Any()).Select(e => new XYLeaf(e));
|
||||
var newNodes = newLeaves.Select(l => HorizontalCut(l, minimumWidth,
|
||||
dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
|
||||
|
||||
@@ -221,7 +214,7 @@
|
||||
Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
|
||||
{
|
||||
// Order words bottom to top
|
||||
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();
|
||||
var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();
|
||||
|
||||
if (words.Length == 0)
|
||||
{
|
||||
@@ -303,7 +296,7 @@
|
||||
return normalisedBB.Bottom >= p.LowerBound && normalisedBB.Top <= p.UpperBound;
|
||||
}));
|
||||
|
||||
var newLeaves = newLeavesEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e));
|
||||
var newLeaves = newLeavesEnums.Where(e => e.Any()).Select(e => new XYLeaf(e));
|
||||
var newNodes = newLeaves.Select(l => VerticalCut(l, minimumWidth,
|
||||
dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
|
||||
|
||||
@@ -335,5 +328,51 @@
|
||||
return value >= LowerBound && value <= UpperBound;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Recursive X-Y cut page segmenter options.
/// </summary>
public class RecursiveXYCutOptions : PageSegmenterOptions
{
    /// <summary>
    /// The minimum width for a block.
    /// <para>Default value is 1.</para>
    /// </summary>
    public double MinimumWidth { get; set; } = 1;

    /// <summary>
    /// The function that determines the dominant font width.
    /// <para>Default value is the mode of the block's letters width, where each letter's width is
    /// the larger of its <c>Width</c> and its <c>GlyphRectangle.Width</c>, both rounded to 3 decimals.
    /// If the mode is not available (NaN or 0), the average is used.</para>
    /// </summary>
    public Func<IEnumerable<Letter>, double> DominantFontWidthFunc { get; set; } =
        (letters) => ModeOrAverage(
            letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3))));

    /// <summary>
    /// The function that determines the dominant font height.
    /// <para>Default value is the mode of the block's letters height (<c>GlyphRectangle.Height</c>
    /// rounded to 3 decimals) times 1.5.
    /// If the mode is not available (NaN or 0), the average is used.</para>
    /// </summary>
    public Func<IEnumerable<Letter>, double> DominantFontHeightFunc { get; set; } =
        (letters) => ModeOrAverage(letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3))) * 1.5;

    /// <summary>
    /// Returns the mode of <paramref name="values"/>, falling back to their average when the mode
    /// is NaN or 0.
    /// </summary>
    /// <param name="values">The measurements to summarise.</param>
    /// <returns>The mode, or the average when the mode is NaN or 0.</returns>
    private static double ModeOrAverage(IEnumerable<double> values)
    {
        // Materialise once: the incoming query is deferred, and both Mode() and Average()
        // would otherwise re-run the projection over every letter.
        var measurements = values.ToList();

        // Mode() is an extension method defined elsewhere in the project.
        var mode = measurements.Mode();
        if (double.IsNaN(mode) || mode == 0)
        {
            mode = measurements.Average();
        }

        return mode;
    }
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user