diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs
index e393184e..c030c663 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs
@@ -7,8 +7,9 @@
using System.Linq;
using UglyToad.PdfPig.Geometry;
+ ///
///
- /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
+ /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
/// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
/// See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips
@@ -20,81 +21,73 @@
///
public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();
+ ///
///
- /// Get the blocks.
- /// Uses 'minimumWidth' = 0, 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)
+ /// Get the blocks using the default option values.
///
- /// The words in the page.
- ///
- public IReadOnlyList GetBlocks(IEnumerable pageWords)
+ /// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
+ /// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
+ public IReadOnlyList GetBlocks(IEnumerable words)
{
- return GetBlocks(pageWords, 0);
+ return GetBlocks(words, new RecursiveXYCutOptions());
}
+ ///
///
- /// Get the blocks.
- /// Uses 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)
+ /// Get the blocks using the supplied option values.
///
- /// The words in the page.
- /// The minimum width for a block.
- public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth)
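+ /// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
+ /// <param name="options">The <see cref="RecursiveXYCutOptions"/> to use.</param>
+ /// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>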
+ public IReadOnlyList GetBlocks(IEnumerable words, DlaOptions options)
{
- return GetBlocks(pageWords, minimumWidth,
- (letters) =>
+ if (options is RecursiveXYCutOptions ryxcOptions)
+ {
+ if (words?.Any() != true)
{
- var widths = letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3)));
- var mode = widths.Mode();
- if (double.IsNaN(mode) || mode == 0)
- {
- mode = widths.Average();
- }
- return mode;
- },
- (letters) =>
- {
- var heights = letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3));
- var mode = heights.Mode();
- if (double.IsNaN(mode) || mode == 0)
- {
- mode = heights.Average();
- }
- return mode * 1.5;
+ return EmptyArray.Instance;
}
- );
+
+ return GetBlocks(words,
+ ryxcOptions.MinimumWidth,
+ ryxcOptions.DominantFontWidthFunc,
+ ryxcOptions.DominantFontHeightFunc,
+ ryxcOptions.WordSeparator,
+ ryxcOptions.LineSeparator);
+ }
+ else
+ {
+ throw new ArgumentException("Options provided must be of type " + nameof(RecursiveXYCutOptions) + ".", nameof(options));
+ }
}
///
/// Get the blocks.
///
- /// The words in the page.
- /// The minimum width for a block.
- /// The dominant font width.
- /// The dominant font height.
- public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth,
- double dominantFontWidth, double dominantFontHeight)
- {
- return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight);
- }
-
- ///
- /// Get the blocks.
- ///
- /// The words in the page.
+ /// The words in the page.
/// The minimum width for a block.
/// The function that determines the dominant font width.
/// The function that determines the dominant font height.
- public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth,
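+ /// <param name="wordSeparator">The separator passed to <c>GetLines</c> when building the lines of each block.</param>
+ /// <param name="lineSeparator">The separator passed to the <c>TextBlock</c> constructor for each block.</param>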
+ private IReadOnlyList GetBlocks(IEnumerable words, double minimumWidth,
Func, double> dominantFontWidthFunc,
- Func, double> dominantFontHeightFunc)
+ Func, double> dominantFontHeightFunc,
+ string wordSeparator, string lineSeparator)
{
- if (pageWords.Count() == 0) return EmptyArray.Instance;
+ // Filter out white spaces
+ words = words.Where(w => !string.IsNullOrWhiteSpace(w.Text));
+ if (!words.Any())
+ {
+ return EmptyArray.Instance;
+ }
- XYLeaf root = new XYLeaf(pageWords); // Create a root node.
+ XYLeaf root = new XYLeaf(words); // Create a root node.
XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
if (node.IsLeaf)
{
- return new List { new TextBlock((node as XYLeaf).GetLines()) };
+ return new List { new TextBlock((node as XYLeaf).GetLines(wordSeparator), lineSeparator) };
}
else
{
@@ -102,7 +95,7 @@
if (leaves.Count > 0)
{
- return leaves.Select(l => new TextBlock(l.GetLines())).ToList();
+ return leaves.Select(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator)).ToList();
}
}
@@ -114,7 +107,7 @@
Func, double> dominantFontHeightFunc, int level = 0)
{
// Order words left to right
- var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();
+ var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();
if (words.Length == 0)
{
@@ -123,7 +116,7 @@
// Create new leaf with non-whitespace words.
leaf = new XYLeaf(words);
-
+
if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
{
// We stop cutting if
@@ -195,7 +188,7 @@
}
if (i == wordsCount - 1) projectionProfile.Add(currentProjection);
}
-
+
var newLeavesEnums = projectionProfile.Select(p => leaf.Words.Where(w =>
{
// Get words that are contained in each projection profiles
@@ -203,7 +196,7 @@
return normalisedBB.Left >= p.LowerBound && normalisedBB.Right <= p.UpperBound;
}));
- var newLeaves = newLeavesEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e));
+ var newLeaves = newLeavesEnums.Where(e => e.Any()).Select(e => new XYLeaf(e));
var newNodes = newLeaves.Select(l => HorizontalCut(l, minimumWidth,
dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
@@ -221,7 +214,7 @@
Func, double> dominantFontHeightFunc, int level = 0)
{
// Order words bottom to top
- var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();
+ var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();
if (words.Length == 0)
{
@@ -303,7 +296,7 @@
return normalisedBB.Bottom >= p.LowerBound && normalisedBB.Top <= p.UpperBound;
}));
- var newLeaves = newLeavesEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e));
+ var newLeaves = newLeavesEnums.Where(e => e.Any()).Select(e => new XYLeaf(e));
var newNodes = newLeaves.Select(l => VerticalCut(l, minimumWidth,
dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
@@ -335,5 +328,51 @@
return value >= LowerBound && value <= UpperBound;
}
}
+
+ /// <summary>
+ /// Recursive X-Y cut page segmenter options.
+ /// </summary>
+ public class RecursiveXYCutOptions : PageSegmenterOptions
+ {
+ /// <summary>
+ /// The minimum width for a block.
+ /// <para>Default value is 1.</para>
+ /// </summary>
+ public double MinimumWidth { get; set; } = 1;
+
+ /// <summary>
+ /// The function that determines the dominant font width.
+ /// <para>Default value is the mode of the letters' widths, where each width is the larger of Width and GlyphRectangle.Width, both rounded to 3 decimals.</para>
+ /// <para>If the mode is NaN or 0, the average of those widths is used instead.</para>
+ /// </summary>
+ public Func<IEnumerable<Letter>, double> DominantFontWidthFunc { get; set; } =
+ (letters) =>
+ {
+ var widths = letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3))).ToList(); // materialise once: read by Mode() and possibly Average()
+ var mode = widths.Mode();
+ if (double.IsNaN(mode) || mode == 0)
+ {
+ mode = widths.Average();
+ }
+ return mode;
+ };
+
+ /// <summary>
+ /// The function that determines the dominant font height.
+ /// <para>Default value is 1.5 times the mode of the letters' GlyphRectangle heights, rounded to 3 decimals.</para>
+ /// <para>If the mode is NaN or 0, the average of those heights is used instead (still multiplied by 1.5).</para>
+ /// </summary>
+ public Func<IEnumerable<Letter>, double> DominantFontHeightFunc { get; set; } =
+ (letters) =>
+ {
+ var heights = letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3)).ToList(); // materialise once: read by Mode() and possibly Average()
+ var mode = heights.Mode();
+ if (double.IsNaN(mode) || mode == 0)
+ {
+ mode = heights.Average();
+ }
+ return mode * 1.5;
+ };
+ }
}
-}
+}
\ No newline at end of file