diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs
index 5da09106..de0dc77c 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs
@@ -9,9 +9,10 @@
public static class MathExtensions
{
///
- /// Computes the mode of a sequence of float values.
+ /// Computes the mode of a sequence of float values.
///
- /// The array of floats.
+ /// The sequence of floats.
+ /// The mode of the sequence. Returns float.NaN if the sequence has no mode or if it is not unique.
public static float Mode(this IEnumerable array)
{
if (array == null || array.Count() == 0) return float.NaN;
@@ -22,9 +23,10 @@
}
///
- /// Computes the mode of a sequence of decimal values.
+ /// Computes the mode of a sequence of double values.
///
- /// The array of decimal.
+ /// The sequence of doubles.
+ /// The mode of the sequence. Returns double.NaN if the sequence has no mode or if it is not unique.
public static double Mode(this IEnumerable array)
{
if (array == null || array.Count() == 0) return double.NaN;
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs
index 4ea35bcf..e393184e 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs
@@ -39,7 +39,28 @@
/// The minimum width for a block.
public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth)
{
- return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5, 3));
+ return GetBlocks(pageWords, minimumWidth,
+ (letters) =>
+ {
+ var widths = letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3)));
+ var mode = widths.Mode();
+ if (double.IsNaN(mode) || mode == 0)
+ {
+ mode = widths.Average();
+ }
+ return mode;
+ },
+ (letters) =>
+ {
+ var heights = letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3));
+ var mode = heights.Mode();
+ if (double.IsNaN(mode) || mode == 0)
+ {
+ mode = heights.Average();
+ }
+ return mode * 1.5;
+ }
+ );
}
///
@@ -63,8 +84,8 @@
/// The function that determines the dominant font width.
/// The function that determines the dominant font height.
public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth,
- Func, double> dominantFontWidthFunc,
- Func, double> dominantFontHeightFunc)
+ Func, double> dominantFontWidthFunc,
+ Func, double> dominantFontHeightFunc)
{
if (pageWords.Count() == 0) return EmptyArray.Instance;
@@ -89,13 +110,13 @@
}
private XYNode VerticalCut(XYLeaf leaf, double minimumWidth,
- Func, double> dominantFontWidthFunc,
- Func, double> dominantFontHeightFunc, int level = 0)
+ Func, double> dominantFontWidthFunc,
+ Func, double> dominantFontHeightFunc, int level = 0)
{
// Order words left to right
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();
- if (!words.Any())
+ if (words.Length == 0)
{
return new XYNode(null);
}
@@ -112,8 +133,7 @@
}
// Determine dominant font width
- double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
- .Select(x => x.GlyphRectangle.Normalise().Width));
+ double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters));
List projectionProfile = new List();
@@ -197,13 +217,13 @@
}
private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth,
- Func, double> dominantFontWidthFunc,
- Func, double> dominantFontHeightFunc, int level = 0)
+ Func, double> dominantFontWidthFunc,
+ Func, double> dominantFontHeightFunc, int level = 0)
{
// Order words bottom to top
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();
- if (!words.Any())
+ if (words.Length == 0)
{
return new XYNode(null);
}
@@ -219,14 +239,13 @@
}
// Determine dominant font height
- double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters)
- .Select(x => x.GlyphRectangle.Normalise().Height));
+ double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters));
List projectionProfile = new List();
var firstWordBound = words[0].BoundingBox.Normalise();
Projection currentProjection = new Projection(firstWordBound.Bottom, firstWordBound.Top);
- int wordsCount = words.Count();
+ int wordsCount = words.Length;
for (int i = 1; i < wordsCount; i++)
{