Improve default RecursiveXYCut dominant font width and height functions

This commit is contained in:
BobLd
2020-04-12 19:49:30 +01:00
parent 3833fff28f
commit 395c5a7fd9
2 changed files with 39 additions and 18 deletions

View File

@@ -9,9 +9,10 @@
public static class MathExtensions public static class MathExtensions
{ {
/// <summary> /// <summary>
/// Computes the mode of a sequence of float values. /// Computes the mode of a sequence of <see cref="float"/> values.
/// </summary> /// </summary>
/// <param name="array">The array of floats.</param> /// <param name="array">The sequence of floats.</param>
/// <returns>The mode of the sequence. Returns <see cref="float.NaN"/> if the sequence has no mode or if it is not unique.</returns>
public static float Mode(this IEnumerable<float> array) public static float Mode(this IEnumerable<float> array)
{ {
if (array == null || array.Count() == 0) return float.NaN; if (array == null || array.Count() == 0) return float.NaN;
@@ -22,9 +23,10 @@
} }
/// <summary> /// <summary>
/// Computes the mode of a sequence of decimal values. /// Computes the mode of a sequence of <see cref="double"/> values.
/// </summary> /// </summary>
/// <param name="array">The array of decimal.</param> /// <param name="array">The sequence of doubles.</param>
/// <returns>The mode of the sequence. Returns <see cref="double.NaN"/> if the sequence has no mode or if it is not unique.</returns>
public static double Mode(this IEnumerable<double> array) public static double Mode(this IEnumerable<double> array)
{ {
if (array == null || array.Count() == 0) return double.NaN; if (array == null || array.Count() == 0) return double.NaN;

View File

@@ -39,7 +39,28 @@
/// <param name="minimumWidth">The minimum width for a block.</param> /// <param name="minimumWidth">The minimum width for a block.</param>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth)
{
    // Default dominant font width: the most frequent letter width, where a letter's width is the
    // larger of its Width and its glyph rectangle width, each rounded to 3 decimals.
    // Default dominant font height: 1.5 times the most frequent glyph rectangle height (rounded to 3 decimals).
    return GetBlocks(pageWords, minimumWidth,
        letters => ModeOrAverage(letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3)))),
        letters => ModeOrAverage(letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3))) * 1.5);
}

/// <summary>
/// Returns the mode of the values, falling back to their average when the mode is
/// <see cref="double.NaN"/> (no mode, or the mode is not unique) or equal to zero.
/// </summary>
/// <param name="values">The values to summarise.</param>
/// <returns>The mode of the values, or their average when the mode is unusable.</returns>
private static double ModeOrAverage(IEnumerable<double> values)
{
    // Materialise once: the incoming sequence is a deferred LINQ projection, and both the mode
    // and the average computations would otherwise re-run that projection on every enumeration.
    var materialised = values.ToArray();

    var mode = materialised.Mode();
    if (double.IsNaN(mode) || mode == 0)
    {
        mode = materialised.Average();
    }

    return mode;
}
/// <summary> /// <summary>
@@ -63,8 +84,8 @@
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param> /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param> /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth, public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth,
Func<IEnumerable<double>, double> dominantFontWidthFunc, Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
Func<IEnumerable<double>, double> dominantFontHeightFunc) Func<IEnumerable<Letter>, double> dominantFontHeightFunc)
{ {
if (pageWords.Count() == 0) return EmptyArray<TextBlock>.Instance; if (pageWords.Count() == 0) return EmptyArray<TextBlock>.Instance;
@@ -89,13 +110,13 @@
} }
private XYNode VerticalCut(XYLeaf leaf, double minimumWidth, private XYNode VerticalCut(XYLeaf leaf, double minimumWidth,
Func<IEnumerable<double>, double> dominantFontWidthFunc, Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
Func<IEnumerable<double>, double> dominantFontHeightFunc, int level = 0) Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
{ {
// Order words left to right // Order words left to right
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Left).ToArray(); var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();
if (!words.Any()) if (words.Length == 0)
{ {
return new XYNode(null); return new XYNode(null);
} }
@@ -112,8 +133,7 @@
} }
// Determine dominant font width // Determine dominant font width
double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters));
.Select(x => x.GlyphRectangle.Normalise().Width));
List<Projection> projectionProfile = new List<Projection>(); List<Projection> projectionProfile = new List<Projection>();
@@ -197,13 +217,13 @@
} }
private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth, private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth,
Func<IEnumerable<double>, double> dominantFontWidthFunc, Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
Func<IEnumerable<double>, double> dominantFontHeightFunc, int level = 0) Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
{ {
// Order words bottom to top // Order words bottom to top
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray(); var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();
if (!words.Any()) if (words.Length == 0)
{ {
return new XYNode(null); return new XYNode(null);
} }
@@ -219,14 +239,13 @@
} }
// Determine dominant font height // Determine dominant font height
double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters) double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters));
.Select(x => x.GlyphRectangle.Normalise().Height));
List<Projection> projectionProfile = new List<Projection>(); List<Projection> projectionProfile = new List<Projection>();
var firstWordBound = words[0].BoundingBox.Normalise(); var firstWordBound = words[0].BoundingBox.Normalise();
Projection currentProjection = new Projection(firstWordBound.Bottom, firstWordBound.Top); Projection currentProjection = new Projection(firstWordBound.Bottom, firstWordBound.Top);
int wordsCount = words.Count(); int wordsCount = words.Length;
for (int i = 1; i < wordsCount; i++) for (int i = 1; i < wordsCount; i++)
{ {