2020-01-10 18:08:33 +00:00
|
|
|
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
|
2019-06-16 13:57:30 +01:00
|
|
|
|
{
|
2020-01-10 18:08:33 +00:00
|
|
|
|
using Content;
|
|
|
|
|
|
using Core;
|
2020-01-04 16:38:18 +00:00
|
|
|
|
using System;
|
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
|
using System.Linq;
|
2020-02-10 13:53:59 +00:00
|
|
|
|
using UglyToad.PdfPig.Geometry;
|
2020-01-04 16:38:18 +00:00
|
|
|
|
|
2020-05-23 20:07:43 +01:00
|
|
|
|
/// <inheritdoc />
|
2019-06-16 13:57:30 +01:00
|
|
|
|
/// <summary>
|
2020-05-23 20:07:43 +01:00
|
|
|
|
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
|
2019-06-16 13:57:30 +01:00
|
|
|
|
/// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
|
|
|
|
|
|
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
|
2019-06-18 20:48:49 +01:00
|
|
|
|
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M. Haralick and Ihsin T. Phillips.</para>
|
2019-06-16 13:57:30 +01:00
|
|
|
|
/// </summary>
|
2019-08-10 16:01:27 +01:00
|
|
|
|
public class RecursiveXYCut : IPageSegmenter
|
2019-06-16 13:57:30 +01:00
|
|
|
|
{
|
2019-08-10 16:01:27 +01:00
|
|
|
|
/// <summary>
/// Backing storage for <see cref="Instance"/>; built once by the static initialiser.
/// </summary>
private static readonly RecursiveXYCut SharedInstance = new RecursiveXYCut();

/// <summary>
/// Gets a shared, ready-to-use Recursive X-Y Cut page segmenter, <see cref="RecursiveXYCut"/>.
/// </summary>
public static RecursiveXYCut Instance => SharedInstance;
|
|
|
|
|
|
|
2020-05-23 20:07:43 +01:00
|
|
|
|
/// <inheritdoc />
/// <summary>
/// Segments the words into blocks using a newly created <see cref="RecursiveXYCutOptions"/>,
/// i.e. the default option values.
/// </summary>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
    => GetBlocks(words, new RecursiveXYCutOptions());
|
|
|
|
|
|
|
2020-05-23 20:07:43 +01:00
|
|
|
|
/// <inheritdoc />
/// <summary>
/// Segments the words into blocks using the supplied option values.
/// </summary>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="options">The <see cref="RecursiveXYCutOptions"/> to use.</param>
/// <returns>
/// The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method,
/// or an empty collection when <paramref name="words"/> is null or contains no element.
/// </returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="options"/> is not a <see cref="RecursiveXYCutOptions"/> (this includes null).
/// </exception>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
{
    // The options type is validated before the words are looked at,
    // so a wrong options type throws even when there is nothing to segment.
    if (!(options is RecursiveXYCutOptions cutOptions))
    {
        throw new ArgumentException("Options provided must be of type " + nameof(RecursiveXYCutOptions) + ".", nameof(options));
    }

    bool nothingToSegment = words == null || !words.Any();
    if (nothingToSegment)
    {
        return EmptyArray<TextBlock>.Instance;
    }

    return GetBlocks(
        words,
        cutOptions.MinimumWidth,
        cutOptions.DominantFontWidthFunc,
        cutOptions.DominantFontHeightFunc,
        cutOptions.WordSeparator,
        cutOptions.LineSeparator);
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Get the blocks: drops whitespace-only words, runs the recursive cut starting with a
/// vertical cut, and turns every resulting leaf into a <see cref="TextBlock"/>.
/// </summary>
/// <param name="words">The words in the page.</param>
/// <param name="minimumWidth">The minimum width for a block.</param>
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
/// <param name="wordSeparator">The separator passed to each leaf's <c>GetLines</c> when building its lines.</param>
/// <param name="lineSeparator">The separator passed to the <see cref="TextBlock"/> constructor together with the lines.</param>
/// <returns>One <see cref="TextBlock"/> per leaf of the cut tree; empty when no non-whitespace word remains.</returns>
private IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, double minimumWidth,
    Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
    Func<IEnumerable<Letter>, double> dominantFontHeightFunc,
    string wordSeparator, string lineSeparator)
{
    // Filter out white spaces. The result is materialised once so the filter is not
    // re-run (and the caller's sequence not re-enumerated) by every later consumer.
    words = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();
    if (!words.Any())
    {
        return EmptyArray<TextBlock>.Instance;
    }

    XYLeaf root = new XYLeaf(words); // Create a root node.
    XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);

    if (node.IsLeaf)
    {
        // No cut was made: the whole input is a single block.
        // Direct cast so that an unexpected node type fails with a clear InvalidCastException
        // instead of a NullReferenceException.
        var singleLeaf = (XYLeaf)node;
        return new List<TextBlock> { new TextBlock(singleLeaf.GetLines(wordSeparator), lineSeparator) };
    }

    // One block per leaf; yields an empty list when the tree has no leaves.
    return node.GetLeaves()
        .Select(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator))
        .ToList();
}
|
|
|
|
|
|
|
2019-12-21 18:09:49 +00:00
|
|
|
|
/// <summary>
/// Splits a leaf into side-by-side groups: the words' normalised bounding boxes are projected
/// onto the X axis, merged into intervals, and a cut is made wherever the horizontal gap between
/// an interval and the next word is larger than the dominant font width (and the interval is at
/// least <paramref name="minimumWidth"/> wide). Each group is then passed to <c>HorizontalCut</c>.
/// </summary>
/// <param name="leaf">The leaf whose words are to be cut.</param>
/// <param name="minimumWidth">The minimum width for a block; narrower leaves are returned uncut.</param>
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height (forwarded to the horizontal cut).</param>
/// <param name="level">Recursion marker, forwarded unchanged to the horizontal cut.</param>
/// <returns>
/// The leaf itself when cutting stops, otherwise a node holding the results of the horizontal
/// cuts; a node built from null when the leaf has no words.
/// </returns>
private XYNode VerticalCut(XYLeaf leaf, double minimumWidth,
    Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
    Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
{
    // Order words left to right
    var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();

    if (words.Length == 0)
    {
        return new XYNode(null);
    }

    // Rebuild the leaf from the ordered words (no filtering happens here).
    leaf = new XYLeaf(words);

    if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
    {
        // We stop cutting if
        // - only one word remains
        // - width is too small
        return leaf;
    }

    // Determine dominant font width
    double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters));

    List<Projection> projectionProfile = new List<Projection>();

    var firstWordBound = words[0].BoundingBox.Normalise();
    Projection currentProjection = new Projection(firstWordBound.Left, firstWordBound.Right);
    int wordsCount = words.Length;

    for (int i = 1; i < wordsCount; i++)
    {
        var currentWordBound = words[i].BoundingBox.Normalise();

        if (currentProjection.Contains(currentWordBound.Left) || currentProjection.Contains(currentWordBound.Right))
        {
            // It is overlapping
            if (currentWordBound.Left >= currentProjection.LowerBound
                && currentWordBound.Left <= currentProjection.UpperBound
                && currentWordBound.Right > currentProjection.UpperBound)
            {
                // |____|
                //    |____|
                // |_______|    <- updated
                currentProjection.UpperBound = currentWordBound.Right;
            }

            // Every other overlapping case is ignored: a word starting left of the projection
            // is not possible because of the OrderBy, and a word fully inside the projection
            // does not change it.
        }
        else
        {
            // No overlap
            bool gapTooSmall = currentWordBound.Left - currentProjection.UpperBound <= dominantFontWidth;
            bool projectionTooSmall = currentProjection.UpperBound - currentProjection.LowerBound < minimumWidth;

            if (gapTooSmall || projectionTooSmall)
            {
                // If gap too small, or the projection is still narrower than the minimum -> don't cut
                // |____| |____|
                currentProjection.UpperBound = currentWordBound.Right;
            }
            else if (i != wordsCount - 1) // Will always add the last one after
            {
                // If gap big enough -> cut!
                // |____|   |   |____|
                projectionProfile.Add(currentProjection);
                currentProjection = new Projection(currentWordBound.Left, currentWordBound.Right);
            }
        }

        if (i == wordsCount - 1)
        {
            // The running projection is always recorded on the last word. When that word sits
            // behind a cut-sized gap it is not part of this projection; it is recovered by the
            // 'lost' words step below.
            projectionProfile.Add(currentProjection);
        }
    }

    // Get words that are contained in each projection profile. Materialised once: the groups
    // are read several times below and each read would otherwise re-normalise every bounding box.
    List<IEnumerable<Word>> wordsPerProjection = projectionProfile
        .Select<Projection, IEnumerable<Word>>(p => leaf.Words.Where(w =>
        {
            var normalisedBB = w.BoundingBox.Normalise();
            return normalisedBB.Left >= p.LowerBound && normalisedBB.Right <= p.UpperBound;
        }).ToList())
        .ToList();

    var newNodes = wordsPerProjection
        .Where(group => group.Any())
        .Select(group => HorizontalCut(new XYLeaf(group), minimumWidth,
            dominantFontWidthFunc, dominantFontHeightFunc, level))
        .ToList();

    // Words that did not fall inside any projection each become their own leaf.
    var lost = leaf.Words
        .Except(wordsPerProjection.SelectMany(x => x))
        .Where(x => !string.IsNullOrWhiteSpace(x.Text))
        .ToList();
    if (lost.Count > 0)
    {
        newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
    }

    return new XYNode(newNodes);
}
|
|
|
|
|
|
|
2019-12-21 18:09:49 +00:00
|
|
|
|
/// <summary>
/// Splits a leaf into stacked groups: the words' normalised bounding boxes are projected onto
/// the Y axis, merged into intervals, and a cut is made wherever the vertical gap between an
/// interval and the next word is larger than the dominant font height. Each group is then
/// passed to <c>VerticalCut</c>.
/// </summary>
/// <param name="leaf">The leaf whose words are to be cut.</param>
/// <param name="minimumWidth">The minimum width for a block (forwarded to the vertical cut).</param>
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width (forwarded to the vertical cut).</param>
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
/// <param name="level">
/// Recursion marker: incremented the first time this cut yields a single projection; when it is
/// already 1 or more and a single projection is found again, the leaf is returned uncut.
/// </param>
/// <returns>
/// The leaf itself when cutting stops, otherwise a node holding the results of the vertical
/// cuts; a node built from null when the leaf has no words.
/// </returns>
private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth,
    Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
    Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
{
    // Order words bottom to top
    var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();

    if (words.Length == 0)
    {
        return new XYNode(null);
    }

    // Rebuild the leaf from the ordered words (no filtering happens here).
    leaf = new XYLeaf(words);

    if (leaf.CountWords() <= 1)
    {
        // We stop cutting if
        // - only one word remains
        return leaf;
    }

    // Determine dominant font height
    double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters));

    List<Projection> projectionProfile = new List<Projection>();

    var firstWordBound = words[0].BoundingBox.Normalise();
    Projection currentProjection = new Projection(firstWordBound.Bottom, firstWordBound.Top);
    int wordsCount = words.Length;

    for (int i = 1; i < wordsCount; i++)
    {
        var currentWordBound = words[i].BoundingBox.Normalise();

        if (currentProjection.Contains(currentWordBound.Bottom) || currentProjection.Contains(currentWordBound.Top))
        {
            // It is overlapping: only grow the projection when the word reaches above it.
            if (currentWordBound.Bottom >= currentProjection.LowerBound
                && currentWordBound.Bottom <= currentProjection.UpperBound
                && currentWordBound.Top > currentProjection.UpperBound)
            {
                currentProjection.UpperBound = currentWordBound.Top;
            }
        }
        else
        {
            // No overlap
            if (currentWordBound.Bottom - currentProjection.UpperBound <= dominantFontHeight)
            {
                // If gap too small -> don't cut
                // |____| |____|
                currentProjection.UpperBound = currentWordBound.Top;
            }
            else if (i != wordsCount - 1) // Will always add the last one after
            {
                // If gap big enough -> cut!
                // |____|   |   |____|
                projectionProfile.Add(currentProjection);
                currentProjection = new Projection(currentWordBound.Bottom, currentWordBound.Top);
            }
        }

        if (i == wordsCount - 1)
        {
            // The running projection is always recorded on the last word. When that word sits
            // behind a cut-sized gap it is not part of this projection; it is recovered by the
            // 'lost' words step below.
            projectionProfile.Add(currentProjection);
        }
    }

    if (projectionProfile.Count == 1)
    {
        // Nothing to cut in this direction.
        if (level >= 1)
        {
            return leaf;
        }

        // Allow one more vertical cut attempt before giving up on this leaf.
        level++;
    }

    // Get words that are contained in each projection profile. Materialised once: the groups
    // are read several times below and each read would otherwise re-normalise every bounding box.
    List<IEnumerable<Word>> wordsPerProjection = projectionProfile
        .Select<Projection, IEnumerable<Word>>(p => leaf.Words.Where(w =>
        {
            var normalisedBB = w.BoundingBox.Normalise();
            return normalisedBB.Bottom >= p.LowerBound && normalisedBB.Top <= p.UpperBound;
        }).ToList())
        .ToList();

    var newNodes = wordsPerProjection
        .Where(group => group.Any())
        .Select(group => VerticalCut(new XYLeaf(group), minimumWidth,
            dominantFontWidthFunc, dominantFontHeightFunc, level))
        .ToList();

    // Words that did not fall inside any projection each become their own leaf.
    var lost = leaf.Words
        .Except(wordsPerProjection.SelectMany(x => x))
        .Where(x => !string.IsNullOrWhiteSpace(x.Text))
        .ToList();
    if (lost.Count > 0)
    {
        newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
    }

    return new XYNode(newNodes);
}
|
2020-02-11 10:04:04 +00:00
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// A closed interval [<see cref="LowerBound"/>, <see cref="UpperBound"/>] on one axis,
/// used to accumulate the extent of overlapping or nearby words.
/// </summary>
private struct Projection
{
    /// <summary>
    /// Creates a projection spanning the given bounds.
    /// </summary>
    /// <param name="lowerBound">The start of the interval.</param>
    /// <param name="upperBound">The end of the interval.</param>
    public Projection(double lowerBound, double upperBound)
    {
        LowerBound = lowerBound;
        UpperBound = upperBound;
    }

    /// <summary>
    /// The start of the interval.
    /// </summary>
    public double LowerBound { get; set; }

    /// <summary>
    /// The end of the interval.
    /// </summary>
    public double UpperBound { get; set; }

    /// <summary>
    /// Returns true if the value is greater or equal to the lower bound and smaller or equal to the upper bound.
    /// </summary>
    /// <param name="value">The value to test.</param>
    public bool Contains(double value) => LowerBound <= value && value <= UpperBound;
}
|
2020-05-23 20:07:43 +01:00
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Recursive X-Y cut page segmenter options.
/// </summary>
public class RecursiveXYCutOptions : PageSegmenterOptions
{
    /// <summary>
    /// The minimum width for a block.
    /// <para>Default value is 1.</para>
    /// </summary>
    public double MinimumWidth { get; set; } = 1;

    /// <summary>
    /// The function that determines the dominant font width.
    /// <para>Default value is the mode of the block's letters width, where each letter's width is
    /// the larger of its width and its glyph rectangle width, both rounded to 3 decimals.
    /// If the mode is not a number or is 0, the average is used.</para>
    /// </summary>
    public Func<IEnumerable<Letter>, double> DominantFontWidthFunc { get; set; } =
        (letters) =>
        {
            var letterWidths = letters.Select(letter =>
                Math.Max(Math.Round(letter.Width, 3), Math.Round(letter.GlyphRectangle.Width, 3)));

            var dominantWidth = letterWidths.Mode();
            bool modeUnavailable = double.IsNaN(dominantWidth) || dominantWidth == 0;

            return modeUnavailable ? letterWidths.Average() : dominantWidth;
        };

    /// <summary>
    /// The function that determines the dominant font height.
    /// <para>Default value is the mode of the block's letters glyph rectangle height
    /// (rounded to 3 decimals) times 1.5.
    /// If the mode is not a number or is 0, the average times 1.5 is used.</para>
    /// </summary>
    public Func<IEnumerable<Letter>, double> DominantFontHeightFunc { get; set; } =
        (letters) =>
        {
            var letterHeights = letters.Select(letter => Math.Round(letter.GlyphRectangle.Height, 3));

            var dominantHeight = letterHeights.Mode();
            bool modeUnavailable = double.IsNaN(dominantHeight) || dominantHeight == 0;

            return (modeUnavailable ? letterHeights.Average() : dominantHeight) * 1.5;
        };
}
|
2019-06-16 13:57:30 +01:00
|
|
|
|
}
|
2020-05-23 20:07:43 +01:00
|
|
|
|
}
|