2019-06-16 13:57:30 +01:00
|
|
|
|
using System;
|
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
|
using System.Linq;
|
|
|
|
|
|
using UglyToad.PdfPig.Content;
|
|
|
|
|
|
using UglyToad.PdfPig.Geometry;
|
|
|
|
|
|
|
|
|
|
|
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|
|
|
|
|
{
|
|
|
|
|
|
/// <summary>
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
/// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
/// </summary>
public class RecursiveXYCut
{
    /// <summary>
    /// Get the blocks, using constant dominant font dimensions for every cut.
    /// </summary>
    /// <param name="pageWords">The words in the page.</param>
    /// <param name="minimumWidth">The minimum width for a block.</param>
    /// <param name="dominantFontWidth">The dominant font width.</param>
    /// <param name="dominantFontHeight">The dominant font height.</param>
    /// <returns>The root of the tree of blocks.</returns>
    public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
        decimal dominantFontWidth, decimal dominantFontHeight)
    {
        // Wrap the constants in functions that ignore their input.
        return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight);
    }

    /// <summary>
    /// Get the blocks. The page is first wrapped in a single leaf, which is then cut
    /// vertically and horizontally in alternation.
    /// </summary>
    /// <param name="pageWords">The words in the page.</param>
    /// <param name="minimumWidth">The minimum width for a block.</param>
    /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.
    /// It receives the absolute glyph widths of the letters being considered.</param>
    /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.
    /// It receives the absolute glyph heights of the letters being considered.</param>
    /// <returns>The root of the tree of blocks.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pageWords"/> is null.</exception>
    public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
        Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
        Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
    {
        if (pageWords == null)
        {
            throw new ArgumentNullException(nameof(pageWords));
        }

        var root = new XYLeaf(pageWords);
        return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
    }

    /// <summary>
    /// Projects the words of the leaf on the horizontal axis and splits the leaf wherever the gap
    /// between two consecutive projections is wider than the dominant font width. Each resulting
    /// group is then passed to <see cref="HorizontalCut"/>.
    /// </summary>
    /// <param name="leaf">The leaf to cut.</param>
    /// <param name="minimumWidth">The minimum width for a block.</param>
    /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
    /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.
    /// Not evaluated here, only forwarded to the horizontal cut.</param>
    /// <param name="level">Counter forwarded to the horizontal cut, which uses it to stop the recursion.</param>
    private static XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth,
        Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
        Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
    {
        if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
        {
            // we stop cutting if
            // - only one word remains
            // - width is too small
            return leaf;
        }

        // order words left to right, ignoring whitespace-only words
        var words = leaf.Words
            .Where(w => !string.IsNullOrWhiteSpace(w.Text))
            .OrderBy(w => w.BoundingBox.Left)
            .ToArray();

        if (words.Length == 0)
        {
            // only whitespace words: there is nothing to project, so nothing to cut
            return leaf;
        }

        // determine the dominant font width (the height is not needed for a vertical cut)
        decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
            .Select(x => Math.Abs(x.GlyphRectangle.Width)));

        // each entry is a [left, right] interval on the horizontal axis
        var projectionProfile = new List<decimal[]>();
        decimal[] currentProj = { words[0].BoundingBox.Left, words[0].BoundingBox.Right };
        int wordsCount = words.Length;

        for (int i = 1; i < wordsCount; i++)
        {
            decimal left = words[i].BoundingBox.Left;
            decimal right = words[i].BoundingBox.Right;

            bool leftInside = left >= currentProj[0] && left <= currentProj[1];
            bool rightInside = right >= currentProj[0] && right <= currentProj[1];

            if (leftInside || rightInside)
            {
                // it is overlapping
                if (leftInside && right > currentProj[1])
                {
                    // |____|
                    //    |____|
                    // |_______|    <- updated
                    currentProj[1] = right;
                }

                // every other overlapping case is ignored: the word either lies fully
                // inside the current projection, or cannot occur because of the OrderBy
            }
            else if (left - currentProj[1] <= domFontWidth
                     || currentProj[1] - currentProj[0] < minimumWidth)
            {
                // no overlap, but either the gap is too small or the current
                // projection is still narrower than the minimum width -> don't cut
                // |____| |____|
                currentProj[1] = right;
            }
            else if (i != wordsCount - 1)
            {
                // gap big enough -> cut!
                // |____|   |   |____|
                projectionProfile.Add(currentProj);
                currentProj = new[] { left, right };
            }

            // The current projection is always added at the last word. When the last word
            // itself is separated by a big gap it is not part of any projection, and is
            // recovered by the 'lost' pass below.
            if (i == wordsCount - 1)
            {
                projectionProfile.Add(currentProj);
            }
        }

        // Materialise the groups once: they are read again to find the lost words.
        var groups = projectionProfile
            .Select(p => leaf.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1]).ToList())
            .Where(g => g.Count > 0)
            .ToList();

        var newNodes = groups
            .Select(g => HorizontalCut(new XYLeaf(g), minimumWidth,
                dominantFontWidthFunc, dominantFontHeightFunc, level))
            .ToList();

        // non-whitespace words that did not fit in any projection each get their own leaf
        var lost = leaf.Words
            .Except(groups.SelectMany(g => g))
            .Where(w => !string.IsNullOrWhiteSpace(w.Text))
            .ToList();
        if (lost.Count > 0)
        {
            newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
        }

        return new XYNode(newNodes);
    }

    /// <summary>
    /// Projects the words of the leaf on the vertical axis and splits the leaf wherever the gap
    /// between two consecutive projections is taller than the dominant font height. Each resulting
    /// group is then passed to <see cref="VerticalCut"/>.
    /// </summary>
    /// <param name="leaf">The leaf to cut.</param>
    /// <param name="minimumWidth">The minimum width for a block. Only forwarded to the vertical cut.</param>
    /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.
    /// Not evaluated here, only forwarded to the vertical cut.</param>
    /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
    /// <param name="level">When no horizontal cut is found, the leaf is returned as is if this is
    /// at least 1; otherwise it is incremented and the recursion continues once more.</param>
    private static XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth,
        Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
        Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
    {
        if (leaf.CountWords() <= 1)
        {
            // we stop cutting if
            // - only one word remains
            return leaf;
        }

        // order words bottom to top, ignoring whitespace-only words
        var words = leaf.Words
            .Where(w => !string.IsNullOrWhiteSpace(w.Text))
            .OrderBy(w => w.BoundingBox.Bottom)
            .ToArray();

        if (words.Length == 0)
        {
            // only whitespace words: there is nothing to project, so nothing to cut
            return leaf;
        }

        // determine the dominant font height (the width is not needed for a horizontal cut)
        decimal domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters)
            .Select(x => Math.Abs(x.GlyphRectangle.Height)));

        // each entry is a [bottom, top] interval on the vertical axis
        var projectionProfile = new List<decimal[]>();
        decimal[] currentProj = { words[0].BoundingBox.Bottom, words[0].BoundingBox.Top };
        int wordsCount = words.Length;

        for (int i = 1; i < wordsCount; i++)
        {
            decimal bottom = words[i].BoundingBox.Bottom;
            decimal top = words[i].BoundingBox.Top;

            bool bottomInside = bottom >= currentProj[0] && bottom <= currentProj[1];
            bool topInside = top >= currentProj[0] && top <= currentProj[1];

            if (bottomInside || topInside)
            {
                // it is overlapping: extend the projection upwards if needed
                if (bottomInside && top > currentProj[1])
                {
                    currentProj[1] = top;
                }
            }
            else if (bottom - currentProj[1] <= domFontHeight)
            {
                // no overlap, but the gap is too small -> don't cut
                currentProj[1] = top;
            }
            else if (i != wordsCount - 1)
            {
                // gap big enough -> cut!
                projectionProfile.Add(currentProj);
                currentProj = new[] { bottom, top };
            }

            // The current projection is always added at the last word. When the last word
            // itself is separated by a big gap it is not part of any projection, and is
            // recovered by the 'lost' pass below.
            if (i == wordsCount - 1)
            {
                projectionProfile.Add(currentProj);
            }
        }

        if (projectionProfile.Count == 1)
        {
            // no horizontal cut was found
            if (level >= 1)
            {
                return leaf;
            }

            level++;
        }

        // Materialise the groups once: they are read again to find the lost words.
        var groups = projectionProfile
            .Select(p => leaf.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1]).ToList())
            .Where(g => g.Count > 0)
            .ToList();

        var newNodes = groups
            .Select(g => VerticalCut(new XYLeaf(g), minimumWidth,
                dominantFontWidthFunc, dominantFontHeightFunc, level))
            .ToList();

        // non-whitespace words that did not fit in any projection each get their own leaf
        var lost = leaf.Words
            .Except(groups.SelectMany(g => g))
            .Where(w => !string.IsNullOrWhiteSpace(w.Text))
            .ToList();
        if (lost.Count > 0)
        {
            newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
        }

        return new XYNode(newNodes);
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
/// </summary>
public class XYNode
{
    /// <summary>
    /// Returns true if this node is a leaf, false otherwise.
    /// </summary>
    public virtual bool IsLeaf => false;

    /// <summary>
    /// The rectangle completely containing the node.
    /// It is only computed by the constructor when the node has at least one child.
    /// </summary>
    public PdfRectangle BoundingBox { get; set; }

    /// <summary>
    /// The children of the node. Left null when the node is created without children.
    /// </summary>
    public XYNode[] Children { get; set; }

    /// <summary>
    /// Recursively counts the words included in this node.
    /// </summary>
    /// <returns>The total number of words of the leafs below this node, 0 if there are no children.</returns>
    public virtual int CountWords()
    {
        if (Children == null)
        {
            return 0;
        }

        int count = 0;
        RecursiveCount(Children, ref count);
        return count;
    }

    /// <summary>
    /// Recursively gets the leafs (last nodes) of this node.
    /// </summary>
    /// <returns>The leafs below this node, an empty list if there are no children.</returns>
    public virtual List<XYLeaf> GetLeafs()
    {
        var leafs = new List<XYLeaf>();
        if (Children == null || Children.Length == 0)
        {
            return leafs;
        }

        // level 0 is treated as a vertical cut, see RecursiveGetLeafs
        RecursiveGetLeafs(Children, leafs, 0);
        return leafs;
    }

    /// <summary>
    /// Create a new <see cref="XYNode"/>.
    /// </summary>
    /// <param name="children">The children of the node, may be null or empty.</param>
    public XYNode(params XYNode[] children)
        : this(children?.ToList())
    {
    }

    /// <summary>
    /// Create a new <see cref="XYNode"/>. The bounding box is set to the smallest rectangle
    /// containing the bounding boxes of all the children.
    /// </summary>
    /// <param name="children">The children of the node, may be null or empty.</param>
    public XYNode(IEnumerable<XYNode> children)
    {
        if (children == null)
        {
            return;
        }

        // Enumerate the sequence only once, so that Children and BoundingBox
        // are guaranteed to be computed from the same nodes.
        XYNode[] nodes = children.ToArray();
        if (nodes.Length == 0)
        {
            return;
        }

        Children = nodes;
        decimal left = nodes.Min(b => b.BoundingBox.Left);
        decimal right = nodes.Max(b => b.BoundingBox.Right);
        decimal bottom = nodes.Min(b => b.BoundingBox.Bottom);
        decimal top = nodes.Max(b => b.BoundingBox.Top);
        BoundingBox = new PdfRectangle(left, bottom, right, top);
    }

    /// <summary>
    /// Adds to <paramref name="count"/> the words of every leaf found in
    /// <paramref name="children"/> and, recursively, in their descendants.
    /// </summary>
    private void RecursiveCount(IEnumerable<XYNode> children, ref int count)
    {
        // a node created without children has a null Children array
        if (children == null || !children.Any())
        {
            return;
        }

        foreach (XYNode node in children.Where(x => x.IsLeaf))
        {
            count += node.CountWords();
        }

        foreach (XYNode node in children.Where(x => !x.IsLeaf))
        {
            RecursiveCount(node.Children, ref count);
        }
    }

    /// <summary>
    /// Appends to <paramref name="leafs"/> the leafs found in <paramref name="children"/>, then
    /// descends into the other nodes. Even levels visit the non-leaf nodes from left to right,
    /// odd levels from top to bottom.
    /// </summary>
    private void RecursiveGetLeafs(IEnumerable<XYNode> children, List<XYLeaf> leafs, int level)
    {
        // a node created without children has a null Children array
        if (children == null || !children.Any())
        {
            return;
        }

        bool isVerticalCut = level % 2 == 0;

        // the leafs are added first, in the order in which they are stored
        foreach (XYLeaf node in children.Where(x => x.IsLeaf))
        {
            leafs.Add(node);
        }

        level++;

        IEnumerable<XYNode> notLeafs = children.Where(x => !x.IsLeaf);
        notLeafs = isVerticalCut
            ? notLeafs.OrderBy(x => x.BoundingBox.Left).ToList()
            : notLeafs.OrderByDescending(x => x.BoundingBox.Top).ToList();

        foreach (XYNode node in notLeafs)
        {
            RecursiveGetLeafs(node.Children, leafs, level);
        }
    }

    /// <summary>
    /// Returns "Leaf" if this node is a leaf, "Node" otherwise.
    /// </summary>
    public override string ToString()
    {
        return IsLeaf ? "Leaf" : "Node";
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// A Leaf node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
/// </summary>
public class XYLeaf : XYNode
{
    /// <summary>
    /// Returns true if this node is a leaf, false otherwise.
    /// </summary>
    public override bool IsLeaf => true;

    /// <summary>
    /// The words in the leaf.
    /// </summary>
    public Word[] Words { get; set; }

    /// <summary>
    /// The number of words in the leaf.
    /// </summary>
    /// <returns>The length of <see cref="Words"/>, 0 if it is null.</returns>
    public override int CountWords() => Words == null ? 0 : Words.Length;

    /// <summary>
    /// Returns null as a leaf doesn't have leafs.
    /// </summary>
    /// <returns>Always null: callers must check the result before enumerating it.</returns>
    public override List<XYLeaf> GetLeafs()
    {
        return null;
    }

    /// <summary>
    /// Gets the lines of the leaf. Words sharing exactly the same bounding box bottom form
    /// a line, and the lines are returned from the highest bottom to the lowest.
    /// </summary>
    /// <returns>The lines of the leaf, an empty array if <see cref="Words"/> is null.</returns>
    public TextLine[] GetLines()
    {
        if (Words == null)
        {
            // consistent with CountWords, which treats null Words as no words
            return new TextLine[0];
        }

        return Words
            .GroupBy(x => x.BoundingBox.Bottom)
            .OrderByDescending(g => g.Key)
            .Select(g => new TextLine(g.ToList()))
            .ToArray();
    }

    /// <summary>
    /// Create a new <see cref="XYLeaf"/>.
    /// </summary>
    /// <param name="words">The words contained in the leaf.</param>
    public XYLeaf(params Word[] words) : this(words == null ? null : words.ToList())
    {
    }

    /// <summary>
    /// Create a new <see cref="XYLeaf"/>. The bounding box is set to the smallest rectangle
    /// containing the bounding boxes of all the words.
    /// </summary>
    /// <param name="words">The words contained in the leaf.</param>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
    /// <exception cref="InvalidOperationException"><paramref name="words"/> is empty.</exception>
    public XYLeaf(IEnumerable<Word> words) : base(null)
    {
        if (words == null)
        {
            throw new ArgumentNullException(nameof(words));
        }

        // Enumerate the sequence only once: callers may pass a deferred query.
        Word[] items = words.ToArray();
        if (items.Length == 0)
        {
            throw new InvalidOperationException("A leaf must contain at least one word.");
        }

        decimal left = items.Min(b => b.BoundingBox.Left);
        decimal right = items.Max(b => b.BoundingBox.Right);

        decimal bottom = items.Min(b => b.BoundingBox.Bottom);
        decimal top = items.Max(b => b.BoundingBox.Top);

        BoundingBox = new PdfRectangle(left, bottom, right, top);
        Words = items;
    }
}
|
|
|
|
|
|
}
|