diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 662780b3..02e38938 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -51,6 +51,10 @@
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.NNWordExtractor",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
new file mode 100644
index 00000000..d5ad4ea4
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
@@ -0,0 +1,111 @@
+using System;
+using System.Linq;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Contains helpful tools for distance measures.
+    /// </summary>
+    public static class Distances
+    {
+        /// <summary>
+        /// The Euclidean distance is the "ordinary" straight-line distance between two points.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        /// <returns>The square root of the sum of the squared coordinate differences.</returns>
+        public static double Euclidean(PdfPoint point1, PdfPoint point2)
+        {
+            double dx = (double)(point1.X - point2.X);
+            double dy = (double)(point1.Y - point2.Y);
+            return Math.Sqrt(dx * dx + dy * dy);
+        }
+
+        /// <summary>
+        /// The weighted Euclidean distance.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        /// <param name="wX">The weight of the X coordinates. Default is 1.</param>
+        /// <param name="wY">The weight of the Y coordinates. Default is 1.</param>
+        /// <returns>The square root of the weighted sum of the squared coordinate differences.</returns>
+        public static double WghtdEuclidean(PdfPoint point1, PdfPoint point2, double wX = 1.0, double wY = 1.0)
+        {
+            double dx = (double)(point1.X - point2.X);
+            double dy = (double)(point1.Y - point2.Y);
+            return Math.Sqrt(wX * dx * dx + wY * dy * dy);
+        }
+
+        /// <summary>
+        /// The Manhattan distance between two points is the sum of the absolute differences of their Cartesian coordinates.
+        /// <para>Also known as rectilinear distance, L1 distance, L1 norm, snake distance, city block distance, taxicab metric.</para>
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        /// <returns>The sum of the absolute coordinate differences.</returns>
+        public static double Manhattan(PdfPoint point1, PdfPoint point2)
+        {
+            return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y));
+        }
+
+        /// <summary>
+        /// Find the nearest point.
+        /// </summary>
+        /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
+        /// <param name="points">The list of neighbours candidates.</param>
+        /// <param name="measure">The distance measure to use. It receives the candidate first and the reference point second.</param>
+        /// <param name="dist">The distance between reference point, and its nearest neighbour</param>
+        /// <returns>The first candidate having the smallest distance to the reference point.</returns>
+        /// <exception cref="ArgumentNullException">If <paramref name="points"/> or <paramref name="measure"/> is null.</exception>
+        /// <exception cref="InvalidOperationException">If <paramref name="points"/> is empty, or if no distance is a number.</exception>
+        public static PdfPoint FindNearest(this PdfPoint pdfPoint, PdfPoint[] points,
+            Func<PdfPoint, PdfPoint, double> measure, out double dist)
+        {
+            // Share the single-pass search so both methods always agree on the winner.
+            int index = FindIndexNearest(pdfPoint, points, measure, out dist);
+            if (index < 0)
+            {
+                throw new InvalidOperationException("No candidate point has a comparable distance.");
+            }
+
+            return points[index];
+        }
+
+        /// <summary>
+        /// Find the index of the nearest point.
+        /// </summary>
+        /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
+        /// <param name="points">The list of neighbours candidates.</param>
+        /// <param name="measure">The distance measure to use. It receives the candidate first and the reference point second.</param>
+        /// <param name="dist">The distance between reference point, and its nearest neighbour; NaN when -1 is returned.</param>
+        /// <returns>The index of the first candidate having the smallest distance, or -1 if no distance is a number.</returns>
+        /// <exception cref="ArgumentNullException">If <paramref name="points"/> or <paramref name="measure"/> is null.</exception>
+        /// <exception cref="InvalidOperationException">If <paramref name="points"/> is empty.</exception>
+        public static int FindIndexNearest(this PdfPoint pdfPoint, PdfPoint[] points,
+            Func<PdfPoint, PdfPoint, double> measure, out double dist)
+        {
+            if (points == null) throw new ArgumentNullException(nameof(points));
+            if (measure == null) throw new ArgumentNullException(nameof(measure));
+            if (points.Length == 0) throw new InvalidOperationException("The list of candidate points is empty.");
+
+            // Single pass: the measure is evaluated once per candidate and no floating point equality is needed.
+            int nearest = -1;
+            dist = double.NaN;
+            for (int i = 0; i < points.Length; i++)
+            {
+                double current = measure(points[i], pdfPoint);
+                if (double.IsNaN(current)) continue; // NaN compares false with everything: ignore it
+
+                // Strict comparison keeps the first candidate when several are equally near.
+                if (nearest < 0 || current < dist)
+                {
+                    nearest = i;
+                    dist = current;
+                }
+            }
+
+            return nearest;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs
new file mode 100644
index 00000000..a32fc576
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs
@@ -0,0 +1,38 @@
+using System.Collections.Generic;
+using System.Linq;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Useful math extensions.
+    /// </summary>
+    public static class MathExtensions
+    {
+        /// <summary>
+        /// Computes the mode of a sequence of float values.
+        /// </summary>
+        /// <param name="array">The values; the sequence is enumerated once.</param>
+        /// <returns>The most frequent value, the first one encountered winning a tie,
+        /// or <see cref="float.NaN"/> if the sequence is null or empty.</returns>
+        public static float Mode(this IEnumerable<float> array)
+        {
+            if (array == null) return float.NaN;
+
+            // FirstOrDefault replaces a separate Count() call, which enumerated the source a second time.
+            var mostFrequent = array.GroupBy(v => v).OrderByDescending(g => g.Count()).FirstOrDefault();
+            return mostFrequent == null ? float.NaN : mostFrequent.Key;
+        }
+
+        /// <summary>
+        /// Computes the mode of a sequence of decimal values.
+        /// </summary>
+        /// <param name="array">The values.</param>
+        /// <returns>The most frequent value, the first one encountered winning a tie.</returns>
+        /// <exception cref="System.ArgumentNullException">If the sequence is null.</exception>
+        /// <exception cref="System.InvalidOperationException">If the sequence is empty (decimal has no NaN to fall back on).</exception>
+        public static decimal Mode(this IEnumerable<decimal> array)
+        {
+            return array.GroupBy(v => v).OrderByDescending(g => g.Count()).First().Key;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs
new file mode 100644
index 00000000..5da1efbf
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs
@@ -0,0 +1,219 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+using UglyToad.PdfPig.Util;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Nearest Neighbour Word Extractor, using the <see cref="Distances.Manhattan"/> distance.
+    /// This implementation leverages bounding boxes.
+    /// </summary>
+    public class NNWordExtractor : IWordExtractor
+    {
+        /// <summary>
+        /// Two letters are only joined when the distance between them is below this
+        /// share of the larger of their two metrics.
+        /// </summary>
+        private const decimal MaximumDistanceRatio = 0.60m;
+
+        /// <summary>
+        /// Create an instance of Nearest Neighbour Word Extractor, <see cref="NNWordExtractor"/>.
+        /// </summary>
+        public static IWordExtractor Instance { get; } = new NNWordExtractor();
+
+        /// <summary>
+        /// Gets the words.
+        /// </summary>
+        /// <param name="letters">The letters in the page.</param>
+        /// <returns>The words of each text direction in turn: Horizontal, Rotate180, Rotate90,
+        /// Rotate270 and finally Unknown.</returns>
+        public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
+        {
+            if (letters == null) throw new ArgumentNullException(nameof(letters));
+
+            List<Word> wordsH = GetWords(
+                letters.Where(l => l.TextDirection == TextDirection.Horizontal),
+                l => l.GlyphRectangle.Width, Distances.Manhattan)
+                .OrderByDescending(x => x.BoundingBox.Bottom)
+                .ThenBy(x => x.BoundingBox.Left).ToList();
+
+            List<Word> words180 = GetWords(
+                letters.Where(l => l.TextDirection == TextDirection.Rotate180),
+                l => l.GlyphRectangle.Width, Distances.Manhattan)
+                .OrderBy(x => x.BoundingBox.Top)
+                .ThenByDescending(x => x.BoundingBox.Right).ToList();
+            wordsH.AddRange(words180);
+
+            List<Word> words90 = GetWords(
+                letters.Where(l => l.TextDirection == TextDirection.Rotate90),
+                l => l.GlyphRectangle.Height, Distances.Manhattan)
+                .OrderByDescending(x => x.BoundingBox.Left)
+                .ThenBy(x => x.BoundingBox.Top).ToList();
+            wordsH.AddRange(words90);
+
+            List<Word> words270 = GetWords(
+                letters.Where(l => l.TextDirection == TextDirection.Rotate270),
+                l => l.GlyphRectangle.Height, Distances.Manhattan)
+                .OrderBy(x => x.BoundingBox.Right)
+                .ThenByDescending(x => x.BoundingBox.Bottom).ToList();
+            wordsH.AddRange(words270);
+
+            List<Word> wordsU = GetWords(
+                letters.Where(l => l.TextDirection == TextDirection.Unknown),
+                l => l.GlyphRectangle.Width, Distances.Manhattan)
+                .OrderByDescending(x => x.BoundingBox.Bottom)
+                .ThenBy(x => x.BoundingBox.Left).ToList();
+            wordsH.AddRange(wordsU);
+
+            return wordsH;
+        }
+
+        /// <summary>
+        /// Groups letters into words by linking each letter to the letter whose start base line
+        /// point is nearest to its own end base line point.
+        /// </summary>
+        /// <param name="pageLetters">The letters in the page, they must have
+        /// the same text directions.</param>
+        /// <param name="metric">The letter's metric to use in the minimum distance
+        /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
+        /// <param name="distMeasure">The distance measure between two start and end base line points,
+        /// e.g. the Manhattan distance.</param>
+        /// <returns>The words; a letter linked to no other letter forms a word on its own.</returns>
+        /// <exception cref="ArgumentException">If the letters have mixed text directions.</exception>
+        private static List<Word> GetWords(IEnumerable<Letter> pageLetters,
+            Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
+        {
+            if (pageLetters == null) return new List<Word>();
+
+            // Materialise once: the callers pass lazy queries, which Count(), ElementAt(0)
+            // and ToArray() used to re-evaluate one after the other.
+            Letter[] letters = pageLetters.ToArray();
+            int lettersCount = letters.Length;
+            if (lettersCount == 0) return new List<Word>();
+
+            var direction = letters[0].TextDirection;
+            if (letters.Any(x => x.TextDirection != direction))
+            {
+                throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction.");
+            }
+
+            PdfPoint[] startBaseLines = letters.Select(x => x.StartBaseLine).ToArray();
+            int[] indexes = Enumerable.Repeat(-1, lettersCount).ToArray();
+
+            // Find nearest neighbours indexes; every iteration writes its own slot of 'indexes' only
+            Parallel.For(0, lettersCount, c =>
+            {
+                var currentLetter = letters[c];
+
+                // only check neighbours if not a white space
+                if (string.IsNullOrWhiteSpace(currentLetter.Value)) return;
+
+                int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist);
+
+                // -1: no comparable distance. index == c: the letter is its own nearest neighbour,
+                // and pairing it with itself would put the same letter twice in one word.
+                if (index < 0 || index == c) return;
+
+                var pairedLetter = letters[index];
+                if (string.IsNullOrWhiteSpace(pairedLetter.Value) ||
+                    !string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase))
+                {
+                    return;
+                }
+
+                decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * MaximumDistanceRatio;
+
+                // compare as doubles: casting a very large distance to decimal would overflow
+                if (dist < (double)minDist)
+                {
+                    indexes[c] = index;
+                }
+            });
+
+            // Group indexes (insertion order is kept: it decides the order of the letters in a word)
+            List<List<int>> groupedIndexes = new List<List<int>>();
+            HashSet<int> indexDone = new HashSet<int>(); // only used for membership tests
+            for (int c = 0; c < lettersCount; c++)
+            {
+                int i = indexes[c];
+                if (i == -1) continue;
+
+                bool isDoneC = indexDone.Contains(c);
+                bool isDoneI = indexDone.Contains(i);
+                if (isDoneC || isDoneI)
+                {
+                    if (isDoneC && !isDoneI)
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
+                        {
+                            pair.Add(i);
+                        }
+                        indexDone.Add(i);
+                    }
+                    else if (!isDoneC && isDoneI)
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
+                        {
+                            pair.Add(c);
+                        }
+                        indexDone.Add(c);
+                    }
+                    else
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
+                        {
+                            if (!pair.Contains(c)) pair.Add(c);
+                        }
+
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
+                        {
+                            if (!pair.Contains(i)) pair.Add(i);
+                        }
+                    }
+                }
+                else
+                {
+                    groupedIndexes.Add(new List<int>() { c, i });
+                    indexDone.Add(c);
+                    indexDone.Add(i);
+                }
+            }
+
+            // Merge lists with common index
+            for (int c = 0; c < lettersCount; c++)
+            {
+                List<List<int>> candidates = groupedIndexes.Where(x => x.Contains(c)).ToList();
+                if (candidates.Count < 2) continue; // only one group with this index
+
+                List<int> merged = candidates.First();
+                groupedIndexes.Remove(merged);
+                for (int i = 1; i < candidates.Count; i++)
+                {
+                    var current = candidates[i];
+                    merged = merged.Union(current).ToList();
+                    groupedIndexes.Remove(current);
+                }
+                groupedIndexes.Add(merged);
+            }
+
+            // Grouped letters first, in group order
+            List<Word> words = new List<Word>();
+            foreach (List<int> group in groupedIndexes)
+            {
+                words.Add(new Word(group.Select(s => letters[s]).ToList()));
+            }
+
+            // then every letter left without a group, as a word of its own
+            foreach (int n in Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)))
+            {
+                words.Add(new Word(new Letter[] { letters[n] }));
+            }
+
+            return words;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
new file mode 100644
index 00000000..7893e71e
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
@@ -0,0 +1,411 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
+    /// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
+    /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
+    /// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
+    /// </summary>
+    public class RecursiveXYCut
+    {
+        /// <summary>
+        /// Get the blocks.
+        /// </summary>
+        /// <param name="pageWords">The words in a page.</param>
+        /// <param name="minimumWidht">The minimum width for a block.</param>
+        /// <param name="dominantFontWidth">The dominant font width.</param>
+        /// <param name="dominantFontHeight">The dominant font height.</param>
+        /// <returns>The root of the tree of blocks; a single leef when the page cannot be cut.</returns>
+        public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidht,
+            decimal dominantFontWidth, decimal dominantFontHeight)
+        {
+            return GetBlocks(pageWords, minimumWidht, k => dominantFontWidth, k => dominantFontHeight);
+        }
+
+        /// <summary>
+        /// Get the blocks.
+        /// </summary>
+        /// <param name="pageWords">The words in a page.</param>
+        /// <param name="minimumWidht">The minimum width for a block.</param>
+        /// <param name="dominantFontWidthFunc">The function that determines the dominant font width from the absolute glyph widths.</param>
+        /// <param name="dominantFontHeightFunc">The function that determines the dominant font height from the absolute glyph heights.</param>
+        /// <returns>The root of the tree of blocks; a single leef when the page cannot be cut.</returns>
+        public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidht,
+            Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
+            Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
+        {
+            var root = new XYLeef(pageWords);
+            return VerticalCut(root, minimumWidht, dominantFontWidthFunc, dominantFontHeightFunc);
+        }
+
+        /// <summary>
+        /// Splits the leef into columns: the words are projected on the X axis and a cut is made at
+        /// each horizontal gap wider than the dominant font width, provided the column on its left is at
+        /// least minimumWidht wide. Every column is then passed to <see cref="HorizontalCut"/>.
+        /// </summary>
+        private static XYNode VerticalCut(XYLeef leef, decimal minimumWidht,
+            Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
+            Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
+        {
+            // we stop cutting if only one word remains or if the width is too small
+            if (leef.CountWords() <= 1 || leef.BoundingBox.Width <= minimumWidht) return leef;
+
+            // order words left to right
+            var words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray();
+
+            // only white space words: nothing to cut, and words[0] below would be out of range
+            if (words.Length == 0) return leef;
+
+            // determine dominantFontWidth; the dominant font height plays no part in a vertical cut
+            decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters).Select(x => Math.Abs(x.GlyphRectangle.Width)));
+
+            List<decimal[]> projectionProfile = new List<decimal[]>();
+            decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Left, words[0].BoundingBox.Right };
+            int wordsCount = words.Length;
+            for (int i = 1; i < wordsCount; i++)
+            {
+                if ((words[i].BoundingBox.Left >= currentProj[0] && words[i].BoundingBox.Left <= currentProj[1])
+                    || (words[i].BoundingBox.Right >= currentProj[0] && words[i].BoundingBox.Right <= currentProj[1]))
+                {
+                    // it is overlapping
+                    if (words[i].BoundingBox.Left >= currentProj[0]
+                        && words[i].BoundingBox.Left <= currentProj[1]
+                        && words[i].BoundingBox.Right > currentProj[1])
+                    {
+                        // |____|
+                        //    |____|
+                        // |_______|    <- updated
+                        currentProj[1] = words[i].BoundingBox.Right;
+                    }
+
+                    // we ignore the following cases:
+                    //    |____|
+                    // |____|          (not possible because of OrderBy)
+                    //
+                    //    |____|
+                    //|___________|    (not possible because of OrderBy)
+                    //
+                    //  |____|
+                    //   |_|
+                }
+                else
+                {
+                    // no overlap
+                    if (words[i].BoundingBox.Left - currentProj[1] <= domFontWidth)
+                    {
+                        // if gap too small -> don't cut
+                        // |____| |____|
+                        currentProj[1] = words[i].BoundingBox.Right;
+                    }
+                    else if (currentProj[1] - currentProj[0] < minimumWidht)
+                    {
+                        // still too small
+                        currentProj[1] = words[i].BoundingBox.Right;
+                    }
+                    else
+                    {
+                        // if gap big enough -> cut!
+                        // |____|   |   |____|
+                        if (i != wordsCount - 1) // will always add the last one after
+                        {
+                            projectionProfile.Add(currentProj);
+                            currentProj = new decimal[2] { words[i].BoundingBox.Left, words[i].BoundingBox.Right };
+                        }
+                    }
+                }
+                if (i == wordsCount - 1) projectionProfile.Add(currentProj);
+            }
+
+            var newLeefsEnums = projectionProfile.Select(p => leef.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1]).ToArray()).ToArray();
+            var newLeefs = newLeefsEnums.Where(e => e.Length > 0).Select(e => new XYLeef(e));
+
+            var newNodes = newLeefs.Select(l => HorizontalCut(l, minimumWidht,
+                dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
+
+            var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
+            if (lost.Count > 0)
+            {
+                newNodes.AddRange(lost.Select(w => new XYLeef(w)));
+            }
+
+            return new XYNode(newNodes);
+        }
+
+        /// <summary>
+        /// Splits the leef into rows: the words are projected on the Y axis and a cut is made at each
+        /// vertical gap taller than the dominant font height. Every row is then passed to <see cref="VerticalCut"/>.
+        /// </summary>
+        private static XYNode HorizontalCut(XYLeef leef, decimal minimumWidht,
+            Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
+            Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
+        {
+            // we stop cutting if only one word remains
+            if (leef.CountWords() <= 1) return leef;
+
+            var words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top
+
+            // only white space words: nothing to cut, and words[0] below would be out of range
+            if (words.Length == 0) return leef;
+
+            // determine dominantFontHeight; the dominant font width plays no part in a horizontal cut
+            decimal domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters).Select(x => Math.Abs(x.GlyphRectangle.Height)));
+
+            List<decimal[]> projectionProfile = new List<decimal[]>();
+            decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Bottom, words[0].BoundingBox.Top };
+            int wordsCount = words.Length;
+            for (int i = 1; i < wordsCount; i++)
+            {
+                if ((words[i].BoundingBox.Bottom >= currentProj[0] && words[i].BoundingBox.Bottom <= currentProj[1])
+                    || (words[i].BoundingBox.Top >= currentProj[0] && words[i].BoundingBox.Top <= currentProj[1]))
+                {
+                    // it is overlapping
+                    if (words[i].BoundingBox.Bottom >= currentProj[0]
+                        && words[i].BoundingBox.Bottom <= currentProj[1]
+                        && words[i].BoundingBox.Top > currentProj[1])
+                    {
+                        currentProj[1] = words[i].BoundingBox.Top;
+                    }
+                }
+                else
+                {
+                    // no overlap
+                    if (words[i].BoundingBox.Bottom - currentProj[1] <= domFontHeight)
+                    {
+                        // if gap too small -> don't cut
+                        // |____| |____|
+                        currentProj[1] = words[i].BoundingBox.Top;
+                    }
+                    else
+                    {
+                        // if gap big enough -> cut!
+                        // |____|   |   |____|
+                        if (i != wordsCount - 1) // will always add the last one after
+                        {
+                            projectionProfile.Add(currentProj);
+                            currentProj = new decimal[2] { words[i].BoundingBox.Bottom, words[i].BoundingBox.Top };
+                        }
+                    }
+                }
+                if (i == wordsCount - 1) projectionProfile.Add(currentProj);
+            }
+
+            if (projectionProfile.Count == 1) // no horizontal cut was found
+            {
+                if (level >= 1)
+                {
+                    return leef; // an earlier horizontal pass already failed to cut: stop here
+                }
+                else
+                {
+                    level++; // remember the failed pass, then still try a vertical cut
+                }
+            }
+
+            var newLeefsEnums = projectionProfile.Select(p =>
+                leef.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1]).ToArray()).ToArray();
+            var newLeefs = newLeefsEnums.Where(e => e.Length > 0).Select(e => new XYLeef(e));
+            var newNodes = newLeefs.Select(l => VerticalCut(l, minimumWidht,
+                dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
+
+            var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
+            if (lost.Count > 0)
+            {
+                newNodes.AddRange(lost.Select(w => new XYLeef(w)));
+            }
+            return new XYNode(newNodes);
+        }
+    }
+
+    /// <summary>
+    /// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
+    /// </summary>
+    public class XYNode
+    {
+        /// <summary>
+        /// Returns true if this node is a leef, false otherwise.
+        /// </summary>
+        public virtual bool IsLeef => false;
+
+        /// <summary>
+        /// The rectangle completely containing the node.
+        /// </summary>
+        public PdfRectangle BoundingBox { get; set; }
+
+        /// <summary>
+        /// The children of the node. Left unset by the constructors when no child is given.
+        /// </summary>
+        public XYNode[] Children { get; set; }
+
+        /// <summary>
+        /// Recursively counts the words included in this node.
+        /// </summary>
+        /// <returns>The sum of the word counts of every leef below this node, 0 without children.</returns>
+        public virtual int CountWords()
+        {
+            if (Children == null) return 0;
+            int count = 0;
+            RecursiveCount(Children, ref count);
+            return count;
+        }
+
+        /// <summary>
+        /// Recursively gets the leefs (last nodes) of this node.
+        /// </summary>
+        /// <returns>The leefs below this node; an empty list when the node has no children.</returns>
+        public virtual List<XYLeef> GetLeefs()
+        {
+            List<XYLeef> leefs = new List<XYLeef>();
+            if (Children == null || Children.Length == 0) return leefs;
+            int level = 0;
+            RecursiveGetLeefs(Children, ref leefs, level);
+            return leefs;
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYNode"/>.
+        /// </summary>
+        /// <param name="children">The children of the node; may be null or empty.</param>
+        public XYNode(params XYNode[] children)
+            : this(children?.ToList())
+        {
+
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYNode"/>.
+        /// </summary>
+        /// <param name="children">The children of the node; when null or empty, Children and BoundingBox are not set.</param>
+        public XYNode(IEnumerable<XYNode> children)
+        {
+            // materialise once: a lazy query would otherwise be evaluated again by every call below
+            XYNode[] nodes = children?.ToArray();
+            if (nodes != null && nodes.Length != 0)
+            {
+                Children = nodes;
+                decimal left = nodes.Min(b => b.BoundingBox.Left);
+                decimal right = nodes.Max(b => b.BoundingBox.Right);
+                decimal bottom = nodes.Min(b => b.BoundingBox.Bottom);
+                decimal top = nodes.Max(b => b.BoundingBox.Top);
+                BoundingBox = new PdfRectangle(left, bottom, right, top);
+            }
+        }
+
+        /// <summary>Adds to <paramref name="count"/> the words of every leef found in or below the given nodes.</summary>
+        private void RecursiveCount(IEnumerable<XYNode> children, ref int count)
+        {
+            if (children == null) return; // a node created without children has null Children
+            foreach (XYNode node in children)
+            {
+                if (node.IsLeef) count += node.CountWords();
+                else RecursiveCount(node.Children, ref count);
+            }
+        }
+
+        /// <summary>
+        /// Appends the leefs of the given nodes, then descends into the other nodes, sorted by Left on even levels and by descending Top on odd ones.
+        /// </summary>
+        private void RecursiveGetLeefs(IEnumerable<XYNode> children, ref List<XYLeef> leefs, int level)
+        {
+            if (children == null) return; // a node created without children has null Children
+            bool isVerticalCut = level % 2 == 0;
+
+            foreach (XYLeef node in children.Where(x => x.IsLeef))
+            {
+                leefs.Add(node);
+            }
+
+            level++;
+
+            IEnumerable<XYNode> notLeefs = children.Where(x => !x.IsLeef);
+
+            if (isVerticalCut)
+            {
+                notLeefs = notLeefs.OrderBy(x => x.BoundingBox.Left).ToList();
+            }
+            else
+            {
+                notLeefs = notLeefs.OrderByDescending(x => x.BoundingBox.Top).ToList();
+            }
+
+            foreach (XYNode node in notLeefs)
+            {
+                RecursiveGetLeefs(node.Children, ref leefs, level);
+            }
+        }
+
+        /// <summary>Returns "Leef" when <see cref="IsLeef"/> is true, "Node" otherwise.</summary>
+        public override string ToString() => IsLeef ? "Leef" : "Node";
+    }
+
+    /// <summary>
+    /// A Leef node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
+    /// </summary>
+    public class XYLeef : XYNode
+    {
+        /// <summary>
+        /// Returns true if this node is a leef, false otherwise.
+        /// </summary>
+        public override bool IsLeef => true;
+
+        /// <summary>
+        /// The words in the leef.
+        /// </summary>
+        public Word[] Words { get; set; }
+
+        /// <summary>
+        /// The number of words in the leef.
+        /// </summary>
+        /// <returns>The length of <see cref="Words"/>, or 0 when it is null.</returns>
+        public override int CountWords() => Words == null ? 0 : Words.Length;
+
+        /// <summary>
+        /// Returns null as a leef doesn't have leefs.
+        /// </summary>
+        /// <returns>Always null.</returns>
+        public override List<XYLeef> GetLeefs()
+        {
+            return null;
+        }
+
+        /// <summary>
+        /// Gets the lines of the leef.
+        /// </summary>
+        /// <returns>One line per distinct bounding box bottom of the words, ordered by descending bottom.</returns>
+        public TextLine[] GetLines()
+        {
+            var lines = Words.GroupBy(x => x.BoundingBox.Bottom).OrderByDescending(g => g.Key);
+            return lines.Select(g => new TextLine(g.ToList())).ToArray();
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYLeef"/>.
+        /// </summary>
+        /// <param name="words">The words contained in the leef.</param>
+        public XYLeef(params Word[] words) : this((IEnumerable<Word>)words)
+        {
+
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYLeef"/>.
+        /// </summary>
+        /// <param name="words">The words contained in the leef; must not be null, and Min/Max below throw on an empty sequence.</param>
+        public XYLeef(IEnumerable<Word> words) : base(null)
+        {
+            if (words == null) throw new ArgumentNullException(nameof(words));
+            Words = words.ToArray(); // materialise once: callers pass lazy LINQ queries
+
+            decimal left = Words.Min(b => b.BoundingBox.Left);
+            decimal right = Words.Max(b => b.BoundingBox.Right);
+            decimal bottom = Words.Min(b => b.BoundingBox.Bottom);
+            decimal top = Words.Max(b => b.BoundingBox.Top);
+            BoundingBox = new PdfRectangle(left, bottom, right, top);
+        }
+    }
+}