diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 662780b3..0e38466d 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -51,6 +51,12 @@
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.NearestNeighbourWordExtractor",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
new file mode 100644
index 00000000..2b06eea9
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
@@ -0,0 +1,129 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Contains helpful tools for distance measures.
+    /// </summary>
+    public static class Distances
+    {
+        /// <summary>
+        /// The Euclidean distance is the "ordinary" straight-line distance between two points.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        /// <returns>The square root of the sum of the squared coordinate differences.</returns>
+        public static double Euclidean(PdfPoint point1, PdfPoint point2)
+        {
+            double dx = (double)(point1.X - point2.X);
+            double dy = (double)(point1.Y - point2.Y);
+            return Math.Sqrt(dx * dx + dy * dy);
+        }
+
+        /// <summary>
+        /// The weighted Euclidean distance.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        /// <param name="wX">The weight of the X coordinates. Default is 1.</param>
+        /// <param name="wY">The weight of the Y coordinates. Default is 1.</param>
+        /// <returns>The square root of the weighted sum of the squared coordinate differences.</returns>
+        public static double WeightedEuclidean(PdfPoint point1, PdfPoint point2, double wX = 1.0, double wY = 1.0)
+        {
+            double dx = (double)(point1.X - point2.X);
+            double dy = (double)(point1.Y - point2.Y);
+            return Math.Sqrt(wX * dx * dx + wY * dy * dy);
+        }
+
+        /// <summary>
+        /// The Manhattan distance between two points is the sum of the absolute differences of their Cartesian coordinates.
+        /// <para>Also known as rectilinear distance, L1 distance, L1 norm, snake distance, city block distance, taxicab metric.</para>
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        /// <returns>The sum of the absolute X and Y differences.</returns>
+        public static double Manhattan(PdfPoint point1, PdfPoint point2)
+        {
+            return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y));
+        }
+
+        /// <summary>
+        /// Find the nearest point.
+        /// </summary>
+        /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
+        /// <param name="points">The list of neighbours candidates.</param>
+        /// <param name="distanceMeasure">The distance measure to use. It is invoked as (candidate, reference point).</param>
+        /// <param name="distance">The distance between reference point, and its nearest neighbour.</param>
+        /// <returns>The nearest candidate, or the default point when no candidate is closer than <see cref="double.MaxValue"/>.</returns>
+        /// <exception cref="ArgumentException">The candidate list is null or empty, or the distance measure is null.</exception>
+        public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
+            Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
+        {
+            if (points == null || points.Count == 0)
+            {
+                throw new ArgumentException("Distances.FindNearest(): The list of neighbours candidates is either null or empty.", nameof(points));
+            }
+
+            if (distanceMeasure == null)
+            {
+                throw new ArgumentException("Distances.FindNearest(): The distance measure must not be null.", nameof(distanceMeasure));
+            }
+
+            int index = IndexOfNearest(pdfPoint, points, distanceMeasure, out distance);
+            return index < 0 ? default(PdfPoint) : points[index];
+        }
+
+        /// <summary>
+        /// Find the index of the nearest point.
+        /// </summary>
+        /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
+        /// <param name="points">The list of neighbours candidates.</param>
+        /// <param name="distanceMeasure">The distance measure to use. It is invoked as (candidate, reference point).</param>
+        /// <param name="distance">The distance between reference point, and its nearest neighbour.</param>
+        /// <returns>The index of the nearest candidate, or -1 when no candidate is closer than <see cref="double.MaxValue"/>.</returns>
+        /// <exception cref="ArgumentException">The candidate list is null or empty, or the distance measure is null.</exception>
+        public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
+            Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
+        {
+            if (points == null || points.Count == 0)
+            {
+                throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", nameof(points));
+            }
+
+            if (distanceMeasure == null)
+            {
+                throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", nameof(distanceMeasure));
+            }
+
+            return IndexOfNearest(pdfPoint, points, distanceMeasure, out distance);
+        }
+
+        /// <summary>
+        /// Linear scan shared by the two public Find methods; the arguments are expected to be validated already.
+        /// </summary>
+        private static int IndexOfNearest(PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
+            Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
+        {
+            distance = double.MaxValue;
+            int closestPointIndex = -1;
+
+            for (var i = 0; i < points.Count; i++)
+            {
+                // The candidate is passed first and the reference point second.
+                double currentDistance = distanceMeasure(points[i], pdfPoint);
+
+                // Strict comparison: on ties the earliest candidate wins.
+                if (currentDistance < distance)
+                {
+                    distance = currentDistance;
+                    closestPointIndex = i;
+                }
+            }
+
+            return closestPointIndex;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs
new file mode 100644
index 00000000..295c524e
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs
@@ -0,0 +1,40 @@
+using System.Collections.Generic;
+using System.Linq;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Useful math extensions.
+    /// </summary>
+    public static class MathExtensions
+    {
+        /// <summary>
+        /// Computes the mode of a sequence of float values.
+        /// </summary>
+        /// <param name="array">The array of floats.</param>
+        /// <returns>The most frequent value (the first one encountered on ties), or <see cref="float.NaN"/> when the sequence is null or empty.</returns>
+        public static float Mode(this IEnumerable<float> array)
+        {
+            if (array == null)
+            {
+                return float.NaN;
+            }
+
+            // Group directly instead of counting first, so that a lazy sequence is only enumerated once.
+            var mostFrequent = array.GroupBy(v => v).OrderByDescending(g => g.Count()).FirstOrDefault();
+            return mostFrequent == null ? float.NaN : mostFrequent.Key;
+        }
+
+        /// <summary>
+        /// Computes the mode of a sequence of decimal values.
+        /// </summary>
+        /// <param name="array">The array of decimal.</param>
+        /// <returns>The most frequent value (the first one encountered on ties).</returns>
+        /// <exception cref="System.ArgumentNullException">The sequence is null.</exception>
+        /// <exception cref="System.InvalidOperationException">The sequence is empty.</exception>
+        public static decimal Mode(this IEnumerable<decimal> array)
+        {
+            return array.GroupBy(v => v).OrderByDescending(g => g.Count()).First().Key;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
new file mode 100644
index 00000000..34455cda
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
@@ -0,0 +1,240 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+using UglyToad.PdfPig.Util;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Nearest Neighbour Word Extractor, using the <see cref="Distances.Manhattan"/> distance.
+    /// This implementation leverages bounding boxes.
+    /// </summary>
+    public class NearestNeighbourWordExtractor : IWordExtractor
+    {
+        /// <summary>
+        /// Create an instance of Nearest Neighbour Word Extractor, <see cref="NearestNeighbourWordExtractor"/>.
+        /// </summary>
+        public static IWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
+
+        /// <summary>
+        /// Gets the words.
+        /// </summary>
+        /// <param name="letters">The letters in the page.</param>
+        /// <returns>The words of each text direction in turn: Horizontal, Rotate180, Rotate90, Rotate270, then Unknown.</returns>
+        public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
+        {
+            List<Word> wordsH = GetWords(
+                    letters.Where(l => l.TextDirection == TextDirection.Horizontal),
+                    l => l.GlyphRectangle.Width, Distances.Manhattan)
+                .OrderByDescending(x => x.BoundingBox.Bottom)
+                .ThenBy(x => x.BoundingBox.Left).ToList();
+
+            List<Word> words180 = GetWords(
+                    letters.Where(l => l.TextDirection == TextDirection.Rotate180),
+                    l => l.GlyphRectangle.Width, Distances.Manhattan)
+                .OrderBy(x => x.BoundingBox.Top)
+                .ThenByDescending(x => x.BoundingBox.Right).ToList();
+            wordsH.AddRange(words180);
+
+            List<Word> words90 = GetWords(
+                    letters.Where(l => l.TextDirection == TextDirection.Rotate90),
+                    l => l.GlyphRectangle.Height, Distances.Manhattan)
+                .OrderByDescending(x => x.BoundingBox.Left)
+                .ThenBy(x => x.BoundingBox.Top).ToList();
+            wordsH.AddRange(words90);
+
+            List<Word> words270 = GetWords(
+                    letters.Where(l => l.TextDirection == TextDirection.Rotate270),
+                    l => l.GlyphRectangle.Height, Distances.Manhattan)
+                .OrderBy(x => x.BoundingBox.Right)
+                .ThenByDescending(x => x.BoundingBox.Bottom).ToList();
+            wordsH.AddRange(words270);
+
+            List<Word> wordsU = GetWords(
+                    letters.Where(l => l.TextDirection == TextDirection.Unknown),
+                    l => l.GlyphRectangle.Width, Distances.Manhattan)
+                .OrderByDescending(x => x.BoundingBox.Bottom)
+                .ThenBy(x => x.BoundingBox.Left).ToList();
+            wordsH.AddRange(wordsU);
+
+            return wordsH;
+        }
+
+        /// <summary>
+        /// Private method to get the words.
+        /// </summary>
+        /// <param name="pageLetters">The letters in the page, they must have
+        /// the same text directions.</param>
+        /// <param name="metric">The letter's metric to use in the minimum distance
+        /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
+        /// <param name="distMeasure">The distance measure between two start and end base line points,
+        /// e.g. the Manhattan distance.</param>
+        private static List<Word> GetWords(IEnumerable<Letter> pageLetters,
+            Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
+        {
+            // Materialise once: callers pass lazy queries that would otherwise be re-evaluated on every pass.
+            Letter[] letters = pageLetters?.ToArray();
+            if (letters == null || letters.Length == 0)
+            {
+                return new List<Word>();
+            }
+
+            int lettersCount = letters.Length;
+            TextDirection textDirection = letters[0].TextDirection;
+
+            if (letters.Any(x => textDirection != x.TextDirection))
+            {
+                throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction.");
+            }
+
+            // The order of the letters inside a word depends on the text direction.
+            Func<IEnumerable<Letter>, IReadOnlyList<Letter>> orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();
+            if (textDirection == TextDirection.Rotate180)
+            {
+                orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Right).ToList();
+            }
+            else if (textDirection == TextDirection.Rotate90)
+            {
+                orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Top).ToList();
+            }
+            else if (textDirection == TextDirection.Rotate270)
+            {
+                orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Bottom).ToList();
+            }
+
+            List<PdfPoint> startBaseLines = letters.Select(x => x.StartBaseLine).ToList();
+
+            // indexes[c] is the letter whose start point is close enough to the end point of letter c, or -1.
+            int[] indexes = Enumerable.Repeat(-1, lettersCount).ToArray();
+
+            // Find nearest neighbours indexes. Each iteration only writes its own slot of 'indexes'.
+            Parallel.For(0, lettersCount, c =>
+            {
+                var currentLetter = letters[c];
+
+                // only check neighbours if not a white space
+                if (string.IsNullOrWhiteSpace(currentLetter.Value))
+                {
+                    return;
+                }
+
+                int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist);
+
+                // No usable neighbour, or the letter's own start point is the closest one:
+                // pairing a letter with itself would put it twice in the same word.
+                if (index < 0 || index == c)
+                {
+                    return;
+                }
+
+                var pairedLetter = letters[index];
+
+                if (!string.IsNullOrWhiteSpace(pairedLetter.Value) &&
+                    string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase))
+                {
+                    decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * 0.60m;
+                    if ((decimal)dist < minDist)
+                    {
+                        indexes[c] = index;
+                    }
+                }
+            });
+
+            // Group indexes
+            List<List<int>> groupedIndexes = new List<List<int>>();
+            HashSet<int> indexDone = new HashSet<int>();
+            for (int c = 0; c < lettersCount; c++)
+            {
+                int i = indexes[c];
+                if (i == -1)
+                {
+                    continue;
+                }
+
+                bool isDoneC = indexDone.Contains(c);
+                bool isDoneI = indexDone.Contains(i);
+                if (isDoneC || isDoneI)
+                {
+                    if (isDoneC && !isDoneI)
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
+                        {
+                            pair.Add(i);
+                        }
+                        indexDone.Add(i);
+                    }
+                    else if (!isDoneC && isDoneI)
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
+                        {
+                            pair.Add(c);
+                        }
+                        indexDone.Add(c);
+                    }
+                    else
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
+                        {
+                            if (!pair.Contains(c))
+                            {
+                                pair.Add(c);
+                            }
+                        }
+
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
+                        {
+                            if (!pair.Contains(i))
+                            {
+                                pair.Add(i);
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    groupedIndexes.Add(new List<int>() { c, i });
+                    indexDone.Add(c);
+                    indexDone.Add(i);
+                }
+            }
+
+            // Merge lists with common index
+            for (int c = 0; c < lettersCount; c++)
+            {
+                List<List<int>> candidates = groupedIndexes.Where(x => x.Any(t => t == c)).ToList();
+                if (candidates.Count < 2)
+                {
+                    continue; // only one group with this index
+                }
+
+                List<int> merged = candidates.First();
+                groupedIndexes.Remove(merged);
+                for (int i = 1; i < candidates.Count; i++)
+                {
+                    var current = candidates[i];
+                    merged = merged.Union(current).ToList();
+                    groupedIndexes.Remove(current);
+                }
+                groupedIndexes.Add(merged);
+            }
+
+            List<Word> words = new List<Word>();
+            foreach (List<int> indexGroup in groupedIndexes)
+            {
+                List<Letter> groupedLetters = indexGroup.Select(s => letters[s]).ToList();
+                words.Add(new Word(orderFunc(groupedLetters)));
+            }
+
+            // Letters that were never grouped become single-letter words.
+            foreach (int n in Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)))
+            {
+                words.Add(new Word(new Letter[] { letters[n] }));
+            }
+
+            return words;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
new file mode 100644
index 00000000..a961ff17
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
@@ -0,0 +1,235 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Content;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
+    /// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
+    /// <para>https://en.wikipedia.org/wiki/Recursive_X-Y_cut</para>
+    /// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
+    /// </summary>
+    public class RecursiveXYCut
+    {
+        /// <summary>
+        /// Get the blocks.
+        /// </summary>
+        /// <param name="pageWords">The words in the page.</param>
+        /// <param name="minimumWidth">The minimum width for a block.</param>
+        public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth = 0)
+        {
+            // Defaults: the mode of the letter widths, and 1.5 times the mode of the letter heights.
+            return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5m, 3));
+        }
+
+        /// <summary>
+        /// Get the blocks.
+        /// </summary>
+        /// <param name="pageWords">The words in the page.</param>
+        /// <param name="minimumWidth">The minimum width for a block.</param>
+        /// <param name="dominantFontWidth">The dominant font width.</param>
+        /// <param name="dominantFontHeight">The dominant font height.</param>
+        public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
+            decimal dominantFontWidth, decimal dominantFontHeight)
+        {
+            return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight);
+        }
+
+        /// <summary>
+        /// Get the blocks.
+        /// </summary>
+        /// <param name="pageWords">The words in the page.</param>
+        /// <param name="minimumWidth">The minimum width for a block.</param>
+        /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
+        /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
+        public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
+            Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
+            Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
+        {
+            var root = new XYLeaf(pageWords); // Create a root node.
+            return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
+        }
+
+        /// <summary>
+        /// Cuts the leaf along the X axis wherever the gap between words is wide enough, then passes each part to <see cref="HorizontalCut"/>.
+        /// </summary>
+        private static XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth,
+            Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
+            Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
+        {
+            if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
+            {
+                // we stop cutting if
+                // - only one word remains
+                // - width is too small
+                return leaf;
+            }
+
+            // order words left to right
+            var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray();
+            if (words.Length == 0)
+            {
+                // only white space words: nothing to project, and words[0] below would be out of range
+                return leaf;
+            }
+
+            // the horizontal gap threshold; the dominant font height is only needed by HorizontalCut
+            decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
+                .Select(x => Math.Abs(x.GlyphRectangle.Width)));
+
+            // each entry is a { left, right } interval on the X axis
+            List<decimal[]> projectionProfile = new List<decimal[]>();
+            decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Left, words[0].BoundingBox.Right };
+            int wordsCount = words.Length;
+            for (int i = 1; i < wordsCount; i++)
+            {
+                decimal left = words[i].BoundingBox.Left;
+                decimal right = words[i].BoundingBox.Right;
+
+                bool leftInside = left >= currentProj[0] && left <= currentProj[1];
+                bool rightInside = right >= currentProj[0] && right <= currentProj[1];
+
+                if (leftInside || rightInside)
+                {
+                    // overlapping: extend the interval if the word sticks out on the right.
+                    // The other overlap cases either cannot happen (words are ordered by Left)
+                    // or leave the interval unchanged (word fully inside).
+                    if (leftInside && right > currentProj[1])
+                    {
+                        currentProj[1] = right;
+                    }
+                }
+                else if (left - currentProj[1] <= domFontWidth || currentProj[1] - currentProj[0] < minimumWidth)
+                {
+                    // no overlap, but the gap is too small or the interval is still too narrow -> don't cut
+                    currentProj[1] = right;
+                }
+                else if (i != wordsCount - 1) // the last interval is always added below
+                {
+                    // gap big enough -> cut!
+                    projectionProfile.Add(currentProj);
+                    currentProj = new decimal[2] { left, right };
+                }
+
+                if (i == wordsCount - 1)
+                {
+                    projectionProfile.Add(currentProj);
+                }
+            }
+
+            // materialise the word groups so that each one is only evaluated once
+            var newLeafsEnums = projectionProfile
+                .Select(p => leaf.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1]).ToList())
+                .ToList();
+
+            var newNodes = newLeafsEnums.Where(e => e.Count > 0)
+                .Select(e => HorizontalCut(new XYLeaf(e), minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level))
+                .ToList();
+
+            // non white space words that fall in none of the intervals are kept as single-word leafs
+            var lost = leaf.Words.Except(newLeafsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
+            if (lost.Count > 0)
+            {
+                newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
+            }
+
+            return new XYNode(newNodes);
+        }
+
+        /// <summary>
+        /// Cuts the leaf along the Y axis wherever the gap between words is tall enough, then passes each part to <see cref="VerticalCut"/>.
+        /// </summary>
+        private static XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth,
+            Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
+            Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
+        {
+            if (leaf.CountWords() <= 1)
+            {
+                // we stop cutting if
+                // - only one word remains
+                return leaf;
+            }
+
+            // order words bottom to top
+            var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray();
+            if (words.Length == 0)
+            {
+                // only white space words: nothing to project, and words[0] below would be out of range
+                return leaf;
+            }
+
+            // the vertical gap threshold; the dominant font width is only needed by VerticalCut
+            decimal domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters)
+                .Select(x => Math.Abs(x.GlyphRectangle.Height)));
+
+            // each entry is a { bottom, top } interval on the Y axis
+            List<decimal[]> projectionProfile = new List<decimal[]>();
+            decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Bottom, words[0].BoundingBox.Top };
+            int wordsCount = words.Length;
+            for (int i = 1; i < wordsCount; i++)
+            {
+                decimal bottom = words[i].BoundingBox.Bottom;
+                decimal top = words[i].BoundingBox.Top;
+
+                bool bottomInside = bottom >= currentProj[0] && bottom <= currentProj[1];
+                bool topInside = top >= currentProj[0] && top <= currentProj[1];
+
+                if (bottomInside || topInside)
+                {
+                    // overlapping: extend the interval if the word sticks out at the top
+                    if (bottomInside && top > currentProj[1])
+                    {
+                        currentProj[1] = top;
+                    }
+                }
+                else if (bottom - currentProj[1] <= domFontHeight)
+                {
+                    // no overlap, but the gap is too small -> don't cut
+                    currentProj[1] = top;
+                }
+                else if (i != wordsCount - 1) // the last interval is always added below
+                {
+                    // gap big enough -> cut!
+                    projectionProfile.Add(currentProj);
+                    currentProj = new decimal[2] { bottom, top };
+                }
+
+                if (i == wordsCount - 1)
+                {
+                    projectionProfile.Add(currentProj);
+                }
+            }
+
+            if (projectionProfile.Count == 1)
+            {
+                // a single interval means no cut was made here: allow one more pass at the next level, then stop
+                if (level >= 1)
+                {
+                    return leaf;
+                }
+
+                level++;
+            }
+
+            // materialise the word groups so that each one is only evaluated once
+            var newLeafsEnums = projectionProfile
+                .Select(p => leaf.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1]).ToList())
+                .ToList();
+
+            var newNodes = newLeafsEnums.Where(e => e.Count > 0)
+                .Select(e => VerticalCut(new XYLeaf(e), minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level))
+                .ToList();
+
+            // non white space words that fall in none of the intervals are kept as single-word leafs
+            var lost = leaf.Words.Except(newLeafsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
+            if (lost.Count > 0)
+            {
+                newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
+            }
+
+            return new XYNode(newNodes);
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs
new file mode 100644
index 00000000..9dab8daf
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs
@@ -0,0 +1,82 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// A Leaf node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
+    /// </summary>
+    public class XYLeaf : XYNode
+    {
+        /// <summary>
+        /// Returns true if this node is a leaf, false otherwise.
+        /// </summary>
+        public override bool IsLeaf => true;
+
+        /// <summary>
+        /// The words in the leaf.
+        /// </summary>
+        public IReadOnlyList<Word> Words { get; }
+
+        /// <summary>
+        /// The number of words in the leaf.
+        /// </summary>
+        public override int CountWords() => Words?.Count ?? 0;
+
+        /// <summary>
+        /// Returns null as a leaf doesn't have leafs.
+        /// </summary>
+        public override List<XYLeaf> GetLeafs()
+        {
+            return null;
+        }
+
+        /// <summary>
+        /// Gets the lines of the leaf.
+        /// </summary>
+        /// <returns>One line per distinct bounding box bottom, ordered from the highest bottom to the lowest.</returns>
+        public IReadOnlyList<TextLine> GetLines()
+        {
+            return Words.GroupBy(x => x.BoundingBox.Bottom).OrderByDescending(x => x.Key)
+                .Select(x => new TextLine(x.ToList())).ToArray();
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYLeaf"/>.
+        /// </summary>
+        /// <param name="words">The words contained in the leaf.</param>
+        public XYLeaf(params Word[] words) : this((IEnumerable<Word>)words)
+        {
+
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYLeaf"/>.
+        /// </summary>
+        /// <param name="words">The words contained in the leaf.</param>
+        /// <exception cref="ArgumentException">The words are null.</exception>
+        /// <exception cref="InvalidOperationException">The words are empty, so no bounding box can be computed.</exception>
+        public XYLeaf(IEnumerable<Word> words) : base(null)
+        {
+            if (words == null)
+            {
+                throw new ArgumentException("XYLeaf(): The words contained in the leaf cannot be null.", nameof(words));
+            }
+
+            // Materialise once: the sequence may be a lazy query, and it is needed five times below.
+            Word[] wordsArray = words.ToArray();
+
+            decimal left = wordsArray.Min(b => b.BoundingBox.Left);
+            decimal right = wordsArray.Max(b => b.BoundingBox.Right);
+
+            decimal bottom = wordsArray.Min(b => b.BoundingBox.Bottom);
+            decimal top = wordsArray.Max(b => b.BoundingBox.Top);
+
+            BoundingBox = new PdfRectangle(left, bottom, right, top);
+            Words = wordsArray;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs
new file mode 100644
index 00000000..70620807
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs
@@ -0,0 +1,133 @@
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Geometry;
+using UglyToad.PdfPig.Util;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
+    /// </summary>
+    public class XYNode
+    {
+        /// <summary>
+        /// Returns true if this node is a leaf, false otherwise.
+        /// </summary>
+        public virtual bool IsLeaf => false;
+
+        /// <summary>
+        /// The rectangle completely containing the node.
+        /// </summary>
+        public PdfRectangle BoundingBox { get; set; }
+
+        /// <summary>
+        /// The children of the node.
+        /// </summary>
+        public XYNode[] Children { get; set; }
+
+        /// <summary>
+        /// Recursively counts the words included in this node.
+        /// </summary>
+        public virtual int CountWords()
+        {
+            return RecursiveCount(Children);
+        }
+
+        /// <summary>
+        /// Recursively gets the leafs (last nodes) of this node.
+        /// </summary>
+        public virtual List<XYLeaf> GetLeafs()
+        {
+            List<XYLeaf> leafs = new List<XYLeaf>();
+            RecursiveGetLeafs(Children, leafs, 0);
+            return leafs;
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYNode"/>.
+        /// </summary>
+        /// <param name="children">The node's children.</param>
+        public XYNode(params XYNode[] children)
+            : this(children?.ToList())
+        {
+
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYNode"/>.
+        /// </summary>
+        /// <param name="children">The node's children.</param>
+        public XYNode(IEnumerable<XYNode> children)
+        {
+            // Materialise once: the sequence may be a lazy query, and it is needed several times below.
+            XYNode[] nodes = children?.ToArray();
+            if (nodes != null && nodes.Length != 0)
+            {
+                Children = nodes;
+                decimal left = nodes.Min(b => b.BoundingBox.Left);
+                decimal right = nodes.Max(b => b.BoundingBox.Right);
+                decimal bottom = nodes.Min(b => b.BoundingBox.Bottom);
+                decimal top = nodes.Max(b => b.BoundingBox.Top);
+                BoundingBox = new PdfRectangle(left, bottom, right, top);
+            }
+            else
+            {
+                Children = EmptyArray<XYNode>.Instance;
+            }
+        }
+
+        /// <summary>
+        /// Sums the word counts of the leafs found under the given children; a null array counts as zero.
+        /// </summary>
+        private static int RecursiveCount(XYNode[] children)
+        {
+            if (children == null)
+            {
+                return 0;
+            }
+
+            int count = 0;
+            foreach (XYNode node in children)
+            {
+                count += node.IsLeaf ? node.CountWords() : RecursiveCount(node.Children);
+            }
+
+            return count;
+        }
+
+        /// <summary>
+        /// Appends to <paramref name="leafs"/> the leafs found under the given children, descending level by level.
+        /// </summary>
+        private static void RecursiveGetLeafs(XYNode[] children, List<XYLeaf> leafs, int level)
+        {
+            if (children == null || children.Length == 0)
+            {
+                return;
+            }
+
+            // the leafs of this level first, in their stored order
+            foreach (XYNode node in children.Where(x => x.IsLeaf))
+            {
+                leafs.Add((XYLeaf)node);
+            }
+
+            // then the inner nodes: by ascending Left on even levels, by descending Top on odd levels
+            bool isVerticalCut = level % 2 == 0;
+            IEnumerable<XYNode> notLeafs = children.Where(x => !x.IsLeaf);
+            notLeafs = isVerticalCut
+                ? notLeafs.OrderBy(x => x.BoundingBox.Left)
+                : notLeafs.OrderByDescending(x => x.BoundingBox.Top);
+
+            foreach (XYNode node in notLeafs)
+            {
+                RecursiveGetLeafs(node.Children, leafs, level + 1);
+            }
+        }
+
+        /// <inheritdoc />
+        public override string ToString()
+        {
+            return IsLeaf ? "Leaf" : "Node";
+        }
+    }
+}