From eb9a9fd00ecb14409a132e0d20e07c4fbce24acc Mon Sep 17 00:00:00 2001
From: BobLd <BobLd@email>
Date: Sat, 10 Aug 2019 16:01:27 +0100
Subject: [PATCH 1/4] Document Layout Analysis - IPageSegmenter, Docstrum -
 Create a TextBlock class - Creates IPageSegmenter - Add other useful
 distances: angle, etc. - Update RecursiveXYCut  - With IPageSegmenter and
 TextBlock  - Make XYNode and XYLeaf internal - Optimise (faster)
 NearestNeighbourWordExtractor and isolate the clustering algorithms for use
 outside of this class - Implement a Docstrum inspired page segmentation
 algorithm

---
 .../PublicApiScannerTests.cs                  |   6 +-
 src/UglyToad.PdfPig/Content/TextBlock.cs      |  68 ++++++
 .../ClusteringAlgorithms.cs                   | 164 ++++++++++++++
 .../DocumentLayoutAnalysis/Distances.cs       |  33 +++
 .../DocumentLayoutAnalysis/DocstrumBB.cs      | 212 ++++++++++++++++++
 .../DocumentLayoutAnalysis/IPageSegmenter.cs  |  19 ++
 .../NearestNeighbourWordExtractor .cs         | 115 +---------
 .../DocumentLayoutAnalysis/RecursiveXYCut.cs  |  42 +++-
 .../DocumentLayoutAnalysis/XYLeaf.cs          |   2 +-
 .../DocumentLayoutAnalysis/XYNode.cs          |   2 +-
 10 files changed, 544 insertions(+), 119 deletions(-)
 create mode 100644 src/UglyToad.PdfPig/Content/TextBlock.cs
 create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
 create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
 create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs
diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 865a6240..273f59f7 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -54,17 +54,19 @@
                 "UglyToad.PdfPig.Content.PageSize",
                 "UglyToad.PdfPig.Content.Word",
                 "UglyToad.PdfPig.Content.TextLine",
+                "UglyToad.PdfPig.Content.TextBlock",
                 "UglyToad.PdfPig.Content.TextDirection",
                 "UglyToad.PdfPig.Core.TransformationMatrix",
                 "UglyToad.PdfPig.CrossReference.CrossReferenceTable",
                 "UglyToad.PdfPig.CrossReference.CrossReferenceType",
                 "UglyToad.PdfPig.CrossReference.TrailerDictionary",
+                "UglyToad.PdfPig.DocumentLayoutAnalysis.ClusteringAlgorithms",
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
+                "UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBB",
+                "UglyToad.PdfPig.DocumentLayoutAnalysis.IPageSegmenter",
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.NearestNeighbourWordExtractor",
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
-                "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
-                "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
                 "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
diff --git a/src/UglyToad.PdfPig/Content/TextBlock.cs b/src/UglyToad.PdfPig/Content/TextBlock.cs
new file mode 100644
index 00000000..85c10b63
--- /dev/null
+++ b/src/UglyToad.PdfPig/Content/TextBlock.cs
@@ -0,0 +1,68 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.Content
+{
+    /// <summary>
+    /// A block of text, made of one or more text lines.
+    /// </summary>
+    public class TextBlock
+    {
+        /// <summary>
+        /// The text of the block: the text of each line, joined by a single space.
+        /// </summary>
+        public string Text { get; }
+
+        /// <summary>
+        /// The text direction of the block, taken from its first line.
+        /// </summary>
+        public TextDirection TextDirection { get; }
+
+        /// <summary>
+        /// The rectangle completely containing the block.
+        /// </summary>
+        public PdfRectangle BoundingBox { get; }
+
+        /// <summary>
+        /// The text lines contained in the block.
+        /// </summary>
+        public IReadOnlyList<TextLine> TextLines { get; }
+
+        /// <summary>
+        /// Create a new <see cref="TextBlock"/>.
+        /// </summary>
+        /// <param name="lines">The lines of the block; must be non-null and contain at least one line.</param>
+        public TextBlock(IReadOnlyList<TextLine> lines)
+        {
+            if (lines == null)
+            {
+                throw new ArgumentNullException(nameof(lines));
+            }
+
+            if (lines.Count == 0)
+            {
+                throw new ArgumentException("Empty lines provided.", nameof(lines));
+            }
+
+            TextLines = lines;
+
+            Text = string.Join(" ", lines.Select(x => x.Text));
+            // The bounding box is built from the extreme Left/Bottom/Right/Top of the lines' bounding boxes.
+            var minX = lines.Min(x => x.BoundingBox.Left);
+            var minY = lines.Min(x => x.BoundingBox.Bottom);
+            var maxX = lines.Max(x => x.BoundingBox.Right);
+            var maxY = lines.Max(x => x.BoundingBox.Top);
+            BoundingBox = new PdfRectangle(minX, minY, maxX, maxY);
+            // The first line's direction is used for the whole block.
+            TextDirection = lines[0].TextDirection;
+        }
+
+        /// <inheritdoc />
+        public override string ToString()
+        {
+            return Text;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
new file mode 100644
index 00000000..be7a8299
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
@@ -0,0 +1,164 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Clustering algorithms shared by the document layout analysis classes.
+    /// </summary>
+    public class ClusteringAlgorithms
+    {
+        /// <summary>
+        /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
+        /// Returns one set of element indexes per group. https://en.wikipedia.org/wiki/Transitive_closure
+        /// </summary>
+        /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
+        /// <param name="elements">Array of elements to group.</param>
+        /// <param name="distMeasure">The distance measure between two points.</param>
+        /// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
+        /// <param name="pivotPoint">The pivot's point to use.</param>
+        /// <param name="candidatesPoint">The candidates to pair point to use.</param>
+        /// <param name="filterPivot">Filter to apply to the pivot point.</param>
+        /// <param name="filterFinal">Filter to apply to both the pivot and the paired point.</param>
+        internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
+            Func<PdfPoint, PdfPoint, double> distMeasure,
+            Func<T, T, double> maxDistanceFunction,
+            Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
+            Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
+        {
+            /*************************************************************************************
+             * Algorithm steps
+             * 1. Find nearest neighbours indexes (done in parallel)
+             *  Iterate every point (pivot) and put its nearest neighbour's index in an array
+             *  e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
+             *  Only considers a neighbour if it is within the maximum distance.
+             *  If not within the maximum distance, index will be set to -1.
+             *  NB: Given the possible asymmetry in the relationship, it is possible
+             *  that if indexes[i] = j then indexes[j] != i.
+             *
+             * 2. Group indexes
+             *  Group indexes if share neighbours in common - Transitive closure
+             *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
+             *  (i,j,k) will form a group and (m,n) will form another group.
+             *
+             * 3. Merge groups that have indexes in common - If any
+             *  If there are group with indexes in common, merge them.
+             *  (Could be improved and put in step 2)
+             *************************************************************************************/
+
+            int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();
+            var candidatesPoints = elements.Select(x => candidatesPoint(x)).ToList();
+
+            // 1. Find nearest neighbours indexes
+            Parallel.For(0, elements.Length, e =>
+            {
+                var pivot = elements[e];
+
+                if (filterPivot(pivot))
+                {
+                    int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);
+                    if (index < 0) return; // defensive: no neighbour reported, so this pivot stays unpaired
+                    var paired = elements[index];
+                    if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
+                    {
+                        indexes[e] = index;
+                    }
+                }
+            });
+
+            // 2. Group indexes
+            List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
+            HashSet<int> indexDone = new HashSet<int>();
+            HashSet<int> pairedTargets = new HashSet<int>(indexes); // every index that is some element's neighbour
+            for (int e = 0; e < elements.Length; e++)
+            {
+                int index = indexes[e];
+
+                if (index == -1) // This element is not connected
+                {
+                    // Check if another element index is connected to this element (nb: distance measure is asymmetric)
+                    if (!pairedTargets.Contains(e))
+                    {
+                        // If no other element is connected to this element, add it as a standalone element
+                        groupedIndexes.Add(new HashSet<int>() { e });
+                        indexDone.Add(e);
+                    }
+                    continue;
+                }
+
+                bool isDoneC = indexDone.Contains(e);
+                bool isDoneI = indexDone.Contains(index);
+                if (isDoneC || isDoneI)
+                {
+                    if (isDoneC && !isDoneI)
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(e)))
+                        {
+                            pair.Add(index);
+                        }
+                        indexDone.Add(index);
+                    }
+                    else if (!isDoneC && isDoneI)
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(index)))
+                        {
+                            pair.Add(e);
+                        }
+                        indexDone.Add(e);
+                    }
+                    else // isDoneC && isDoneI
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(index)))
+                        {
+                            pair.Add(e); // HashSet.Add ignores values that are already present
+                        }
+
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(e)))
+                        {
+                            pair.Add(index);
+                        }
+                    }
+                }
+                else
+                {
+                    groupedIndexes.Add(new HashSet<int>() { e, index });
+                    indexDone.Add(e);
+                    indexDone.Add(index);
+                }
+            }
+
+            // Check that all elements are done
+            if (elements.Length != indexDone.Count)
+            {
+                throw new InvalidOperationException("ClusteringAlgorithms.SimpleTransitiveClosure(): Some elements were not done.");
+            }
+
+            // 3. Merge groups that have indexes in common
+            // Check if duplicates (if duplicates, then same index in different groups)
+            if (indexDone.Count != groupedIndexes.Sum(x => x.Count))
+            {
+                for (int e = 0; e < elements.Length; e++)
+                {
+                    List<HashSet<int>> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList();
+                    int count = candidates.Count;
+                    if (count < 2) continue; // Only one group with this index
+
+                    HashSet<int> merged = candidates[0];
+                    groupedIndexes.Remove(merged);
+                    for (int i = 1; i < count; i++)
+                    {
+                        var current = candidates[i];
+                        merged.UnionWith(current);
+                        groupedIndexes.Remove(current);
+                    }
+                    groupedIndexes.Add(merged);
+                }
+            }
+
+            return groupedIndexes;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
index 2b06eea9..8921392f 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
@@ -47,6 +47,39 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
             return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y));
         }
 
+        /// <summary>
+        /// The angle in degrees between the horizontal axis and the line between two points.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        /// <returns>The angle of the vector from point1 to point2, in degrees, between -180 and 180.</returns>
+        public static double Angle(PdfPoint point1, PdfPoint point2)
+        {
+            return Math.Atan2((double)(point2.Y - point1.Y), (double)(point2.X - point1.X)) * 180.0 / Math.PI; // double, not float: no precision loss
+        }
+
+        /// <summary>
+        /// The absolute distance between the Y coordinates of two points.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        /// <returns>The non-negative difference between point2.Y and point1.Y.</returns>
+        public static double Vertical(PdfPoint point1, PdfPoint point2)
+        {
+            return Math.Abs((double)(point2.Y - point1.Y));
+        }
+
+        /// <summary>
+        /// The absolute distance between the X coordinates of two points.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        /// <returns>The non-negative difference between point2.X and point1.X.</returns>
+        public static double Horizontal(PdfPoint point1, PdfPoint point2)
+        {
+            return Math.Abs((double)(point2.X - point1.X));
+        }
+
         /// <summary>
         /// Find the nearest point.
         /// </summary>
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
new file mode 100644
index 00000000..3ced0778
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
@@ -0,0 +1,212 @@
+﻿using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// The Docstrum algorithm is a bottom-up page segmentation technique based on nearest-neighborhood 
+    /// clustering of connected components extracted from the document. 
+    /// This implementation leverages bounding boxes and does not exactly replicate the original algorithm.
+    /// <para>See 'The document spectrum for page layout analysis.' by L. O’Gorman.</para>
+    /// </summary>
+    public class DocstrumBB : IPageSegmenter
+    {
+        /// <summary>
+        /// The shared instance of the Docstrum for bounding boxes page segmenter, <see cref="DocstrumBB"/>.
+        /// </summary>
+        public static DocstrumBB Instance { get; } = new DocstrumBB();
+
+        /// <summary>
+        /// Get the blocks.
+        /// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMultiplier = 1.3.</para>
+        /// </summary>
+        /// <param name="pageWords">The words in the page.</param>
+        /// <returns>The text blocks found using the default parameters above.</returns>
+        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
+        {
+            return GetBlocks(pageWords, -30, 30, -135, -45, 1.3);
+        }
+
+        /// <summary>
+        /// Get the blocks. See original paper for more information.
+        /// </summary>
+        /// <param name="pageWords">The words in the page; words made only of white space are ignored.</param>
+        /// <param name="wlAngleLB">Within-line lower bound angle.</param>
+        /// <param name="wlAngleUB">Within-line upper bound angle.</param>
+        /// <param name="blAngleLB">Between-line lower bound angle.</param>
+        /// <param name="blAngleUB">Between-line upper bound angle.</param>
+        /// <param name="blMultiplier">Multiplier that gives the maximum perpendicular distance between
+        /// text lines for blocking. Maximum distance will be this number times the between-line
+        /// distance found by the analysis.</param>
+        /// <returns>The text blocks found; an empty list when there is no non-white-space word.</returns>
+        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double wlAngleLB, double wlAngleUB,
+            double blAngleLB, double blAngleUB, double blMultiplier)
+        {
+            var pageWordsArr = pageWords.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToArray(); // remove white spaces
+            if (pageWordsArr.Length == 0) return new List<TextBlock>(); // nothing to segment
+            var withinLineDistList = new ConcurrentBag<double[]>();
+            var betweenLineDistList = new ConcurrentBag<double[]>();
+
+            // 1. Estimate in line and between line spacing
+            Parallel.For(0, pageWordsArr.Length, i =>
+            {
+                var word = pageWordsArr[i];
+
+                // Within-line distance
+                var pointWL = GetNearestPointData(pageWordsArr, word,
+                    bb => bb.BottomRight, bb => bb.BottomRight,
+                    bb => bb.BottomLeft, bb => bb.BottomLeft,
+                    wlAngleLB, wlAngleUB, Distances.Horizontal);
+                if (pointWL != null) withinLineDistList.Add(pointWL);
+
+                // Between-line distance
+                var pointBL = GetNearestPointData(pageWordsArr, word,
+                    bb => bb.BottomLeft, bb => bb.Centroid,
+                    bb => bb.TopLeft, bb => bb.Centroid,
+                    blAngleLB, blAngleUB, Distances.Vertical);
+                if (pointBL != null) betweenLineDistList.Add(pointBL);
+            });
+            // A bag is empty when no word had a neighbour inside the angle bounds; 0 then stands for "no estimate".
+            double withinLineDistance = withinLineDistList.IsEmpty ? 0 : GetPeakAverageDistance(withinLineDistList);
+            double betweenLineDistance = betweenLineDistList.IsEmpty ? 0 : GetPeakAverageDistance(betweenLineDistList);
+
+            // 2. Find lines of text (without a between-line estimate only the within-line bound applies)
+            double maxDistWL = betweenLineDistance > 0 ? Math.Min(3 * withinLineDistance, Math.Sqrt(2) * betweenLineDistance) : 3 * withinLineDistance;
+            var lines = GetLines(pageWordsArr, maxDistWL).ToArray();
+
+            // 3. Find blocks of text
+            double maxDistBL = blMultiplier * betweenLineDistance;
+            return GetLinesGroups(lines, maxDistBL).ToList();
+        }
+
+        /// <summary>
+        /// Get information on the nearest point, filtered for angle.
+        /// </summary>
+        /// <param name="words">The candidate words to search; the pivot itself is excluded.</param>
+        /// <param name="pivot">The word whose nearest neighbour is sought.</param>
+        /// <param name="funcPivotDist">Selects the point of the pivot's bounding box used to measure the distance.</param>
+        /// <param name="funcPivotAngle">Selects the point of the pivot's bounding box used to measure the angle.</param>
+        /// <param name="funcPointsDist">Selects the point of a candidate's bounding box used to measure the distance.</param>
+        /// <param name="funcPointsAngle">Selects the point of a candidate's bounding box used to measure the angle.</param>
+        /// <param name="angleStart">Inclusive lower bound, in degrees, of the accepted pivot-to-candidate angle.</param>
+        /// <param name="angleEnd">Inclusive upper bound, in degrees, of the accepted pivot-to-candidate angle.</param>
+        /// <param name="finalDistMEasure">The measure used for the reported distance (the nearest candidate itself is picked with the Euclidean distance).</param>
+        /// <returns>An array { mode of the pivot letters' font size, distance to the match }, or null when there is no match.</returns>
+        private double[] GetNearestPointData(Word[] words, Word pivot, Func<PdfRectangle,
+            PdfPoint> funcPivotDist, Func<PdfRectangle, PdfPoint> funcPivotAngle,
+            Func<PdfRectangle, PdfPoint> funcPointsDist, Func<PdfRectangle, PdfPoint> funcPointsAngle,
+            double angleStart, double angleEnd,
+            Func<PdfPoint, PdfPoint, double> finalDistMEasure)
+        {
+            var pointR = funcPivotDist(pivot.BoundingBox);
+            var filtered = words.Where(w =>
+            {
+                var angleWL = Distances.Angle(funcPivotAngle(pivot.BoundingBox), funcPointsAngle(w.BoundingBox));
+                return (angleWL >= angleStart && angleWL <= angleEnd);
+            }).ToList();
+            filtered.Remove(pivot); // remove itself
+
+            if (filtered.Count > 0)
+            {
+                int index = pointR.FindIndexNearest(
+                    filtered.Select(w => funcPointsDist(w.BoundingBox)).ToList(),
+                    Distances.Euclidean, out double distWL);
+
+                if (index >= 0)
+                {
+                    var matchWL = filtered[index];
+                    return new double[]
+                    {
+                        (double)pivot.Letters.Select(l => l.FontSize).Mode(),
+                        finalDistMEasure(pointR, funcPointsDist(matchWL.BoundingBox))
+                    };
+                }
+            }
+            return null;
+        }
+
+        /// <summary>
+        /// Build lines via transitive closure. NOTE(review): the accepted angle is fixed at -30 to 30 degrees here.
+        /// </summary>
+        /// <param name="words">The words to group into lines; an empty array yields no line.</param>
+        /// <param name="maxDist">The maximum Euclidean distance between a word's bottom right and the next word's bottom left.</param>
+        /// <returns>One line per group of words, ordered according to the text direction of the first word.</returns>
+        private IEnumerable<TextLine> GetLines(Word[] words, double maxDist)
+        {
+            if (words.Length == 0) yield break; // no words means no lines; words[0] below would be out of range
+            TextDirection textDirection = words[0].TextDirection;
+            var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean,
+                (w1, w2) => maxDist,
+                w => w.BoundingBox.BottomRight, w => w.BoundingBox.BottomLeft, w => true,
+                (w1, w2) =>
+                {
+                    var angleWL = Distances.Angle(w1.BoundingBox.BottomRight, w2.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle
+                    return (angleWL >= -30 && angleWL <= 30);
+                }).ToList();
+
+            Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
+            if (textDirection == TextDirection.Rotate180)
+            {
+                orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
+            }
+            else if (textDirection == TextDirection.Rotate90)
+            {
+                orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
+            }
+            else if (textDirection == TextDirection.Rotate270)
+            {
+                orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
+            }
+
+            foreach (var group in groupedIndexes)
+            {
+                yield return new TextLine(orderFunc(group.Select(i => words[i])));
+            }
+        }
+
+        /// <summary>
+        /// Group text lines into blocks via transitive closure.
+        /// </summary>
+        /// <param name="lines">The text lines to group.</param>
+        /// <param name="maxDist">The maximum Euclidean distance between a line's top left and its neighbour's bottom left.</param>
+        /// <returns>One block per group of lines.</returns>
+        private IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
+        {
+            List<HashSet<int>> clusters = ClusteringAlgorithms.SimpleTransitiveClosure(lines, Distances.Euclidean,
+                (first, second) => maxDist,
+                line => line.BoundingBox.TopLeft, line => line.BoundingBox.BottomLeft,
+                line => true, (first, second) => true).ToList();
+
+            foreach (HashSet<int> cluster in clusters)
+            {
+                yield return new TextBlock(cluster.Select(index => lines[index]).ToList());
+            }
+        }
+
+        /// <summary>
+        /// Get the average distance value of the peak (most populated) bucket of a histogram with buckets of size 1.
+        /// </summary>
+        /// <param name="values">array[0]=font size, array[1]=distance (non-negative). Must not be empty.</param>
+        /// <returns>The mean of the distances falling in the peak bucket; ties go to the lowest bucket.</returns>
+        private double GetPeakAverageDistance(IEnumerable<double[]> values)
+        {
+            // Read the distances once instead of re-enumerating the source for every bucket.
+            var distances = values.Select(x => x[1]).ToArray();
+
+            // Bucket i covers (i, i + 1]; a distance of exactly 0 is folded into bucket 0 rather than
+            // being left out of every bucket, which made the final average fail when all distances were 0.
+            var peakBucket = distances
+                .GroupBy(d => d > 0 ? (int)Math.Ceiling(d) - 1 : 0)
+                .OrderByDescending(g => g.Count())
+                .ThenBy(g => g.Key)
+                .First();
+
+            return peakBucket.Average();
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs
new file mode 100644
index 00000000..27511786
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs
@@ -0,0 +1,19 @@
+﻿using System.Collections.Generic;
+using UglyToad.PdfPig.Content;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Page segmentation divides a page into areas, each consisting of a layout structure (blocks, lines, etc.).
+    /// <para>See 'Performance Comparison of Six Algorithms for Page Segmentation' by Faisal Shafait, Daniel Keysers, and Thomas M. Breuel.</para>
+    /// </summary>
+    public interface IPageSegmenter
+    {
+        /// <summary>
+        /// Get the text blocks of a page from its words.
+        /// </summary>
+        /// <param name="pageWords">The words to generate text blocks for.</param>
+        /// <returns>A list of text blocks from this approach.</returns>
+        IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords);
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
index 34455cda..3efb19be 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs	
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs	
@@ -1,7 +1,6 @@
 ﻿using System;
 using System.Collections.Generic;
 using System.Linq;
-using System.Threading.Tasks;
 using UglyToad.PdfPig.Content;
 using UglyToad.PdfPig.Geometry;
 using UglyToad.PdfPig.Util;
@@ -71,7 +70,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
         /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
         /// <param name="distMeasure">The distance measure between two start and end base line points,
         /// e.g. the Manhattan distance.</param>
-        private static List<Word> GetWords(IEnumerable<Letter> pageLetters,
+        private List<Word> GetWords(IEnumerable<Letter> pageLetters,
             Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
         {
             if (pageLetters == null || pageLetters.Count() == 0) return new List<Word>();
@@ -97,116 +96,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
             }
 
             Letter[] letters = pageLetters.ToArray();
-            int lettersCount = letters.Length;
-            List<PdfPoint> startBaseLines = letters.Select(x => x.StartBaseLine).ToList();
 
-            int[] indexes = Enumerable.Repeat((int)-1, lettersCount).ToArray();
-
-            // Find nearest neighbours indexes
-            Parallel.For(0, lettersCount, c =>
-            {
-                var currentLetter = letters[c];
-                // only check neighbours if not a white space
-                if (!string.IsNullOrWhiteSpace(currentLetter.Value))
-                {
-                    int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist);
-                    var pairedLetter = letters[index];
-
-                    if (!string.IsNullOrWhiteSpace(pairedLetter.Value) &&
-                        string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase))
-                    {
-                        decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * 0.60m;
-                        if ((decimal)dist < minDist)
-                        {
-                            indexes[c] = index;
-                        }
-                    }
-                }
-            });
-
-            // Group indexes
-            List<List<int>> groupedIndexes = new List<List<int>>();
-            List<int> indexDone = new List<int>();
-            for (int c = 0; c < lettersCount; c++)
-            {
-                int i = indexes[c];
-                if (i == -1) continue;
-
-                bool isDoneC = indexDone.Contains(c);
-                bool isDoneI = indexDone.Contains(i);
-                if (isDoneC || isDoneI)
-                {
-                    if (isDoneC && !isDoneI)
-                    {
-                        foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
-                        {
-                            pair.Add(i);
-                        }
-                        indexDone.Add(i);
-                    }
-                    else if (!isDoneC && isDoneI)
-                    {
-                        foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
-                        {
-                            pair.Add(c);
-                        }
-                        indexDone.Add(c);
-                    }
-                    else
-                    {
-                        foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
-                        {
-                            if (!pair.Contains(c)) pair.Add(c);
-                        }
-
-                        foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
-                        {
-                            if (!pair.Contains(i)) pair.Add(i);
-                        }
-                    }
-                }
-                else
-                {
-                    List<int> pair = new List<int>() { c, i };
-                    groupedIndexes.Add(pair);
-                    indexDone.AddRange(pair);
-                }
-            }
-
-            // Merge lists with common index 
-            for (int c = 0; c < lettersCount; c++)
-            {
-                List<List<int>> candidates = groupedIndexes.Where(x => x.Any(t => t == c)).ToList();
-                if (candidates.Count < 2) continue; // only one group with this index
-
-                List<int> merged = candidates.First();
-                groupedIndexes.Remove(merged);
-                for (int i = 1; i < candidates.Count; i++)
-                {
-                    var current = candidates[i];
-                    merged = merged.Union(current).ToList();
-                    groupedIndexes.Remove(current);
-                }
-                groupedIndexes.Add(merged);
-            }
+            var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(letters,
+                distMeasure,
+                (l1, l2) => Math.Max((double)metric(l1), (double)metric(l2)) * 0.60,
+                l => l.EndBaseLine, l => l.StartBaseLine,
+                l => !string.IsNullOrWhiteSpace(l.Value),
+                (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value)).ToList();
 
             List<Word> words = new List<Word>();
             for (int a = 0; a < groupedIndexes.Count(); a++)
             {
-                List<Letter> groupedLetters = new List<Letter>();
-                foreach (int s in groupedIndexes[a])
-                {
-                    groupedLetters.Add(letters[s]);
-                }
-
-                words.Add(new Word(orderFunc(groupedLetters)));
-            }
-
-            List<int> indexesNotDone = Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)).ToList();
-            for (int n = 0; n < indexesNotDone.Count(); n++)
-            {
-                Letter letter = letters[indexesNotDone[n]];
-                words.Add(new Word(new Letter[] { letter }));
+                words.Add(new Word(orderFunc(groupedIndexes[a].Select(i => letters[i]))));
             }
 
             return words;
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
index ecaa610b..16258f1f 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
@@ -11,14 +11,31 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
     /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
     /// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
     /// </summary>
-    public static class RecursiveXYCut
+    public class RecursiveXYCut : IPageSegmenter
     {
+        /// <summary>
+        /// Gets the shared instance of the Recursive X-Y Cut page segmenter, <see cref="RecursiveXYCut"/>.
+        /// </summary>
+        public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();
+
         /// <summary>
         /// Get the blocks.
+        /// <para>Uses 'minimumWidth' = 0, 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)</para>
+        /// </summary>
+        /// <param name="pageWords">The words in the page.</param>
+        /// <returns></returns>
+        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
+        {
+            return GetBlocks(pageWords, 0);
+        }
+
+        /// <summary>
+        /// Get the blocks.
+        /// <para>Uses 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)</para>
         /// </summary>
         /// <param name="pageWords">The words in the page.</param>
         /// <param name="minimumWidth">The minimum width for a block.</param>
-        public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth = 0)
+        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth)
         {
             return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5m, 3));
         }
@@ -30,7 +47,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
         /// <param name="minimumWidth">The minimum width for a block.</param>
         /// <param name="dominantFontWidth">The dominant font width.</param>
         /// <param name="dominantFontHeight">The dominant font height.</param>
-        public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
+        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
             decimal dominantFontWidth, decimal dominantFontHeight)
         {
             return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight);
@@ -43,15 +60,24 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
         /// <param name="minimumWidth">The minimum width for a block.</param>
         /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
         /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
-        public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
+        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
             Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
             Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
         {
-            var root = new XYLeaf(pageWords); // Create a root node.
-            return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
+            XYLeaf root = new XYLeaf(pageWords); // Create a root node.
+            XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
+
+            var leafs = node.GetLeafs();
+
+            if (leafs.Count > 0)
+            {
+                return leafs.Select(l => new TextBlock(l.GetLines())).ToList();
+            }
+
+            return new List<TextBlock>();
         }
 
-        private static XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth,
+        private XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth,
             Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
             Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
         {
@@ -144,7 +170,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
             return new XYNode(newNodes);
         }
 
-        private static XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth,
+        private XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth,
             Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
             Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
         {
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs
index 9dab8daf..a5970693 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs
@@ -9,7 +9,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
     /// <summary>
     /// A Leaf node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
     /// </summary>
-    public class XYLeaf : XYNode
+    internal class XYLeaf : XYNode
     {
         /// <summary>
         /// Returns true if this node is a leaf, false otherwise.
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs
index db423c55..9bac97fb 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs
@@ -8,7 +8,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
     /// <summary>
     /// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
     /// </summary>
-    public class XYNode
+    internal class XYNode
     {
         /// <summary>
         /// Returns true if this node is a leaf, false otherwise.

From c14d77e414e115c0f8b7f30efb156f07d675aab4 Mon Sep 17 00:00:00 2001
From: BobLd <BobLd@email>
Date: Sat, 10 Aug 2019 16:36:50 +0100
Subject: [PATCH 2/4] PublicApiScannerTests updated

---
 src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 273f59f7..b0de7ab1 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -60,7 +60,6 @@
                 "UglyToad.PdfPig.CrossReference.CrossReferenceTable",
                 "UglyToad.PdfPig.CrossReference.CrossReferenceType",
                 "UglyToad.PdfPig.CrossReference.TrailerDictionary",
-                "UglyToad.PdfPig.DocumentLayoutAnalysis.ClusteringAlgorithms",
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBB",
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.IPageSegmenter",

From 7e8b3bdc854eb56be34ba4f4432a4d279a9ac337 Mon Sep 17 00:00:00 2001
From: BobLd <BobLd@email>
Date: Sun, 11 Aug 2019 13:45:08 +0100
Subject: [PATCH 3/4] Update DocstrumBB to account for middle point of the
 overlapping area distance. For this, using distance between 2 lines.

---
 .../ClusteringAlgorithms.cs                   |  99 +++++++++++++++--
 .../DocumentLayoutAnalysis/Distances.cs       |  40 ++++++-
 .../DocumentLayoutAnalysis/DocstrumBB.cs      | 101 +++++++++++++++---
 3 files changed, 215 insertions(+), 25 deletions(-)

diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
index be7a8299..4e9e6182 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
@@ -18,11 +18,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
         /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
         /// <param name="elements">Array of elements to group.</param>
         /// <param name="distMeasure">The distance measure between two points.</param>
-        /// <param name="maxDistanceFunction">The function that determines the distance between to points in the same cluster.</param>
-        /// <param name="pivotPoint">The pivot's point to use.</param>
-        /// <param name="candidatesPoint">The candidates to pair point to use.</param>
-        /// <param name="filterPivot">Filter to apply to the pivot point.</param>
-        /// <param name="filterFinal">Filter to apply to both the pivot and the paired point.</param>
+        /// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
+        /// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
+        /// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
+        /// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
+        /// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
         internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
             Func<PdfPoint, PdfPoint, double> distMeasure,
             Func<T, T, double> maxDistanceFunction,
@@ -69,17 +69,97 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
                 }
             });
 
+            // 2. Group indexes
+            // 3. Merge groups that have indexes in common
+            var groupedIndexes = GroupMergeIndexes(indexes);
+
+            return groupedIndexes;
+        }
+
+        /// <summary>
+        /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
+        /// https://en.wikipedia.org/wiki/Transitive_closure
+        /// </summary>
+        /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
+        /// <param name="elements">Array of elements to group.</param>
+        /// <param name="distMeasure">The distance measure between two lines.</param>
+        /// <param name="maxDistanceFunction">The function that determines the maximum distance between two elements in the same cluster.</param>
+        /// <param name="pivotLine">The pivot's line to use for pairing.</param>
+        /// <param name="candidatesLine">The candidates' line to use for pairing.</param>
+        /// <param name="filterPivot">Filter to apply to the pivot element. If false, the element will not be paired at all, e.g. is white space.</param>
+        /// <param name="filterFinal">Filter to apply to both the pivot and the paired element. If false, the pair is rejected, e.g. pivot and paired element must have the same font.</param>
+        /// <returns>The groups of element indexes produced by <see cref="GroupMergeIndexes"/>.</returns>
+        internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
+            Func<PdfLine, PdfLine, double> distMeasure,
+            Func<T, T, double> maxDistanceFunction,
+            Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
+            Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
+        {
+            /*************************************************************************************
+             * Algorithm steps
+             * 1. Find nearest neighbours indexes (done in parallel)
+             *  Iterate every line (pivot) and put its nearest neighbour's index in an array
+             *  e.g. if nearest neighbour of line i is line j, then indexes[i] = j.
+             *  Only considers a neighbour if it is within the maximum distance.
+             *  If not within the maximum distance (or no neighbour is found), index stays -1.
+             *  NB: Given the possible asymmetry in the relationship, it is possible
+             *  that if indexes[i] = j then indexes[j] != i.
+             *
+             * 2. Group indexes
+             *  Group indexes if share neighbours in common - Transitive closure
+             *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
+             *  (i,j,k) will form a group and (m,n) will form another group.
+             *
+             * 3. Merge groups that have indexes in common - If any
+             *  If there are groups with indexes in common, merge them.
+             *  (Could be improved and put in step 2)
+             *************************************************************************************/
+
+            int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
+            var candidatesLines = elements.Select(x => candidatesLine(x)).ToList();
+
+            // 1. Find nearest neighbours indexes (each iteration only writes its own slot of 'indexes')
+            Parallel.For(0, elements.Length, e =>
+            {
+                var pivot = elements[e];
+
+                if (filterPivot(pivot))
+                {
+                    int index = pivotLine(pivot).FindIndexNearest(candidatesLines, distMeasure, out double dist);
+
+                    // FindIndexNearest returns -1 when no candidate is closer than double.MaxValue: leave unpaired.
+                    if (index != -1 && filterFinal(pivot, elements[index]) && dist < maxDistanceFunction(pivot, elements[index]))
+                    {
+                        indexes[e] = index;
+                    }
+                }
+            });
+
+            // 2. Group indexes, then 3. merge groups that have indexes in common
+            var groupedIndexes = GroupMergeIndexes(indexes);
+
+            return groupedIndexes;
+        }
+
+        /// <summary>
+        /// Group elements via transitive closure.
+        /// https://en.wikipedia.org/wiki/Transitive_closure
+        /// </summary>
+        /// <param name="indexes">Array of paired elements index.</param>
+        /// <returns></returns>
+        internal static List<HashSet<int>> GroupMergeIndexes(int[] indexes)
+        {
             // 2. Group indexes
             List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
             HashSet<int> indexDone = new HashSet<int>();
 
-            for (int e = 0; e < elements.Length; e++)
+            for (int e = 0; e < indexes.Length; e++)
             {
                 int index = indexes[e];
 
                 if (index == -1) // This element is not connected
                 {
-                    // Check if another element index is connected to this element (nb: distance measure is asymetric)
+                    // Check if another element's index is connected to this element (nb: distance measure is asymmetric)
                     if (!indexes.Contains(e))
                     {
                         // If no other element is connected to this element, add it as a standalone element
@@ -131,7 +211,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
             }
 
             // Check that all elements are done
-            if (elements.Length != indexDone.Count)
+            if (indexes.Length != indexDone.Count)
             {
                 throw new Exception("ClusteringAlgorithms.GetNNGroupedIndexes(): Some elements were not done.");
             }
@@ -140,7 +220,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
             // Check if duplicates (if duplicates, then same index in different groups)
             if (indexDone.Count != groupedIndexes.SelectMany(x => x).Count())
             {
-                for (int e = 0; e < elements.Length; e++)
+                for (int e = 0; e < indexes.Length; e++)
                 {
                     List<HashSet<int>> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList();
                     int count = candidates.Count();
@@ -157,7 +237,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
                     groupedIndexes.Add(merged);
                 }
             }
-
             return groupedIndexes;
         }
     }
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
index 8921392f..f099c175 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
@@ -86,7 +86,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
         /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
         /// <param name="points">The list of neighbours candidates.</param>
         /// <param name="distanceMeasure">The distance measure to use.</param>
-        /// <param name="distance">The distance between reference point, and its nearest neighbour</param>
+        /// <param name="distance">The distance between reference point, and its nearest neighbour.</param>
         public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
             Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
         {
@@ -122,7 +122,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
         /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
         /// <param name="points">The list of neighbours candidates.</param>
         /// <param name="distanceMeasure">The distance measure to use.</param>
-        /// <param name="distance">The distance between reference point, and its nearest neighbour</param>
+        /// <param name="distance">The distance between reference point, and its nearest neighbour.</param>
         public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
             Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
         {
@@ -151,5 +151,41 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
 
             return closestPointIndex;
         }
+
+        /// <summary>
+        /// Gets the position, within <paramref name="lines"/>, of the candidate closest to the reference line (-1 if none qualifies).
+        /// </summary>
+        /// <param name="pdfLine">The reference line whose nearest neighbour is searched.</param>
+        /// <param name="lines">The neighbour candidates; must not be null or empty.</param>
+        /// <param name="distanceMeasure">The function measuring the distance between a candidate and the reference line.</param>
+        /// <param name="distance">The smallest distance found; stays at double.MaxValue when no candidate is closer than that.</param>
+        public static int FindIndexNearest(this PdfLine pdfLine, IReadOnlyList<PdfLine> lines,
+            Func<PdfLine, PdfLine, double> distanceMeasure, out double distance)
+        {
+            if (lines == null || lines.Count == 0)
+            {
+                throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", nameof(lines));
+            }
+
+            if (distanceMeasure == null)
+            {
+                throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", nameof(distanceMeasure));
+            }
+
+            int nearestIndex = -1;
+            distance = double.MaxValue;
+
+            for (int candidate = 0; candidate < lines.Count; candidate++)
+            {
+                double candidateDistance = distanceMeasure(lines[candidate], pdfLine);
+                if (candidateDistance < distance)
+                {
+                    nearestIndex = candidate;
+                    distance = candidateDistance;
+                }
+            }
+
+            return nearestIndex;
+        }
     }
 }
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
index 3ced0778..43d083a5 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
@@ -77,11 +77,48 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
 
             // 2. Find lines of text
             double maxDistWL = Math.Min(3 * withinLineDistance, Math.Sqrt(2) * betweenLineDistance);
-            var lines = GetLines(pageWordsArr, maxDistWL).ToArray();
+            var lines = GetLines(pageWordsArr, maxDistWL, wlAngleLB, wlAngleUB).ToArray();
 
             // 3. Find blocks of text
             double maxDistBL = blMultiplier * betweenLineDistance;
-            return GetLinesGroups(lines, maxDistBL).ToList();
+            var blocks = GetLinesGroups(lines, maxDistBL).ToList();
+
+            // 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
+            for (int b = 0; b < blocks.Count; b++)
+            {
+                if (blocks[b] == null) continue; // already merged into another block
+
+                for (int c = 0; c < blocks.Count; c++)
+                {
+                    if (b == c) continue;
+                    if (blocks[c] == null) continue; // already merged into another block
+
+                    if (AreRectangleOverlapping(blocks[b].BoundingBox, blocks[c].BoundingBox))
+                    {
+                        // Merge block c into block b
+                        // 1. Merge all words
+                        var mergedWords = new List<Word>(blocks[b].TextLines.SelectMany(l => l.Words));
+                        mergedWords.AddRange(blocks[c].TextLines.SelectMany(l => l.Words));
+
+                        // 2. Rebuild lines, using max distance = double.MaxValue as we know all words will be in
+                        // the same block. Filtering will still be done based on angle (argument order: maxDist, LB, UB).
+                        var mergedLines = GetLines(mergedWords.ToArray(), double.MaxValue, wlAngleLB, wlAngleUB);
+                        blocks[b] = new TextBlock(mergedLines.ToList());
+
+                        // Mark block c as removed; null entries are filtered out afterwards
+                        blocks[c] = null;
+                    }
+                }
+            }
+
+            return blocks.Where(b => b != null).ToList();
+        }
+
+        private bool AreRectangleOverlapping(PdfRectangle rectangle1, PdfRectangle rectangle2)
+        {
+            bool disjointX = rectangle1.Left > rectangle2.Right || rectangle2.Left > rectangle1.Right; // side by side
+            bool disjointY = rectangle1.Top < rectangle2.Bottom || rectangle2.Top < rectangle1.Bottom; // one above the other
+            return !(disjointX || disjointY); // rectangles that only share an edge count as overlapping
+        }
 
         /// <summary>
@@ -104,6 +141,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
             Func<PdfPoint, PdfPoint, double> finalDistMEasure)
         {
             var pointR = funcPivotDist(pivot.BoundingBox);
+
+            // Filter by angle
             var filtered = words.Where(w =>
             {
                 var angleWL = Distances.Angle(funcPivotAngle(pivot.BoundingBox), funcPointsAngle(w.BoundingBox));
@@ -135,18 +174,27 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
         /// </summary>
         /// <param name="words"></param>
         /// <param name="maxDist"></param>
+        /// <param name="wlAngleLB"></param>
+        /// <param name="wlAngleUB"></param>
         /// <returns></returns>
-        private IEnumerable<TextLine> GetLines(Word[] words, double maxDist)
+        private IEnumerable<TextLine> GetLines(Word[] words, double maxDist, double wlAngleLB, double wlAngleUB)
         {
+            /***************************************************************************************************
+             * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not 
+             * work as the FindIndexNearest() function might pair the pivot with itself (the pivot's right point 
+             * (distance = width) is closer than other words' left point).
+             * -> Solution would be to find more than one nearest neighbours. Use KDTree?
+             ***************************************************************************************************/
+
             TextDirection textDirection = words[0].TextDirection;
             var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean,
-                (w1, w2) => maxDist,
-                w => w.BoundingBox.BottomRight, w => w.BoundingBox.BottomLeft,
-                w => true,
-                (w1, w2) =>
+                (pivot, candidate) => maxDist,
+                pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
+                pivot => true,
+                (pivot, candidate) =>
                 {
-                    var angleWL = Distances.Angle(w1.BoundingBox.BottomRight, w2.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle
-                    return (angleWL >= -30 && angleWL <= 30);
+                    var angleWL = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle
+                    return (angleWL >= wlAngleLB && angleWL <= wlAngleUB);
                 }).ToList();
 
             Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
@@ -177,10 +225,37 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
         /// <returns></returns>
         private IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
         {
-            var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines, Distances.Euclidean,
-                (l1, l2) => maxDist,
-                l => l.BoundingBox.TopLeft, l => l.BoundingBox.BottomLeft,
-                l => true, (l1, l2) => true).ToList();
+            /**************************************************************************************************
+             * We want to measure the distance between two lines using the following method:
+             *  We check if two lines are overlapping horizontally.
+             *  If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area.
+             *  We finally compute the Euclidean distance between these two middle points.
+             *  If the two lines are not overlapping, the distance is set to the max distance.
+             * 
+             * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'line Height', the algo won't 
+             * work as the FindIndexNearest() function will always pair the pivot with itself (the pivot's top
+             * point (distance = height) is closer than other lines' top point).
+             * -> Solution would be to find more than one nearest neighbours. Use KDTree?
+             **************************************************************************************************/
+
+            Func<PdfLine, PdfLine, double> euclidianOverlappingMiddleDistance = (l1, l2) =>
+            {
+                var left = Math.Max(l1.Point1.X, l2.Point1.X);
+                var d = (Math.Min(l1.Point2.X, l2.Point2.X) - left);
+
+                if (d < 0) return double.MaxValue; // not overlapping -> max distance
+
+                return Distances.Euclidean(
+                    new PdfPoint(left + d / 2, l1.Point1.Y),
+                    new PdfPoint(left + d / 2, l2.Point1.Y));
+            };
+
+            var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines, 
+                euclidianOverlappingMiddleDistance,
+                (pivot, candidate) => maxDist,
+                pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
+                candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
+                pivot => true, (pivot, candidate) => true).ToList();
 
             for (int a = 0; a < groupedIndexes.Count(); a++)
             {

From 9f13739addc80e2ea5becea23be9aeeea964b00e Mon Sep 17 00:00:00 2001
From: BobLd <BobLd@email>
Date: Sun, 11 Aug 2019 13:54:47 +0100
Subject: [PATCH 4/4] correcting typo

---
 src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
index 43d083a5..b2fca5c3 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
@@ -9,7 +9,7 @@ using UglyToad.PdfPig.Geometry;
 namespace UglyToad.PdfPig.DocumentLayoutAnalysis
 {
     /// <summary>
-    /// The Docstrum algorithm is a bottom-up page segmentation technique based on nearest-neighborhood 
+    /// The Docstrum algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood 
     /// clustering of connected components extracted from the document. 
     /// This implementation leverages bounding boxes and does not exactly replicates the original algorithm.
     /// <para>See 'The document spectrum for page layout analysis.' by L. O’Gorman.</para>