From eb9a9fd00ecb14409a132e0d20e07c4fbce24acc Mon Sep 17 00:00:00 2001 From: BobLd Date: Sat, 10 Aug 2019 16:01:27 +0100 Subject: [PATCH 1/4] Document Layout Analysis - IPageSegmenter, Docstrum - Create a TextBlock class - Creates IPageSegmenter - Add other useful distances: angle, etc. - Update RecursiveXYCut - With IPageSegmenter and TextBlock - Make XYNode and XYLeaf internal - Optimise (faster) NearestNeighbourWordExtractor and isolate the clustering algorithms for use outside of this class - Implement a Docstrum inspired page segmentation algorithm --- .../PublicApiScannerTests.cs | 6 +- src/UglyToad.PdfPig/Content/TextBlock.cs | 68 ++++++ .../ClusteringAlgorithms.cs | 164 ++++++++++++++ .../DocumentLayoutAnalysis/Distances.cs | 33 +++ .../DocumentLayoutAnalysis/DocstrumBB.cs | 212 ++++++++++++++++++ .../DocumentLayoutAnalysis/IPageSegmenter.cs | 19 ++ .../NearestNeighbourWordExtractor .cs | 115 +--------- .../DocumentLayoutAnalysis/RecursiveXYCut.cs | 42 +++- .../DocumentLayoutAnalysis/XYLeaf.cs | 2 +- .../DocumentLayoutAnalysis/XYNode.cs | 2 +- 10 files changed, 544 insertions(+), 119 deletions(-) create mode 100644 src/UglyToad.PdfPig/Content/TextBlock.cs create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 865a6240..273f59f7 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -54,17 +54,19 @@ "UglyToad.PdfPig.Content.PageSize", "UglyToad.PdfPig.Content.Word", "UglyToad.PdfPig.Content.TextLine", + "UglyToad.PdfPig.Content.TextBlock", "UglyToad.PdfPig.Content.TextDirection", "UglyToad.PdfPig.Core.TransformationMatrix", "UglyToad.PdfPig.CrossReference.CrossReferenceTable", 
"UglyToad.PdfPig.CrossReference.CrossReferenceType", "UglyToad.PdfPig.CrossReference.TrailerDictionary", + "UglyToad.PdfPig.DocumentLayoutAnalysis.ClusteringAlgorithms", "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances", + "UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBB", + "UglyToad.PdfPig.DocumentLayoutAnalysis.IPageSegmenter", "UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions", "UglyToad.PdfPig.DocumentLayoutAnalysis.NearestNeighbourWordExtractor", "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", - "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode", - "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf", "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor", "UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", diff --git a/src/UglyToad.PdfPig/Content/TextBlock.cs b/src/UglyToad.PdfPig/Content/TextBlock.cs new file mode 100644 index 00000000..85c10b63 --- /dev/null +++ b/src/UglyToad.PdfPig/Content/TextBlock.cs @@ -0,0 +1,68 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.Content +{ + /// + /// A block of text. + /// + public class TextBlock + { + /// + /// The text of the block. + /// + public string Text { get; } + + /// + /// The text direction of the block. + /// + public TextDirection TextDirection { get; } + + /// + /// The rectangle completely containing the block. + /// + public PdfRectangle BoundingBox { get; } + + /// + /// The text lines contained in the block. + /// + public IReadOnlyList TextLines { get; } + + /// + /// Create a new . 
+ /// + /// + public TextBlock(IReadOnlyList lines) + { + if (lines == null) + { + throw new ArgumentNullException(nameof(lines)); + } + + if (lines.Count == 0) + { + throw new ArgumentException("Empty lines provided.", nameof(lines)); + } + + TextLines = lines; + + Text = string.Join(" ", lines.Select(x => x.Text)); + + var minX = lines.Min(x => x.BoundingBox.Left); + var minY = lines.Min(x => x.BoundingBox.Bottom); + var maxX = lines.Max(x => x.BoundingBox.Right); + var maxY = lines.Max(x => x.BoundingBox.Top); + BoundingBox = new PdfRectangle(minX, minY, maxX, maxY); + + TextDirection = lines[0].TextDirection; + } + + /// + public override string ToString() + { + return Text; + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs new file mode 100644 index 00000000..be7a8299 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs @@ -0,0 +1,164 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Clustering Algorithms. + /// + internal class ClusteringAlgorithms + { + /// + /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance. + /// https://en.wikipedia.org/wiki/Transitive_closure + /// + /// Letter, Word, TextLine, etc. + /// Array of elements to group. + /// The distance measure between two points. + /// The function that determines the distance between to points in the same cluster. + /// The pivot's point to use. + /// The candidates to pair point to use. + /// Filter to apply to the pivot point. + /// Filter to apply to both the pivot and the paired point. 
+ internal static IEnumerable> SimpleTransitiveClosure(T[] elements, + Func distMeasure, + Func maxDistanceFunction, + Func pivotPoint, Func candidatesPoint, + Func filterPivot, Func filterFinal) + { + /************************************************************************************* + * Algorithm steps + * 1. Find nearest neighbours indexes (done in parallel) + * Iterate every point (pivot) and put its nearest neighbour's index in an array + * e.g. if nearest neighbour of point i is point j, then indexes[i] = j. + * Only conciders a neighbour if it is within the maximum distance. + * If not within the maximum distance, index will be set to -1. + * NB: Given the possible asymmetry in the relationship, it is possible + * that if indexes[i] = j then indexes[j] != i. + * + * 2. Group indexes + * Group indexes if share neighbours in common - Transitive closure + * e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1 + * (i,j,k) will form a group and (m,n) will form another group. + * + * 3. Merge groups that have indexes in common - If any + * If there are group with indexes in common, merge them. + * (Could be improved and put in step 2) + *************************************************************************************/ + + int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray(); + var candidatesPoints = elements.Select(x => candidatesPoint(x)).ToList(); + + // 1. Find nearest neighbours indexes + Parallel.For(0, elements.Length, e => + { + var pivot = elements[e]; + + if (filterPivot(pivot)) + { + int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist); + var paired = elements[index]; + + if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired)) + { + indexes[e] = index; + } + } + }); + + // 2. 
Group indexes + List> groupedIndexes = new List>(); + HashSet indexDone = new HashSet(); + + for (int e = 0; e < elements.Length; e++) + { + int index = indexes[e]; + + if (index == -1) // This element is not connected + { + // Check if another element index is connected to this element (nb: distance measure is asymetric) + if (!indexes.Contains(e)) + { + // If no other element is connected to this element, add it as a standalone element + groupedIndexes.Add(new HashSet() { e }); + indexDone.Add(e); + } + continue; + } + + bool isDoneC = indexDone.Contains(e); + bool isDoneI = indexDone.Contains(index); + if (isDoneC || isDoneI) + { + if (isDoneC && !isDoneI) + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(e))) + { + pair.Add(index); + } + indexDone.Add(index); + } + else if (!isDoneC && isDoneI) + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(index))) + { + pair.Add(e); + } + indexDone.Add(e); + } + else // isDoneC && isDoneI + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(index))) + { + if (!pair.Contains(e)) pair.Add(e); + } + + foreach (var pair in groupedIndexes.Where(x => x.Contains(e))) + { + if (!pair.Contains(index)) pair.Add(index); + } + } + } + else + { + groupedIndexes.Add(new HashSet() { e, index }); + indexDone.Add(e); + indexDone.Add(index); + } + } + + // Check that all elements are done + if (elements.Length != indexDone.Count) + { + throw new Exception("ClusteringAlgorithms.GetNNGroupedIndexes(): Some elements were not done."); + } + + // 3. 
Merge groups that have indexes in common + // Check if duplicates (if duplicates, then same index in different groups) + if (indexDone.Count != groupedIndexes.SelectMany(x => x).Count()) + { + for (int e = 0; e < elements.Length; e++) + { + List> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList(); + int count = candidates.Count(); + if (count < 2) continue; // Only one group with this index + + HashSet merged = candidates.First(); + groupedIndexes.Remove(merged); + for (int i = 1; i < count; i++) + { + var current = candidates.ElementAt(i); + merged.UnionWith(current); + groupedIndexes.Remove(current); + } + groupedIndexes.Add(merged); + } + } + + return groupedIndexes; + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs index 2b06eea9..8921392f 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs @@ -47,6 +47,39 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y)); } + /// + /// The angle in degrees between the horizontal axis and the line between two points. + /// + /// The first point. + /// The second point. + /// + public static double Angle(PdfPoint point1, PdfPoint point2) + { + return Math.Atan2((float)(point2.Y - point1.Y), (float)(point2.X - point1.X)) * 180.0 / Math.PI; + } + + /// + /// The absolute distance between the Y coordinates of two points. + /// + /// The first point. + /// The second point. + /// + public static double Vertical(PdfPoint point1, PdfPoint point2) + { + return Math.Abs((double)(point2.Y - point1.Y)); + } + + /// + /// The absolute distance between the X coordinates of two points. + /// + /// The first point. + /// The second point. 
+ /// + public static double Horizontal(PdfPoint point1, PdfPoint point2) + { + return Math.Abs((double)(point2.X - point1.X)); + } + /// /// Find the nearest point. /// diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs new file mode 100644 index 00000000..3ced0778 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs @@ -0,0 +1,212 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// The Docstrum algorithm is a bottom-up page segmentation technique based on nearest-neighborhood + /// clustering of connected components extracted from the document. + /// This implementation leverages bounding boxes and does not exactly replicates the original algorithm. + /// See 'The document spectrum for page layout analysis.' by L. O’Gorman. + /// + public class DocstrumBB : IPageSegmenter + { + /// + /// Create an instance of Docstrum for bounding boxes page segmenter, . + /// + public static DocstrumBB Instance { get; } = new DocstrumBB(); + + /// + /// Get the blocks. + /// Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3. + /// + /// + /// + public IReadOnlyList GetBlocks(IEnumerable pageWords) + { + return GetBlocks(pageWords, -30, 30, -135, -45, 1.3); + } + + /// + /// Get the blocks. See original paper for more information. + /// + /// + /// Within-line lower bound angle. + /// Within-line upper bound angle. + /// Between-line lower bound angle. + /// Between-line upper bound angle. + /// Multiplier that gives the maximum perpendicular distance between + /// text lines for blocking. Maximum distance will be this number times the between-line + /// distance found by the analysis. 
+ /// + public IReadOnlyList GetBlocks(IEnumerable pageWords, double wlAngleLB, double wlAngleUB, + double blAngleLB, double blAngleUB, double blMultiplier) + { + var pageWordsArr = pageWords.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToArray(); // remove white spaces + + var withinLineDistList = new ConcurrentBag(); + var betweenLineDistList = new ConcurrentBag(); + + // 1. Estimate in line and between line spacing + Parallel.For(0, pageWordsArr.Length, i => + { + var word = pageWordsArr[i]; + + // Within-line distance + var pointWL = GetNearestPointData(pageWordsArr, word, + bb => bb.BottomRight, bb => bb.BottomRight, + bb => bb.BottomLeft, bb => bb.BottomLeft, + wlAngleLB, wlAngleUB, Distances.Horizontal); + if (pointWL != null) withinLineDistList.Add(pointWL); + + // Between-line distance + var pointBL = GetNearestPointData(pageWordsArr, word, + bb => bb.BottomLeft, bb => bb.Centroid, + bb => bb.TopLeft, bb => bb.Centroid, + blAngleLB, blAngleUB, Distances.Vertical); + if (pointBL != null) betweenLineDistList.Add(pointBL); + }); + + double withinLineDistance = GetPeakAverageDistance(withinLineDistList); + double betweenLineDistance = GetPeakAverageDistance(betweenLineDistList); + + // 2. Find lines of text + double maxDistWL = Math.Min(3 * withinLineDistance, Math.Sqrt(2) * betweenLineDistance); + var lines = GetLines(pageWordsArr, maxDistWL).ToArray(); + + // 3. Find blocks of text + double maxDistBL = blMultiplier * betweenLineDistance; + return GetLinesGroups(lines, maxDistBL).ToList(); + } + + /// + /// Get information on the nearest point, filtered for angle. 
+ /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + private double[] GetNearestPointData(Word[] words, Word pivot, Func funcPivotDist, Func funcPivotAngle, + Func funcPointsDist, Func funcPointsAngle, + double angleStart, double angleEnd, + Func finalDistMEasure) + { + var pointR = funcPivotDist(pivot.BoundingBox); + var filtered = words.Where(w => + { + var angleWL = Distances.Angle(funcPivotAngle(pivot.BoundingBox), funcPointsAngle(w.BoundingBox)); + return (angleWL >= angleStart && angleWL <= angleEnd); + }).ToList(); + filtered.Remove(pivot); // remove itself + + if (filtered.Count > 0) + { + int index = pointR.FindIndexNearest( + filtered.Select(w => funcPointsDist(w.BoundingBox)).ToList(), + Distances.Euclidean, out double distWL); + + if (index >= 0) + { + var matchWL = filtered[index]; + return new double[] + { + (double)pivot.Letters.Select(l => l.FontSize).Mode(), + finalDistMEasure(pointR, funcPointsDist(matchWL.BoundingBox)) + }; + } + } + return null; + } + + /// + /// Build lines via transitive closure. 
+ /// + /// + /// + /// + private IEnumerable GetLines(Word[] words, double maxDist) + { + TextDirection textDirection = words[0].TextDirection; + var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean, + (w1, w2) => maxDist, + w => w.BoundingBox.BottomRight, w => w.BoundingBox.BottomLeft, + w => true, + (w1, w2) => + { + var angleWL = Distances.Angle(w1.BoundingBox.BottomRight, w2.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle + return (angleWL >= -30 && angleWL <= 30); + }).ToList(); + + Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList(); + if (textDirection == TextDirection.Rotate180) + { + orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList(); + } + else if (textDirection == TextDirection.Rotate90) + { + orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList(); + } + else if (textDirection == TextDirection.Rotate270) + { + orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList(); + } + + for (int a = 0; a < groupedIndexes.Count(); a++) + { + yield return new TextLine(orderFunc(groupedIndexes[a].Select(i => words[i]))); + } + } + + /// + /// Build blocks via transitive closure. + /// + /// + /// + /// + private IEnumerable GetLinesGroups(TextLine[] lines, double maxDist) + { + var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines, Distances.Euclidean, + (l1, l2) => maxDist, + l => l.BoundingBox.TopLeft, l => l.BoundingBox.BottomLeft, + l => true, (l1, l2) => true).ToList(); + + for (int a = 0; a < groupedIndexes.Count(); a++) + { + yield return new TextBlock(groupedIndexes[a].Select(i => lines[i]).ToList()); + } + } + + /// + /// Get the average distance value of the peak bucket of the histogram. 
+ /// + /// array[0]=font size, array[1]=distance + /// + private double GetPeakAverageDistance(IEnumerable values) + { + int max = (int)values.Max(x => x[1]) + 1; + int[] distrib = new int[max]; + + // Create histogram with buckets of size 1. + for (int i = 0; i < max; i++) + { + distrib[i] = values.Where(x => x[1] > i && x[1] <= i + 1).Count(); + } + + var peakIndex = Array.IndexOf(distrib, distrib.Max()); + + return values.Where(v => v[1] > peakIndex && v[1] <= peakIndex + 1).Average(x => x[1]); + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs new file mode 100644 index 00000000..27511786 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs @@ -0,0 +1,19 @@ +using System.Collections.Generic; +using UglyToad.PdfPig.Content; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Page segmentation divides a page into areas, each consisting of a layout structure (blocks, lines, etc.). + /// See 'Performance Comparison of Six Algorithms for Page Segmentation' by Faisal Shafait, Daniel Keysers, and Thomas M. Breuel. + /// + public interface IPageSegmenter + { + /// + /// Get the text blocks. + /// + /// The words to generate text blocks for. + /// A list of text blocks from this approach. 
+ IReadOnlyList GetBlocks(IEnumerable pageWords); + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs index 34455cda..3efb19be 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs @@ -1,7 +1,6 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Threading.Tasks; using UglyToad.PdfPig.Content; using UglyToad.PdfPig.Geometry; using UglyToad.PdfPig.Util; @@ -71,7 +70,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height. /// The distance measure between two start and end base line points, /// e.g. the Manhattan distance. - private static List GetWords(IEnumerable pageLetters, + private List GetWords(IEnumerable pageLetters, Func metric, Func distMeasure) { if (pageLetters == null || pageLetters.Count() == 0) return new List(); @@ -97,116 +96,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } Letter[] letters = pageLetters.ToArray(); - int lettersCount = letters.Length; - List startBaseLines = letters.Select(x => x.StartBaseLine).ToList(); - int[] indexes = Enumerable.Repeat((int)-1, lettersCount).ToArray(); - - // Find nearest neighbours indexes - Parallel.For(0, lettersCount, c => - { - var currentLetter = letters[c]; - // only check neighbours if not a white space - if (!string.IsNullOrWhiteSpace(currentLetter.Value)) - { - int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist); - var pairedLetter = letters[index]; - - if (!string.IsNullOrWhiteSpace(pairedLetter.Value) && - string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase)) - { - decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * 
0.60m; - if ((decimal)dist < minDist) - { - indexes[c] = index; - } - } - } - }); - - // Group indexes - List> groupedIndexes = new List>(); - List indexDone = new List(); - for (int c = 0; c < lettersCount; c++) - { - int i = indexes[c]; - if (i == -1) continue; - - bool isDoneC = indexDone.Contains(c); - bool isDoneI = indexDone.Contains(i); - if (isDoneC || isDoneI) - { - if (isDoneC && !isDoneI) - { - foreach (var pair in groupedIndexes.Where(x => x.Contains(c))) - { - pair.Add(i); - } - indexDone.Add(i); - } - else if (!isDoneC && isDoneI) - { - foreach (var pair in groupedIndexes.Where(x => x.Contains(i))) - { - pair.Add(c); - } - indexDone.Add(c); - } - else - { - foreach (var pair in groupedIndexes.Where(x => x.Contains(i))) - { - if (!pair.Contains(c)) pair.Add(c); - } - - foreach (var pair in groupedIndexes.Where(x => x.Contains(c))) - { - if (!pair.Contains(i)) pair.Add(i); - } - } - } - else - { - List pair = new List() { c, i }; - groupedIndexes.Add(pair); - indexDone.AddRange(pair); - } - } - - // Merge lists with common index - for (int c = 0; c < lettersCount; c++) - { - List> candidates = groupedIndexes.Where(x => x.Any(t => t == c)).ToList(); - if (candidates.Count < 2) continue; // only one group with this index - - List merged = candidates.First(); - groupedIndexes.Remove(merged); - for (int i = 1; i < candidates.Count; i++) - { - var current = candidates[i]; - merged = merged.Union(current).ToList(); - groupedIndexes.Remove(current); - } - groupedIndexes.Add(merged); - } + var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(letters, + distMeasure, + (l1, l2) => Math.Max((double)metric(l1), (double)metric(l2)) * 0.60, + l => l.EndBaseLine, l => l.StartBaseLine, + l => !string.IsNullOrWhiteSpace(l.Value), + (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value)).ToList(); List words = new List(); for (int a = 0; a < groupedIndexes.Count(); a++) { - List 
groupedLetters = new List(); - foreach (int s in groupedIndexes[a]) - { - groupedLetters.Add(letters[s]); - } - - words.Add(new Word(orderFunc(groupedLetters))); - } - - List indexesNotDone = Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)).ToList(); - for (int n = 0; n < indexesNotDone.Count(); n++) - { - Letter letter = letters[indexesNotDone[n]]; - words.Add(new Word(new Letter[] { letter })); + words.Add(new Word(orderFunc(groupedIndexes[a].Select(i => letters[i])))); } return words; diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs index ecaa610b..16258f1f 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs @@ -11,14 +11,31 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut /// See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips /// - public static class RecursiveXYCut + public class RecursiveXYCut : IPageSegmenter { + /// + /// Create an instance of Recursive X-Y Cut page segmenter, . + /// + public static RecursiveXYCut Instance { get; } = new RecursiveXYCut(); + /// /// Get the blocks. + /// Uses 'minimumWidth' = 0, 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height) + /// + /// The words in the page. + /// + public IReadOnlyList GetBlocks(IEnumerable pageWords) + { + return GetBlocks(pageWords, 0); + } + + /// + /// Get the blocks. + /// Uses 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height) /// /// The words in the page. /// The minimum width for a block. 
- public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth = 0) + public IReadOnlyList GetBlocks(IEnumerable pageWords, decimal minimumWidth) { return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5m, 3)); } @@ -30,7 +47,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The minimum width for a block. /// The dominant font width. /// The dominant font height. - public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth, + public IReadOnlyList GetBlocks(IEnumerable pageWords, decimal minimumWidth, decimal dominantFontWidth, decimal dominantFontHeight) { return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight); @@ -43,15 +60,24 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The minimum width for a block. /// The function that determines the dominant font width. /// The function that determines the dominant font height. - public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth, + public IReadOnlyList GetBlocks(IEnumerable pageWords, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc) { - var root = new XYLeaf(pageWords); // Create a root node. - return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc); + XYLeaf root = new XYLeaf(pageWords); // Create a root node. 
+ XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc); + + var leafs = node.GetLeafs(); + + if (leafs.Count > 0) + { + return leafs.Select(l => new TextBlock(l.GetLines())).ToList(); + } + + return new List(); } - private static XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth, + private XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc, int level = 0) { @@ -144,7 +170,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return new XYNode(newNodes); } - private static XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth, + private XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc, int level = 0) { diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs index 9dab8daf..a5970693 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs @@ -9,7 +9,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// A Leaf node used in the algorithm, i.e. a block. /// - public class XYLeaf : XYNode + internal class XYLeaf : XYNode { /// /// Returns true if this node is a leaf, false otherwise. diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs index db423c55..9bac97fb 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs @@ -8,7 +8,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// A Node used in the algorithm. /// - public class XYNode + internal class XYNode { /// /// Returns true if this node is a leaf, false otherwise. 
From c14d77e414e115c0f8b7f30efb156f07d675aab4 Mon Sep 17 00:00:00 2001 From: BobLd Date: Sat, 10 Aug 2019 16:36:50 +0100 Subject: [PATCH 2/4] PublicApiScannerTests updated --- src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 273f59f7..b0de7ab1 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -60,7 +60,6 @@ "UglyToad.PdfPig.CrossReference.CrossReferenceTable", "UglyToad.PdfPig.CrossReference.CrossReferenceType", "UglyToad.PdfPig.CrossReference.TrailerDictionary", - "UglyToad.PdfPig.DocumentLayoutAnalysis.ClusteringAlgorithms", "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances", "UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBB", "UglyToad.PdfPig.DocumentLayoutAnalysis.IPageSegmenter", From 7e8b3bdc854eb56be34ba4f4432a4d279a9ac337 Mon Sep 17 00:00:00 2001 From: BobLd Date: Sun, 11 Aug 2019 13:45:08 +0100 Subject: [PATCH 3/4] Update DocstrumBB to account for middle point of the overlapping area distance. For this, using distance between 2 lines. --- .../ClusteringAlgorithms.cs | 99 +++++++++++++++-- .../DocumentLayoutAnalysis/Distances.cs | 40 ++++++- .../DocumentLayoutAnalysis/DocstrumBB.cs | 101 +++++++++++++++--- 3 files changed, 215 insertions(+), 25 deletions(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs index be7a8299..4e9e6182 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs @@ -18,11 +18,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// Letter, Word, TextLine, etc. /// Array of elements to group. /// The distance measure between two points. 
- /// The function that determines the distance between to points in the same cluster. - /// The pivot's point to use. - /// The candidates to pair point to use. - /// Filter to apply to the pivot point. - /// Filter to apply to both the pivot and the paired point. + /// The function that determines the maximum distance between two points in the same cluster. + /// The pivot's point to use for pairing, e.g. BottomLeft, TopLeft. + /// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft. + /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. + /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. internal static IEnumerable> SimpleTransitiveClosure(T[] elements, Func distMeasure, Func maxDistanceFunction, @@ -69,17 +69,97 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } }); + // 2. Group indexes + // 3. Merge groups that have indexes in common + var groupedIndexes = GroupMergeIndexes(indexes); + + return groupedIndexes; + } + + /// + /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance. + /// https://en.wikipedia.org/wiki/Transitive_closure + /// + /// Letter, Word, TextLine, etc. + /// Array of elements to group. + /// The distance measure between two lines. + /// The function that determines the maximum distance between two points in the same cluster. + /// The pivot's line to use for pairing. + /// The candidates' line to use for pairing. + /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. + /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. 
+ internal static IEnumerable> SimpleTransitiveClosure(T[] elements, + Func distMeasure, + Func maxDistanceFunction, + Func pivotLine, Func candidatesLine, + Func filterPivot, Func filterFinal) + { + /************************************************************************************* + * Algorithm steps + * 1. Find nearest neighbours indexes (done in parallel) + * Iterate every point (pivot) and put its nearest neighbour's index in an array + * e.g. if nearest neighbour of point i is point j, then indexes[i] = j. + * Only conciders a neighbour if it is within the maximum distance. + * If not within the maximum distance, index will be set to -1. + * NB: Given the possible asymmetry in the relationship, it is possible + * that if indexes[i] = j then indexes[j] != i. + * + * 2. Group indexes + * Group indexes if share neighbours in common - Transitive closure + * e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1 + * (i,j,k) will form a group and (m,n) will form another group. + * + * 3. Merge groups that have indexes in common - If any + * If there are group with indexes in common, merge them. + * (Could be improved and put in step 2) + *************************************************************************************/ + + int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray(); + var candidatesLines = elements.Select(x => candidatesLine(x)).ToList(); + + // 1. Find nearest neighbours indexes + Parallel.For(0, elements.Length, e => + { + var pivot = elements[e]; + + if (filterPivot(pivot)) + { + int index = pivotLine(pivot).FindIndexNearest(candidatesLines, distMeasure, out double dist); + var paired = elements[index]; + + if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired)) + { + indexes[e] = index; + } + } + }); + + // 2. Group indexes + // 3. 
Merge groups that have indexes in common + var groupedIndexes = GroupMergeIndexes(indexes); + + return groupedIndexes; + } + + /// + /// Group elements via transitive closure. + /// https://en.wikipedia.org/wiki/Transitive_closure + /// + /// Array of paired elements index. + /// + internal static List> GroupMergeIndexes(int[] indexes) + { // 2. Group indexes List> groupedIndexes = new List>(); HashSet indexDone = new HashSet(); - for (int e = 0; e < elements.Length; e++) + for (int e = 0; e < indexes.Length; e++) { int index = indexes[e]; if (index == -1) // This element is not connected { - // Check if another element index is connected to this element (nb: distance measure is asymetric) + // Check if another element's index is connected to this element (nb: distance measure is asymmetric) if (!indexes.Contains(e)) { // If no other element is connected to this element, add it as a standalone element @@ -131,7 +211,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } // Check that all elements are done - if (elements.Length != indexDone.Count) + if (indexes.Length != indexDone.Count) { throw new Exception("ClusteringAlgorithms.GetNNGroupedIndexes(): Some elements were not done."); } @@ -140,7 +220,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis // Check if duplicates (if duplicates, then same index in different groups) if (indexDone.Count != groupedIndexes.SelectMany(x => x).Count()) { - for (int e = 0; e < elements.Length; e++) + for (int e = 0; e < indexes.Length; e++) { List> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList(); int count = candidates.Count(); @@ -157,7 +237,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis groupedIndexes.Add(merged); } } - return groupedIndexes; } } diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs index 8921392f..f099c175 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs +++ 
b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs @@ -86,7 +86,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The reference point, for which to find the nearest neighbour. /// The list of neighbours candidates. /// The distance measure to use. - /// The distance between reference point, and its nearest neighbour + /// The distance between reference point, and its nearest neighbour. public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList points, Func distanceMeasure, out double distance) { @@ -122,7 +122,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The reference point, for which to find the nearest neighbour. /// The list of neighbours candidates. /// The distance measure to use. - /// The distance between reference point, and its nearest neighbour + /// The distance between reference point, and its nearest neighbour. public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList points, Func distanceMeasure, out double distance) { @@ -151,5 +151,41 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return closestPointIndex; } + + /// + /// Find the index of the nearest line. + /// + /// The reference line, for which to find the nearest neighbour. + /// The list of neighbours candidates. + /// The distance measure between two lines to use. + /// The distance between reference line, and its nearest neighbour. 
+ public static int FindIndexNearest(this PdfLine pdfLine, IReadOnlyList lines, + Func distanceMeasure, out double distance) + { + if (lines == null || lines.Count == 0) + { + throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", "lines"); + } + + if (distanceMeasure == null) + { + throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", "distanceMeasure"); + } + + distance = double.MaxValue; + int closestLineIndex = -1; + + for (var i = 0; i < lines.Count; i++) + { + double currentDistance = distanceMeasure(lines[i], pdfLine); + if (currentDistance < distance) + { + distance = currentDistance; + closestLineIndex = i; + } + } + + return closestLineIndex; + } } } diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs index 3ced0778..43d083a5 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs @@ -77,11 +77,48 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis // 2. Find lines of text double maxDistWL = Math.Min(3 * withinLineDistance, Math.Sqrt(2) * betweenLineDistance); - var lines = GetLines(pageWordsArr, maxDistWL).ToArray(); + var lines = GetLines(pageWordsArr, maxDistWL, wlAngleLB, wlAngleUB).ToArray(); // 3. Find blocks of text double maxDistBL = blMultiplier * betweenLineDistance; - return GetLinesGroups(lines, maxDistBL).ToList(); + var blocks = GetLinesGroups(lines, maxDistBL).ToList(); + + // 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text. + for (int b = 0; b < blocks.Count; b++) + { + if (blocks[b] == null) continue; + + for (int c = 0; c < blocks.Count; c++) + { + if (b == c) continue; + if (blocks[c] == null) continue; + + if (AreRectangleOverlapping(blocks[b].BoundingBox, blocks[c].BoundingBox)) + { + // Merge + // 1. 
Merge all words + var mergedWords = new List(blocks[b].TextLines.SelectMany(l => l.Words)); + mergedWords.AddRange(blocks[c].TextLines.SelectMany(l => l.Words)); + + // 2. Rebuild lines, using max distance = +Inf as we know all words will be in the + // same block. Filtering will still be done based on angle. + var mergedLines = GetLines(mergedWords.ToArray(), double.MaxValue, wlAngleLB, wlAngleUB); + blocks[b] = new TextBlock(mergedLines.ToList()); + + // Remove + blocks[c] = null; + } + } + } + + return blocks.Where(b => b != null).ToList(); + } + + private bool AreRectangleOverlapping(PdfRectangle rectangle1, PdfRectangle rectangle2) + { + if (rectangle1.Left > rectangle2.Right || rectangle2.Left > rectangle1.Right) return false; + if (rectangle1.Top < rectangle2.Bottom || rectangle2.Top < rectangle1.Bottom) return false; + return true; } /// @@ -104,6 +141,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis Func finalDistMEasure) { var pointR = funcPivotDist(pivot.BoundingBox); + + // Filter by angle var filtered = words.Where(w => { var angleWL = Distances.Angle(funcPivotAngle(pivot.BoundingBox), funcPointsAngle(w.BoundingBox)); @@ -135,18 +174,27 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// /// + /// + /// /// - private IEnumerable GetLines(Word[] words, double maxDist) + private IEnumerable GetLines(Word[] words, double maxDist, double wlAngleLB, double wlAngleUB) { + /*************************************************************************************************** + * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not + * work as the FindIndexNearest() function might pair the pivot with itself (the pivot's right point + * (distance = width) is closer than other words' left point). + * -> Solution would be to find more than one nearest neighbour. Use KDTree? 
+ ***************************************************************************************************/ + TextDirection textDirection = words[0].TextDirection; var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean, - (w1, w2) => maxDist, - w => w.BoundingBox.BottomRight, w => w.BoundingBox.BottomLeft, - w => true, - (w1, w2) => + (pivot, candidate) => maxDist, + pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft, + pivot => true, + (pivot, candidate) => { - var angleWL = Distances.Angle(w1.BoundingBox.BottomRight, w2.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle - return (angleWL >= -30 && angleWL <= 30); + var angleWL = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle + return (angleWL >= wlAngleLB && angleWL <= wlAngleUB); }).ToList(); Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList(); @@ -177,10 +225,37 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// private IEnumerable GetLinesGroups(TextLine[] lines, double maxDist) { - var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines, Distances.Euclidean, - (l1, l2) => maxDist, - l => l.BoundingBox.TopLeft, l => l.BoundingBox.BottomLeft, - l => true, (l1, l2) => true).ToList(); + /************************************************************************************************** + * We want to measure the distance between two lines using the following method: + * We check if two lines are overlapping horizontally. + * If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area. + * We finally compute the Euclidean distance between these two middle points. + * If the two lines are not overlapping, the distance is set to the max distance. 
+ * + * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'line Height', the algo won't + * work as the FindIndexNearest() function will always pair the pivot with itself (the pivot's top + * point (distance = height) is closer than other lines' top point). + * -> Solution would be to find more than one nearest neighbours. Use KDTree? + **************************************************************************************************/ + + Func euclidianOverlappingMiddleDistance = (l1, l2) => + { + var left = Math.Max(l1.Point1.X, l2.Point1.X); + var d = (Math.Min(l1.Point2.X, l2.Point2.X) - left); + + if (d < 0) return double.MaxValue; // not overlapping -> max distance + + return Distances.Euclidean( + new PdfPoint(left + d / 2, l1.Point1.Y), + new PdfPoint(left + d / 2, l2.Point1.Y)); + }; + + var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines, + euclidianOverlappingMiddleDistance, + (pivot, candidate) => maxDist, + pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight), + candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight), + pivot => true, (pivot, candidate) => true).ToList(); for (int a = 0; a < groupedIndexes.Count(); a++) { From 9f13739addc80e2ea5becea23be9aeeea964b00e Mon Sep 17 00:00:00 2001 From: BobLd Date: Sun, 11 Aug 2019 13:54:47 +0100 Subject: [PATCH 4/4] correcting typo --- src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs index 43d083a5..b2fca5c3 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs @@ -9,7 +9,7 @@ using UglyToad.PdfPig.Geometry; namespace UglyToad.PdfPig.DocumentLayoutAnalysis { /// - /// The Docstrum algorithm is a bottom-up page segmentation technique based on 
nearest-neighborhood + /// The Docstrum algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood /// clustering of connected components extracted from the document. /// This implementation leverages bounding boxes and does not exactly replicates the original algorithm. /// See 'The document spectrum for page layout analysis.' by L. O’Gorman.