From 7e8b3bdc854eb56be34ba4f4432a4d279a9ac337 Mon Sep 17 00:00:00 2001 From: BobLd Date: Sun, 11 Aug 2019 13:45:08 +0100 Subject: [PATCH] Update DocstrumBB to account for middle point of the overlapping area distance. For this, using distance between 2 lines. --- .../ClusteringAlgorithms.cs | 99 +++++++++++++++-- .../DocumentLayoutAnalysis/Distances.cs | 40 ++++++- .../DocumentLayoutAnalysis/DocstrumBB.cs | 101 +++++++++++++++--- 3 files changed, 215 insertions(+), 25 deletions(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs index be7a8299..4e9e6182 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs @@ -18,11 +18,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// Letter, Word, TextLine, etc. /// Array of elements to group. /// The distance measure between two points. - /// The function that determines the distance between to points in the same cluster. - /// The pivot's point to use. - /// The candidates to pair point to use. - /// Filter to apply to the pivot point. - /// Filter to apply to both the pivot and the paired point. + /// The function that determines the maximum distance between two points in the same cluster. + /// The pivot's point to use for pairing, e.g. BottomLeft, TopLeft. + /// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft. + /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. + /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. internal static IEnumerable> SimpleTransitiveClosure(T[] elements, Func distMeasure, Func maxDistanceFunction, @@ -69,17 +69,97 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } }); + // 2. Group indexes + // 3. 
Merge groups that have indexes in common + var groupedIndexes = GroupMergeIndexes(indexes); + + return groupedIndexes; + } + + /// + /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance. + /// https://en.wikipedia.org/wiki/Transitive_closure + /// + /// Letter, Word, TextLine, etc. + /// Array of elements to group. + /// The distance measure between two lines. + /// The function that determines the maximum distance between two points in the same cluster. + /// The pivot's line to use for pairing. + /// The candidates' line to use for pairing. + /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. + /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. + internal static IEnumerable> SimpleTransitiveClosure(T[] elements, + Func distMeasure, + Func maxDistanceFunction, + Func pivotLine, Func candidatesLine, + Func filterPivot, Func filterFinal) + { + /************************************************************************************* + * Algorithm steps + * 1. Find nearest neighbours indexes (done in parallel) + * Iterate every point (pivot) and put its nearest neighbour's index in an array + * e.g. if nearest neighbour of point i is point j, then indexes[i] = j. + * Only considers a neighbour if it is within the maximum distance. + * If not within the maximum distance, index will be set to -1. + * NB: Given the possible asymmetry in the relationship, it is possible + * that if indexes[i] = j then indexes[j] != i. + * + * 2. Group indexes + * Group indexes if they share neighbours in common - Transitive closure + * e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1 + * (i,j,k) will form a group and (m,n) will form another group. + * + * 3. Merge groups that have indexes in common - If any + * If there are groups with indexes in common, merge them. 
+ * (Could be improved and put in step 2) + *************************************************************************************/ + + int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray(); + var candidatesLines = elements.Select(x => candidatesLine(x)).ToList(); + + // 1. Find nearest neighbours indexes (FindIndexNearest returns -1 when no candidate is closer than double.MaxValue) + Parallel.For(0, elements.Length, e => + { + var pivot = elements[e]; + + if (filterPivot(pivot)) + { + int index = pivotLine(pivot).FindIndexNearest(candidatesLines, distMeasure, out double dist); + if (index == -1) { return; } // no neighbour found: leave indexes[e] at -1 (not connected) + var paired = elements[index]; + if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired)) + { + indexes[e] = index; + } + } + }); + + // 2. Group indexes + // 3. Merge groups that have indexes in common + var groupedIndexes = GroupMergeIndexes(indexes); + + return groupedIndexes; + } + + /// + /// Group elements via transitive closure. + /// https://en.wikipedia.org/wiki/Transitive_closure + /// + /// Array of paired elements index. + /// + internal static List> GroupMergeIndexes(int[] indexes) + { // 2. 
Group indexes List> groupedIndexes = new List>(); HashSet indexDone = new HashSet(); - for (int e = 0; e < elements.Length; e++) + for (int e = 0; e < indexes.Length; e++) { int index = indexes[e]; if (index == -1) // This element is not connected { - // Check if another element index is connected to this element (nb: distance measure is asymetric) + // Check if another element's index is connected to this element (nb: distance measure is asymmetric) if (!indexes.Contains(e)) { // If no other element is connected to this element, add it as a standalone element @@ -131,7 +211,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } // Check that all elements are done - if (elements.Length != indexDone.Count) + if (indexes.Length != indexDone.Count) { throw new Exception("ClusteringAlgorithms.GetNNGroupedIndexes(): Some elements were not done."); } @@ -140,7 +220,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis // Check if duplicates (if duplicates, then same index in different groups) if (indexDone.Count != groupedIndexes.SelectMany(x => x).Count()) { - for (int e = 0; e < elements.Length; e++) + for (int e = 0; e < indexes.Length; e++) { List> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList(); int count = candidates.Count(); @@ -157,7 +237,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis groupedIndexes.Add(merged); } } - return groupedIndexes; } } diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs index 8921392f..f099c175 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs @@ -86,7 +86,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The reference point, for which to find the nearest neighbour. /// The list of neighbours candidates. /// The distance measure to use. 
- /// The distance between reference point, and its nearest neighbour + /// The distance between reference point, and its nearest neighbour. public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList points, Func distanceMeasure, out double distance) { @@ -122,7 +122,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The reference point, for which to find the nearest neighbour. /// The list of neighbours candidates. /// The distance measure to use. - /// The distance between reference point, and its nearest neighbour + /// The distance between reference point, and its nearest neighbour. public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList points, Func distanceMeasure, out double distance) { @@ -151,5 +151,41 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return closestPointIndex; } + + /// + /// Find the index of the nearest line. + /// + /// The reference line, for which to find the nearest neighbour. + /// The list of neighbours candidates. + /// The distance measure between two lines to use. + /// The distance between reference line, and its nearest neighbour. 
+ public static int FindIndexNearest(this PdfLine pdfLine, IReadOnlyList lines, + Func distanceMeasure, out double distance) + { + if (lines == null || lines.Count == 0) + { + throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", "lines"); + } + + if (distanceMeasure == null) + { + throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", "distanceMeasure"); + } + + distance = double.MaxValue; + int closestLineIndex = -1; + + for (var i = 0; i < lines.Count; i++) + { + double currentDistance = distanceMeasure(lines[i], pdfLine); + if (currentDistance < distance) + { + distance = currentDistance; + closestLineIndex = i; + } + } + + return closestLineIndex; + } } } diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs index 3ced0778..43d083a5 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs @@ -77,11 +77,48 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis // 2. Find lines of text double maxDistWL = Math.Min(3 * withinLineDistance, Math.Sqrt(2) * betweenLineDistance); - var lines = GetLines(pageWordsArr, maxDistWL).ToArray(); + var lines = GetLines(pageWordsArr, maxDistWL, wlAngleLB, wlAngleUB).ToArray(); // 3. Find blocks of text double maxDistBL = blMultiplier * betweenLineDistance; - return GetLinesGroups(lines, maxDistBL).ToList(); + var blocks = GetLinesGroups(lines, maxDistBL).ToList(); + + // 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text. + for (int b = 0; b < blocks.Count; b++) + { + if (blocks[b] == null) continue; + + for (int c = 0; c < blocks.Count; c++) + { + if (b == c) continue; + if (blocks[c] == null) continue; + + if (AreRectangleOverlapping(blocks[b].BoundingBox, blocks[c].BoundingBox)) + { + // Merge + // 1. 
Merge all words + var mergedWords = new List(blocks[b].TextLines.SelectMany(l => l.Words)); + mergedWords.AddRange(blocks[c].TextLines.SelectMany(l => l.Words)); + + // 2. Rebuild lines, using max distance = +Inf as we know all words will be in the + // same block. Filtering will still be done based on angle. + var mergedLines = GetLines(mergedWords.ToArray(), double.MaxValue, wlAngleLB, wlAngleUB); + blocks[b] = new TextBlock(mergedLines.ToList()); + + // Remove + blocks[c] = null; + } + } + } + + return blocks.Where(b => b != null).ToList(); + } + + private bool AreRectangleOverlapping(PdfRectangle rectangle1, PdfRectangle rectangle2) + { + if (rectangle1.Left > rectangle2.Right || rectangle2.Left > rectangle1.Right) return false; + if (rectangle1.Top < rectangle2.Bottom || rectangle2.Top < rectangle1.Bottom) return false; + return true; } /// @@ -104,6 +141,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis Func finalDistMEasure) { var pointR = funcPivotDist(pivot.BoundingBox); + + // Filter by angle var filtered = words.Where(w => { var angleWL = Distances.Angle(funcPivotAngle(pivot.BoundingBox), funcPointsAngle(w.BoundingBox)); @@ -135,18 +174,27 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// /// + /// + /// /// - private IEnumerable GetLines(Word[] words, double maxDist) + private IEnumerable GetLines(Word[] words, double maxDist, double wlAngleLB, double wlAngleUB) { + /*************************************************************************************************** + * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not + * work as the FindIndexNearest() function might pair the pivot with itself (the pivot's right point + * (distance = width) is closer than other words' left point). + * -> Solution would be to find more than one nearest neighbours. Use KDTree? 
+ ***************************************************************************************************/ + TextDirection textDirection = words[0].TextDirection; var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean, - (w1, w2) => maxDist, - w => w.BoundingBox.BottomRight, w => w.BoundingBox.BottomLeft, - w => true, - (w1, w2) => + (pivot, candidate) => maxDist, + pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft, + pivot => true, + (pivot, candidate) => { - var angleWL = Distances.Angle(w1.BoundingBox.BottomRight, w2.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle - return (angleWL >= -30 && angleWL <= 30); + var angleWL = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle + return (angleWL >= wlAngleLB && angleWL <= wlAngleUB); }).ToList(); Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList(); @@ -177,10 +225,37 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// private IEnumerable GetLinesGroups(TextLine[] lines, double maxDist) { - var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines, Distances.Euclidean, - (l1, l2) => maxDist, - l => l.BoundingBox.TopLeft, l => l.BoundingBox.BottomLeft, - l => true, (l1, l2) => true).ToList(); + /************************************************************************************************** + * We want to measure the distance between two lines using the following method: + * We check if two lines are overlapping horizontally. + * If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area. + * We finally compute the Euclidean distance between these two middle points. + * If the two lines are not overlapping, the distance is set to the max distance. 
+ * + * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'line Height', the algo won't + * work as the FindIndexNearest() function will always pair the pivot with itself (the pivot's top + * point (distance = height) is closer than other lines' top point). + * -> Solution would be to find more than one nearest neighbours. Use KDTree? + **************************************************************************************************/ + + Func euclidianOverlappingMiddleDistance = (l1, l2) => + { + var left = Math.Max(l1.Point1.X, l2.Point1.X); + var d = (Math.Min(l1.Point2.X, l2.Point2.X) - left); + + if (d < 0) return double.MaxValue; // not overlapping -> max distance + + return Distances.Euclidean( + new PdfPoint(left + d / 2, l1.Point1.Y), + new PdfPoint(left + d / 2, l2.Point1.Y)); + }; + + var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines, + euclidianOverlappingMiddleDistance, + (pivot, candidate) => maxDist, + pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight), + candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight), + pivot => true, (pivot, candidate) => true).ToList(); for (int a = 0; a < groupedIndexes.Count(); a++) {