diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs index 072469a5..bec3a6da 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs @@ -13,6 +13,7 @@ { /// /// Algorithm to group elements using nearest neighbours. + /// Uses the nearest neighbour as candidate. /// /// Letter, Word, TextLine, etc. /// Elements to group. @@ -61,7 +62,7 @@ if (filterPivot(pivot)) { - var paired = kdTree.FindNearestNeighbours(pivot, pivotPoint, distMeasure, out int index, out double dist); + var paired = kdTree.FindNearestNeighbour(pivot, pivotPoint, distMeasure, out int index, out double dist); if (index != -1) { @@ -77,6 +78,77 @@ return GroupIndexes(indexes); } + /// + /// Algorithm to group elements using nearest neighbours. + /// Uses the k-nearest neighbours as candidates. + /// + /// Letter, Word, TextLine, etc. + /// Elements to group. + /// The k-nearest neighbours to consider as candidates. + /// The distance measure between two points. + /// The function that determines the maximum distance between two points in the same cluster. + /// The pivot's point to use for pairing, e.g. BottomLeft, TopLeft. + /// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft. + /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. + /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. + /// If it is -1, there is no limit on the number of concurrently running operations. + internal static IEnumerable> ClusterNearestNeighbours(IReadOnlyList elements, int k, + Func distMeasure, + Func maxDistanceFunction, + Func pivotPoint, Func candidatesPoint, + Func filterPivot, Func filterFinal, + int maxDegreeOfParallelism) + { + /************************************************************************************* + * Algorithm steps + * 1. Find nearest neighbours indexes (done in parallel) + * Iterate every point (pivot) and put its nearest neighbour's index in an array + * e.g. if nearest neighbour of point i is point j, then indexes[i] = j. + * Only conciders a neighbour if it is within the maximum distance. + * If not within the maximum distance, index will be set to -1. + * Each element has only one connected neighbour. + * NB: Given the possible asymmetry in the relationship, it is possible + * that if indexes[i] = j then indexes[j] != i. + * + * 2. Group indexes + * Group indexes if share neighbours in common - Depth-first search + * e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1 + * (i,j,k) will form a group and (m,n) will form another group. + *************************************************************************************/ + + int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray(); + KdTree kdTree = new KdTree(elements, candidatesPoint); + + ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; + + // 1. Find nearest neighbours indexes + Parallel.For(0, elements.Count, parallelOptions, e => + { + var pivot = elements[e]; + + if (filterPivot(pivot)) + { + var paired = kdTree.FindNearestNeighbours(pivot, k, pivotPoint, distMeasure); + + foreach (var c in paired) + { + var filter = filterFinal(pivot, c.Item1); + var maxDist = maxDistanceFunction(pivot, c.Item1); + if (filter && c.Item3 < maxDist) + { + indexes[e] = c.Item2; + break; + } + } + } + }); + + // 2. Group indexes + return GroupIndexes(indexes); + } + /// /// Algorithm to group elements using nearest neighbours. /// diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs index 50a41bdc..8e75be44 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs @@ -14,13 +14,20 @@ public PdfPoint FindNearestNeighbours(PdfPoint pivot, Func distanceMeasure, out int index, out double distance) { - return FindNearestNeighbours(pivot, p => p, distanceMeasure, out index, out distance); + return FindNearestNeighbour(pivot, p => p, distanceMeasure, out index, out distance); + } + + public IReadOnlyList<(PdfPoint, int, double)> FindNearestNeighbours(PdfPoint pivot, int k, Func distanceMeasure) + { + return FindNearestNeighbours(pivot, k, p => p, distanceMeasure); } } internal class KdTree { - private KdTreeNode Root; + private readonly KdTreeNode Root; + + public readonly int Count; public KdTree(IReadOnlyList candidates, Func candidatesPointFunc) { @@ -29,6 +36,7 @@ throw new ArgumentException("KdTree(): candidates cannot be null or empty.", nameof(candidates)); } + Count = candidates.Count; Root = BuildTree(Enumerable.Range(0, candidates.Count).Zip(candidates, (e, p) => (e, candidatesPointFunc(p), p)).ToArray(), 0); } @@ -67,23 +75,23 @@ #region NN /// - /// + /// Get the nearest neighbour to the pivot element. /// - /// + /// The element for which to find the nearest neighbour. /// /// /// The nearest neighbour's index (returns -1 if not found). /// The distance between the pivot and the nearest neighbour (returns if not found). /// The nearest neighbour's element. - public T FindNearestNeighbours(T pivot, Func pivotPointFunc, Func distanceMeasure, out int index, out double distance) + public T FindNearestNeighbour(T pivot, Func pivotPointFunc, Func distanceMeasure, out int index, out double distance) { - var result = FindNearestNeighbours(Root, pivot, pivotPointFunc, distanceMeasure); + var result = FindNearestNeighbour(Root, pivot, pivotPointFunc, distanceMeasure); index = result.Item1 != null ? result.Item1.Index : -1; - distance = result.Item2.HasValue ? result.Item2.Value : double.NaN; + distance = result.Item2 ?? double.NaN; return result.Item1 != null ? result.Item1.Element : default; } - private static (KdTreeNode, double?) FindNearestNeighbours(KdTreeNode node, T pivot, Func pivotPointFunc, Func distance) + private static (KdTreeNode, double?) FindNearestNeighbour(KdTreeNode node, T pivot, Func pivotPointFunc, Func distance) { if (node == null) { @@ -111,7 +119,7 @@ if (pointValue < node.L) { // start left - (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, pivotPointFunc, distance); + (newNode, newDist) = FindNearestNeighbour(node.LeftChild, pivot, pivotPointFunc, distance); if (newDist.HasValue && newDist <= currentDistance && !newNode.Element.Equals(pivot)) { @@ -121,13 +129,13 @@ if (node.RightChild != null && pointValue + currentDistance >= node.L) { - (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, pivotPointFunc, distance); + (newNode, newDist) = FindNearestNeighbour(node.RightChild, pivot, pivotPointFunc, distance); } } else { // start right - (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, pivotPointFunc, distance); + (newNode, newDist) = FindNearestNeighbour(node.RightChild, pivot, pivotPointFunc, distance); if (newDist.HasValue && newDist <= currentDistance && !newNode.Element.Equals(pivot)) { @@ -137,7 +145,7 @@ if (node.LeftChild != null && pointValue - currentDistance <= node.L) { - (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, pivotPointFunc, distance); + (newNode, newDist) = FindNearestNeighbour(node.LeftChild, pivot, pivotPointFunc, distance); } } @@ -152,6 +160,170 @@ } #endregion + #region k-NN + /***************************************************************************** + * WARNING: k-nearest neighbours algo will need more checks and tests. + *****************************************************************************/ + + /// + /// Get the k nearest neighbours to the pivot element. If elements are equidistant, they are counted as one. + /// + /// The element for which to find the k nearest neighbours. + /// The number of neighbours to return. If elements are equidistant, they are counted as one. + /// + /// + /// Returns a list of tuples of the k nearest neighbours. Tuples are (element, index, distance). + public IReadOnlyList<(T, int, double)> FindNearestNeighbours(T pivot, int k, Func pivotPointFunc, Func distanceMeasure) + { + if (k == 1) + { + // if only 1 neighbour required, use default to avoid creating KNearestNeighboursQueue + var nn = FindNearestNeighbour(pivot, pivotPointFunc, distanceMeasure, out int index, out double distance); + if (index == -1) + { + return EmptyArray<(T, int, double)>.Instance; + } + return new List<(T, int, double)>() { (nn, index, distance) }; + } + else + { + var kdTreeNodes = new KNearestNeighboursQueue(k); + FindNearestNeighbours(Root, pivot, k, pivotPointFunc, distanceMeasure, kdTreeNodes); + return kdTreeNodes.SelectMany(n => n.Value.Select(e => (e.Element, e.Index, n.Key))).ToList(); + } + } + + private static (KdTreeNode, double) FindNearestNeighbours(KdTreeNode node, T pivot, int k, + Func pivotPointFunc, Func distance, KNearestNeighboursQueue queue) + { + if (node == null) + { + return (null, double.NaN); + } + else if (node.IsLeaf) + { + if (node.Element.Equals(pivot)) + { + return (null, double.NaN); + } + + var currentDistance = distance(node.Value, pivotPointFunc(pivot)); + var currentNearestNode = node; + + if (!queue.IsFull || currentDistance <= queue.LastDistance) + { + queue.Add(currentDistance, currentNearestNode); + currentDistance = queue.LastDistance; + currentNearestNode = queue.LastElement; + } + + return (currentNearestNode, currentDistance); + } + else + { + var point = pivotPointFunc(pivot); + var currentNearestNode = node; + var currentDistance = distance(node.Value, point); + if (!queue.IsFull || currentDistance <= queue.LastDistance) + { + queue.Add(currentDistance, currentNearestNode); + currentDistance = queue.LastDistance; + currentNearestNode = queue.LastElement; + } + + KdTreeNode newNode = null; + double newDist = double.NaN; + + var pointValue = node.IsAxisCutX ? point.X : point.Y; + + if (pointValue < node.L) + { + // start left + (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, k, pivotPointFunc, distance, queue); + + if (!double.IsNaN(newDist) && newDist <= currentDistance && !newNode.Element.Equals(pivot)) + { + queue.Add(newDist, newNode); + currentDistance = queue.LastDistance; + currentNearestNode = queue.LastElement; + } + + if (node.RightChild != null && pointValue + currentDistance >= node.L) + { + (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, k, pivotPointFunc, distance, queue); + } + } + else + { + // start right + (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, k, pivotPointFunc, distance, queue); + + if (!double.IsNaN(newDist) && newDist <= currentDistance && !newNode.Element.Equals(pivot)) + { + queue.Add(newDist, newNode); + currentDistance = queue.LastDistance; + currentNearestNode = queue.LastElement; + } + + if (node.LeftChild != null && pointValue - currentDistance <= node.L) + { + (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, k, pivotPointFunc, distance, queue); + } + } + + if (!double.IsNaN(newDist) && newDist <= currentDistance && !newNode.Element.Equals(pivot)) + { + queue.Add(newDist, newNode); + currentDistance = queue.LastDistance; + currentNearestNode = queue.LastElement; + } + + return (currentNearestNode, currentDistance); + } + } + + private class KNearestNeighboursQueue : SortedList>> + { + public readonly int K; + + public KdTreeNode LastElement { get; private set; } + + public double LastDistance { get; private set; } + + public bool IsFull => Count >= K; + + public KNearestNeighboursQueue(int k) : base(k) + { + K = k; + LastDistance = double.PositiveInfinity; + } + + public void Add(double key, KdTreeNode value) + { + if (key > LastDistance && IsFull) + { + return; + } + + if (!ContainsKey(key)) + { + base.Add(key, new HashSet>()); + if (Count > K) + { + RemoveAt(Count - 1); + } + } + + if (this[key].Add(value)) + { + var last = this.Last(); + LastElement = last.Value.Last(); + LastDistance = last.Key; + } + } + } + #endregion + private class KdTreeLeaf : KdTreeNode { public override bool IsLeaf => true; diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs index 92a432db..9a624a45 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs @@ -108,38 +108,39 @@ ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; - // 1. Estimate in line and between line spacing + // 1. Estimate within line and between line spacing + KdTree kdTreeWL = new KdTree(wordsList, w => w.BoundingBox.BottomLeft); + KdTree kdTreeBL = new KdTree(wordsList, w => w.BoundingBox.TopLeft); + Parallel.For(0, wordsList.Count, parallelOptions, i => { var word = wordsList[i]; // Within-line distance - var pointsWithinLine = GetNearestPointDistance(wordsList, word, - bb => bb.BottomRight, bb => bb.BottomRight, - bb => bb.BottomLeft, bb => bb.BottomLeft, - withinLine, Distances.Horizontal); - - if (pointsWithinLine != null) + var neighbourWL = kdTreeWL.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomRight, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 0.5)); + foreach (var n in neighbourWL) { - withinLineDistList.Add(pointsWithinLine.Value); + if (withinLine.Contains(Distances.Angle(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft))) + { + withinLineDistList.Add(Distances.Horizontal(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft)); + } } // Between-line distance - var pointsBetweenLine = GetNearestPointDistance(wordsList, word, - bb => bb.BottomLeft, bb => bb.Centroid, - bb => bb.TopLeft, bb => bb.Centroid, - betweenLine, Distances.Vertical); - - if (pointsBetweenLine != null) + var neighbourBL = kdTreeBL.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomLeft, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 50)); + foreach (var n in neighbourBL) { - betweenLineDistList.Add(pointsBetweenLine.Value); + if (betweenLine.Contains(Distances.Angle(word.BoundingBox.Centroid, n.Item1.BoundingBox.Centroid))) + { + betweenLineDistList.Add(Distances.Vertical(word.BoundingBox.BottomLeft, n.Item1.BoundingBox.TopLeft)); + } } }); double? withinLineDistance = GetPeakAverageDistance(withinLineDistList); double? betweenLineDistance = GetPeakAverageDistance(betweenLineDistList); - if (withinLineDistance == null || betweenLineDistance == null) + if (!withinLineDistance.HasValue || !betweenLineDistance.HasValue) { return new[] { new TextBlock(new[] { new TextLine(wordsList) }) }; } @@ -193,69 +194,15 @@ return blocks.Where(b => b != null).ToList(); } - /// - /// Get information on the nearest point, filtered for angle. - /// - private double? GetNearestPointDistance(List words, Word pivot, Func funcPivotDist, Func funcPivotAngle, - Func funcPointsDist, Func funcPointsAngle, - AngleBounds angleBounds, - Func finalDistanceMeasure) - { - var pointR = funcPivotDist(pivot.BoundingBox); - - var pivotPoint = funcPivotAngle(pivot.BoundingBox); - - var wordsWithinAngleBoundDistancePoints = new List(); - - // Filter to words within the angle range. - foreach (var word in words) - { - // Ignore the pivot word. - if (ReferenceEquals(word, pivot)) - { - continue; - } - - var angle = Distances.Angle(pivotPoint, funcPointsAngle(word.BoundingBox)); - - if (angleBounds.Contains(angle)) - { - wordsWithinAngleBoundDistancePoints.Add(funcPointsDist(word.BoundingBox)); - } - } - - if (wordsWithinAngleBoundDistancePoints.Count == 0) - { - return null; - } - - var closestWordIndex = Distances.FindIndexNearest(pointR, wordsWithinAngleBoundDistancePoints, p => p, - p => p, Distances.Euclidean, out _); - - if (closestWordIndex < 0 || closestWordIndex >= wordsWithinAngleBoundDistancePoints.Count) - { - return null; - } - - return finalDistanceMeasure(pointR, wordsWithinAngleBoundDistancePoints[closestWordIndex]); - } - private static IEnumerable GetLines(List words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism) { TextDirection textDirection = words[0].TextDirection; - var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, Distances.Euclidean, - (pivot, candidate) => maxDist, - pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft, - pivot => true, - (pivot, candidate) => - { - // Compare bottom right with bottom left for angle - var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft); - - return (withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper); - }, - maxDegreeOfParallelism).ToList(); + var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, 2, Distances.Euclidean, + (pivot, candidate) => maxDist, + pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft, + pivot => true, + (pivot, candidate) => withinLine.Contains(Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft)), + maxDegreeOfParallelism).ToList(); Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList(); if (textDirection == TextDirection.Rotate180) @@ -287,7 +234,7 @@ * If the two lines are not overlapping, the distance is set to the max distance. **************************************************************************************************/ - Func euclidianOverlappingMiddleDistance = (l1, l2) => + double euclidianOverlappingMiddleDistance(PdfLine l1, PdfLine l2) { var left = Math.Max(l1.Point1.X, l2.Point1.X); var d = (Math.Min(l1.Point2.X, l2.Point2.X) - left); @@ -295,9 +242,9 @@ if (d < 0) return double.MaxValue; // not overlapping -> max distance return Distances.Euclidean( - new PdfPoint(left + d / 2, l1.Point1.Y), - new PdfPoint(left + d / 2, l2.Point1.Y)); - }; + new PdfPoint(left + d / 2, l1.Point1.Y), + new PdfPoint(left + d / 2, l2.Point1.Y)); + } var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(lines, euclidianOverlappingMiddleDistance, diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs index b08164a0..1788639a 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs @@ -254,7 +254,7 @@ #region Sorted Queue private class QueueEntries : SortedSet { - int bound; + readonly int bound; public QueueEntries(int maximumBound) { diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs index 756eafec..05989bfe 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs @@ -31,14 +31,14 @@ /// The letters in the page. public IEnumerable GetWords(IReadOnlyList letters) { - Func baseMaxFunc = (l1, l2) => + double baseMaxFunc(Letter l1, Letter l2) { return Math.Max(Math.Max(Math.Max( Math.Abs(l1.GlyphRectangle.Width), Math.Abs(l2.GlyphRectangle.Width)), - Math.Abs(l1.Width)), + Math.Abs(l1.Width)), Math.Abs(l2.Width)); - }; + } List wordsH = GetWords( letters.Where(l => l.TextDirection == TextDirection.Horizontal).ToList(), diff --git a/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs b/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs index b557661e..3da754de 100644 --- a/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs +++ b/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs @@ -382,9 +382,15 @@ } else { - if (!rectangle.Normalise().IntersectsWith(other.Normalise())) + var r1 = rectangle.Normalise(); + var r2 = other.Normalise(); + if (Math.Abs(r1.Rotation) < epsilon && Math.Abs(r2.Rotation) < epsilon) { - return false; + // check rotation to avoid stackoverflow + if (!r1.IntersectsWith(r2)) + { + return false; + } } if (rectangle.Contains(other.BottomLeft)) return true;