diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs index fa2b4494..58bc2d5e 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs @@ -1,11 +1,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { + using Core; using System; using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; - using Geometry; - using Core; /// /// Clustering Algorithms. @@ -16,7 +15,7 @@ /// Algorithm to group elements using nearest neighbours. /// /// Letter, Word, TextLine, etc. - /// List of elements to group. + /// Elements to group. /// The distance measure between two points. /// The function that determines the maximum distance between two points in the same cluster. /// The pivot's point to use for pairing, e.g. BottomLeft, TopLeft. @@ -26,7 +25,7 @@ /// Sets the maximum number of concurrent tasks enabled. /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. - internal static IEnumerable> ClusterNearestNeighbours(List elements, + internal static IEnumerable> ClusterNearestNeighbours(IReadOnlyList elements, Func distMeasure, Func maxDistanceFunction, Func pivotPoint, Func candidatesPoint, @@ -51,7 +50,7 @@ *************************************************************************************/ int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray(); - var candidatesPoints = elements.Select(candidatesPoint).ToList(); + KdTree kdTree = new KdTree(elements, candidatesPoint); ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; @@ -62,92 +61,17 @@ if (filterPivot(pivot)) { - int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist); + var paired = kdTree.FindNearestNeighbours(pivot, pivotPoint, distMeasure, out int index, out double dist); - if (index != -1) + if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired)) { - var paired = elements[index]; - if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired)) - { - indexes[e] = index; - } + indexes[e] = index; } } }); // 2. Group indexes - var groupedIndexes = GroupIndexes(indexes); - - return groupedIndexes; - } - - /// - /// Algorithm to group elements using nearest neighbours. - /// - /// Letter, Word, TextLine, etc. - /// Array of elements to group. - /// The distance measure between two points. - /// The function that determines the maximum distance between two points in the same cluster. - /// The pivot's point to use for pairing, e.g. BottomLeft, TopLeft. - /// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft. - /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. - /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. - /// Sets the maximum number of concurrent tasks enabled. - /// A positive property value limits the number of concurrent operations to the set value. - /// If it is -1, there is no limit on the number of concurrently running operations. - internal static IEnumerable> ClusterNearestNeighbours(T[] elements, - Func distMeasure, - Func maxDistanceFunction, - Func pivotPoint, Func candidatesPoint, - Func filterPivot, Func filterFinal, - int maxDegreeOfParallelism) - { - /************************************************************************************* - * Algorithm steps - * 1. Find nearest neighbours indexes (done in parallel) - * Iterate every point (pivot) and put its nearest neighbour's index in an array - * e.g. if nearest neighbour of point i is point j, then indexes[i] = j. - * Only conciders a neighbour if it is within the maximum distance. - * If not within the maximum distance, index will be set to -1. - * Each element has only one connected neighbour. - * NB: Given the possible asymmetry in the relationship, it is possible - * that if indexes[i] = j then indexes[j] != i. - * - * 2. Group indexes - * Group indexes if share neighbours in common - Depth-first search - * e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1 - * (i,j,k) will form a group and (m,n) will form another group. - *************************************************************************************/ - - int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray(); - var candidatesPoints = elements.Select(candidatesPoint).ToList(); - - ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; - - // 1. Find nearest neighbours indexes - Parallel.For(0, elements.Length, parallelOptions, e => - { - var pivot = elements[e]; - - if (filterPivot(pivot)) - { - int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist); - - if (index != -1) - { - var paired = elements[index]; - if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired)) - { - indexes[e] = index; - } - } - } - }); - - // 2. Group indexes - var groupedIndexes = GroupIndexes(indexes); - - return groupedIndexes; + return GroupIndexes(indexes); } /// @@ -189,7 +113,6 @@ *************************************************************************************/ int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray(); - var candidatesLines = elements.Select(x => candidatesLine(x)).ToList(); ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; @@ -200,7 +123,7 @@ if (filterPivot(pivot)) { - int index = pivot.FindIndexNearest(elements, candidatesLine, pivotLine, distMeasure, out double dist); + int index = Distances.FindIndexNearest(pivot, elements, candidatesLine, pivotLine, distMeasure, out double dist); if (index != -1) { @@ -214,9 +137,7 @@ }); // 2. Group indexes - var groupedIndexes = GroupIndexes(indexes); - - return groupedIndexes; + return GroupIndexes(indexes); } /// diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs index 7f06f7e5..9fd5ad20 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs @@ -130,7 +130,7 @@ /// /// The distance measure to use. /// The distance between reference point, and its nearest neighbour. - internal static int FindIndexNearest(this T element, IReadOnlyList candidates, + internal static int FindIndexNearest(T element, IReadOnlyList candidates, Func candidatesPoint, Func pivotPoint, Func distanceMeasure, out double distance) { @@ -172,7 +172,7 @@ /// /// The distance measure between two lines to use. /// The distance between reference line, and its nearest neighbour. - internal static int FindIndexNearest(this T element, IReadOnlyList candidates, + internal static int FindIndexNearest(T element, IReadOnlyList candidates, Func candidatesLine, Func pivotLine, Func distanceMeasure, out double distance) { diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs new file mode 100644 index 00000000..61f2329d --- /dev/null +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs @@ -0,0 +1,230 @@ +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + using System; + using System.Collections.Generic; + using System.Linq; + using UglyToad.PdfPig.Core; + + // for kd-tree with line segments, see https://stackoverflow.com/questions/14376679/how-to-represent-line-segments-in-kd-tree + + internal class KdTree : KdTree + { + public KdTree(PdfPoint[] candidates) : base(candidates, p => p) + { } + + public PdfPoint FindNearestNeighbours(PdfPoint pivot, Func distanceMeasure, out int index, out double distance) + { + return FindNearestNeighbours(pivot, p => p, distanceMeasure, out index, out distance); + } + } + + internal class KdTree + { + private KdTreeNode Root; + + public KdTree(IReadOnlyList candidates, Func candidatesPointFunc) + { + var pointsIndex = Enumerable.Range(0, candidates.Count).Zip(candidates, (e, p) => (e, candidatesPointFunc(p), p)).ToList(); + if (candidates != null && candidates.Count > 0) + { + Root = BuildTree(pointsIndex, 0); + } + } + + private KdTreeNode BuildTree(IReadOnlyList<(int, PdfPoint, T)> P, int depth) + { + var median = P.Count / 2; + if (depth % 2 == 0) // depth is even + { + P = P.OrderBy(p => p.Item2.X).ToArray(); + } + else + { + P = P.OrderBy(p => p.Item2.Y).ToArray(); + } + + // left side + var P1 = P.Take(median).ToArray(); + KdTreeNode vLeft = null; + if (P1.Length == 1) + { + var item = P1[0]; + vLeft = new KdTreeLeaf(item.Item2, item.Item3, depth, item.Item1); + } + else if (P1.Length > 1) + { + vLeft = BuildTree(P1, depth + 1); + } + + // right side + var P2 = P.Skip(median + 1).ToArray(); + KdTreeNode vRight = null; + if (P2.Length == 1) + { + var item = P2[0]; + vRight = new KdTreeLeaf(item.Item2, item.Item3, depth, item.Item1); + } + else if (P2.Length > 1) + { + vRight = BuildTree(P2, depth + 1); + } + + var medianItem = P[median]; + return new KdTreeNode(vLeft, vRight, medianItem.Item2, medianItem.Item3, depth, medianItem.Item1); + } + + #region NN + public T FindNearestNeighbours(T pivot, Func pivotPointFunc, Func distanceMeasure, out int index, out double distance) + { + var result = FindNearestNeighbours(Root, pivot, pivotPointFunc, distanceMeasure); + index = result.Item1.Index; + distance = result.Item2.Value; + return result.Item1.Element; + } + + private static (KdTreeNode, double?) FindNearestNeighbours(KdTreeNode node, T pivot, Func pivotPointFunc, Func distance) + { + if (node == null) + { + return (null, null); + } + else if (node.IsLeaf) + { + if (node.Element.Equals(pivot)) + { + return (null, null); + } + return (node, distance(node.Value, pivotPointFunc(pivot))); + } + else + { + var point = pivotPointFunc(pivot); + var currentNearestNode = node; + var currentDistance = distance(node.Value, point); + + KdTreeNode newNode = null; + double? newDist = null; + + var pointValue = node.Depth == 0 ? point.X : point.Y; + + if (pointValue < node.L) + { + // start left + (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, pivotPointFunc, distance); + + if (newDist.HasValue && newDist <= currentDistance && !newNode.Element.Equals(pivot)) + { + currentDistance = newDist.Value; + currentNearestNode = newNode; + } + + if (node.RightChild != null && pointValue + currentDistance >= node.L) + { + (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, pivotPointFunc, distance); + } + } + else + { + // start right + (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, pivotPointFunc, distance); + + if (newDist.HasValue && newDist <= currentDistance && !newNode.Element.Equals(pivot)) + { + currentDistance = newDist.Value; + currentNearestNode = newNode; + } + + if (node.LeftChild != null && pointValue - currentDistance <= node.L) + { + (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, pivotPointFunc, distance); + } + } + + if (newDist.HasValue && newDist <= currentDistance && !newNode.Element.Equals(pivot)) + { + currentDistance = newDist.Value; + currentNearestNode = newNode; + } + + return (currentNearestNode, currentDistance); + } + } + #endregion + + private class KdTreeLeaf : KdTreeNode + { + public override bool IsLeaf => true; + + public KdTreeLeaf(PdfPoint l, Q element, int depth, int index) + : base(null, null, l, element, depth, index) + { } + + public override string ToString() + { + return "Leaf->" + Value.ToString(); + } + } + + private class KdTreeNode + { + /// + /// Split value. + /// + public double L => Depth == 0 ? Value.X : Value.Y; + + public PdfPoint Value { get; } + + public KdTreeNode LeftChild { get; internal set; } + + public KdTreeNode RightChild { get; internal set; } + + public Q Element { get; } + + /// + /// 0 is even (x), 1 is odd (y). + /// + public int Depth { get; } + + public virtual bool IsLeaf => false; + + public int Index { get; } + + public KdTreeNode(KdTreeNode leftChild, KdTreeNode rightChild, PdfPoint l, Q element, int depth, int index) + { + LeftChild = leftChild; + RightChild = rightChild; + Value = l; + Element = element; + Depth = depth % 2; + Index = index; + } + + public IEnumerable> GetLeaves() + { + var leafs = new List>(); + RecursiveGetLeaves(LeftChild, ref leafs); + RecursiveGetLeaves(RightChild, ref leafs); + return leafs; + } + + private void RecursiveGetLeaves(KdTreeNode leaf, ref List> leafs) + { + if (leaf == null) return; + if (leaf is KdTreeLeaf lLeaf) + { + leafs.Add(lLeaf); + } + else + { + RecursiveGetLeaves(leaf.LeftChild, ref leafs); + RecursiveGetLeaves(leaf.RightChild, ref leafs); + } + } + + public override string ToString() + { + return "Node->" + Value.ToString(); + } + } + } +} diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs index 34df2ade..dbb2ad8a 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs @@ -230,7 +230,7 @@ return null; } - var closestWordIndex = pointR.FindIndexNearest(wordsWithinAngleBoundDistancePoints, p => p, + var closestWordIndex = Distances.FindIndexNearest(pointR, wordsWithinAngleBoundDistancePoints, p => p, p => p, Distances.Euclidean, out _); if (closestWordIndex < 0 || closestWordIndex >= wordsWithinAngleBoundDistancePoints.Count)