diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs
index fa2b4494..58bc2d5e 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs
@@ -1,11 +1,10 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
+ using Core;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
- using Geometry;
- using Core;
///
/// Clustering Algorithms.
@@ -16,7 +15,7 @@
/// Algorithm to group elements using nearest neighbours.
///
/// Letter, Word, TextLine, etc.
- /// List of elements to group.
+ /// Elements to group.
/// The distance measure between two points.
/// The function that determines the maximum distance between two points in the same cluster.
/// The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.
@@ -26,7 +25,7 @@
/// Sets the maximum number of concurrent tasks enabled.
/// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
- internal static IEnumerable> ClusterNearestNeighbours(List elements,
+ internal static IEnumerable> ClusterNearestNeighbours(IReadOnlyList elements,
Func distMeasure,
Func maxDistanceFunction,
Func pivotPoint, Func candidatesPoint,
@@ -51,7 +50,7 @@
*************************************************************************************/
int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();
- var candidatesPoints = elements.Select(candidatesPoint).ToList();
+ KdTree kdTree = new KdTree(elements, candidatesPoint);
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
@@ -62,92 +61,17 @@
if (filterPivot(pivot))
{
- int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist);
+ var paired = kdTree.FindNearestNeighbours(pivot, pivotPoint, distMeasure, out int index, out double dist);
- if (index != -1)
+ if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
{
- var paired = elements[index];
- if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
- {
- indexes[e] = index;
- }
+ indexes[e] = index;
}
}
});
// 2. Group indexes
- var groupedIndexes = GroupIndexes(indexes);
-
- return groupedIndexes;
- }
-
- ///
- /// Algorithm to group elements using nearest neighbours.
- ///
- /// Letter, Word, TextLine, etc.
- /// Array of elements to group.
- /// The distance measure between two points.
- /// The function that determines the maximum distance between two points in the same cluster.
- /// The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.
- /// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.
- /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.
- /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.
- /// Sets the maximum number of concurrent tasks enabled.
- /// A positive property value limits the number of concurrent operations to the set value.
- /// If it is -1, there is no limit on the number of concurrently running operations.
- internal static IEnumerable> ClusterNearestNeighbours(T[] elements,
- Func distMeasure,
- Func maxDistanceFunction,
- Func pivotPoint, Func candidatesPoint,
- Func filterPivot, Func filterFinal,
- int maxDegreeOfParallelism)
- {
- /*************************************************************************************
- * Algorithm steps
- * 1. Find nearest neighbours indexes (done in parallel)
- * Iterate every point (pivot) and put its nearest neighbour's index in an array
- * e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
- * Only conciders a neighbour if it is within the maximum distance.
- * If not within the maximum distance, index will be set to -1.
- * Each element has only one connected neighbour.
- * NB: Given the possible asymmetry in the relationship, it is possible
- * that if indexes[i] = j then indexes[j] != i.
- *
- * 2. Group indexes
- * Group indexes if share neighbours in common - Depth-first search
- * e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
- * (i,j,k) will form a group and (m,n) will form another group.
- *************************************************************************************/
-
- int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();
- var candidatesPoints = elements.Select(candidatesPoint).ToList();
-
- ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
-
- // 1. Find nearest neighbours indexes
- Parallel.For(0, elements.Length, parallelOptions, e =>
- {
- var pivot = elements[e];
-
- if (filterPivot(pivot))
- {
- int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist);
-
- if (index != -1)
- {
- var paired = elements[index];
- if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
- {
- indexes[e] = index;
- }
- }
- }
- });
-
- // 2. Group indexes
- var groupedIndexes = GroupIndexes(indexes);
-
- return groupedIndexes;
+ return GroupIndexes(indexes);
}
///
@@ -189,7 +113,6 @@
*************************************************************************************/
int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();
- var candidatesLines = elements.Select(x => candidatesLine(x)).ToList();
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
@@ -200,7 +123,7 @@
if (filterPivot(pivot))
{
- int index = pivot.FindIndexNearest(elements, candidatesLine, pivotLine, distMeasure, out double dist);
+ int index = Distances.FindIndexNearest(pivot, elements, candidatesLine, pivotLine, distMeasure, out double dist);
if (index != -1)
{
@@ -214,9 +137,7 @@
});
// 2. Group indexes
- var groupedIndexes = GroupIndexes(indexes);
-
- return groupedIndexes;
+ return GroupIndexes(indexes);
}
///
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs
index 7f06f7e5..9fd5ad20 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs
@@ -130,7 +130,7 @@
///
/// The distance measure to use.
/// The distance between reference point, and its nearest neighbour.
- internal static int FindIndexNearest(this T element, IReadOnlyList candidates,
+ internal static int FindIndexNearest(T element, IReadOnlyList candidates,
Func candidatesPoint, Func pivotPoint,
Func distanceMeasure, out double distance)
{
@@ -172,7 +172,7 @@
///
/// The distance measure between two lines to use.
/// The distance between reference line, and its nearest neighbour.
- internal static int FindIndexNearest(this T element, IReadOnlyList candidates,
+ internal static int FindIndexNearest(T element, IReadOnlyList candidates,
Func candidatesLine, Func pivotLine,
Func distanceMeasure, out double distance)
{
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs
new file mode 100644
index 00000000..61f2329d
--- /dev/null
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs
@@ -0,0 +1,230 @@
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+ using System;
+ using System.Collections.Generic;
+ using System.Linq;
+ using UglyToad.PdfPig.Core;
+
+ // for kd-tree with line segments, see https://stackoverflow.com/questions/14376679/how-to-represent-line-segments-in-kd-tree
+
+ /// <summary>
+ /// K-D tree over raw <see cref="PdfPoint"/> candidates: every candidate is used directly as its own search point.
+ /// </summary>
+ internal class KdTree : KdTree
+ {
+     /// <summary>
+     /// Builds the tree over <paramref name="candidates"/>, passing the identity function to the base class as point selector.
+     /// </summary>
+     /// <param name="candidates">The points to index.</param>
+     public KdTree(PdfPoint[] candidates)
+         : base(candidates, point => point)
+     { }
+
+     /// <summary>
+     /// Finds the candidate nearest to <paramref name="pivot"/> by forwarding to the base-class search with an identity point selector.
+     /// </summary>
+     /// <param name="pivot">The reference point.</param>
+     /// <param name="distanceMeasure">The distance measure between two points.</param>
+     /// <param name="index">Set by the base-class search to the position of the result in the candidates array.</param>
+     /// <param name="distance">Set by the base-class search to the distance between the pivot and the result.</param>
+     /// <returns>The point returned by the base-class search.</returns>
+     public PdfPoint FindNearestNeighbours(PdfPoint pivot, Func distanceMeasure, out int index, out double distance)
+         => FindNearestNeighbours(pivot, point => point, distanceMeasure, out index, out distance);
+ }
+
+ internal class KdTree
+ {
+ // Root of the tree. Stays null when the tree is built from a null or empty candidate list.
+ private readonly KdTreeNode Root;
+
+ /// <summary>
+ /// Builds a K-D tree over <paramref name="candidates"/>.
+ /// </summary>
+ /// <param name="candidates">The elements to index. A null or empty list produces an empty tree.</param>
+ /// <param name="candidatesPointFunc">Selects, for each candidate, the point used to place it in the tree.</param>
+ public KdTree(IReadOnlyList candidates, Func candidatesPointFunc)
+ {
+     // Check for null/empty BEFORE reading candidates.Count below; the previous order
+     // dereferenced the list first, so the null check could never be reached with a null list.
+     if (candidates == null || candidates.Count == 0)
+     {
+         return;
+     }
+
+     // Pair every candidate with its original position and its extracted point: (index, point, element).
+     var pointsIndex = Enumerable.Range(0, candidates.Count).Zip(candidates, (e, p) => (e, candidatesPointFunc(p), p)).ToList();
+     Root = BuildTree(pointsIndex, 0);
+ }
+
+ /// <summary>
+ /// Recursively builds the subtree holding the given (index, point, element) entries.
+ /// The entries are ordered along X at even depths and along Y at odd depths; the median entry
+ /// becomes the node, the entries before it form the left child and the entries after it the right child.
+ /// </summary>
+ /// <param name="P">The entries to place in this subtree.</param>
+ /// <param name="depth">Depth of the node being built; its parity selects the splitting axis.</param>
+ /// <returns>The node at the root of the subtree.</returns>
+ private KdTreeNode BuildTree(IReadOnlyList<(int, PdfPoint, T)> P, int depth)
+ {
+     // Even depth: order by X. Odd depth: order by Y.
+     var ordered = (depth % 2 == 0
+         ? P.OrderBy(entry => entry.Item2.X)
+         : P.OrderBy(entry => entry.Item2.Y)).ToArray();
+
+     var medianIndex = ordered.Length / 2;
+
+     // An empty partition yields no child, a single entry yields a leaf, anything larger recurses.
+     // Note: leaves are created with the current depth, not depth + 1.
+     KdTreeNode BuildChild((int, PdfPoint, T)[] partition)
+     {
+         switch (partition.Length)
+         {
+             case 0:
+                 return null;
+             case 1:
+                 var only = partition[0];
+                 return new KdTreeLeaf(only.Item2, only.Item3, depth, only.Item1);
+             default:
+                 return BuildTree(partition, depth + 1);
+         }
+     }
+
+     // Left side: everything before the median. Right side: everything after it.
+     var lower = BuildChild(ordered.Take(medianIndex).ToArray());
+     var upper = BuildChild(ordered.Skip(medianIndex + 1).ToArray());
+
+     var pivotEntry = ordered[medianIndex];
+     return new KdTreeNode(lower, upper, pivotEntry.Item2, pivotEntry.Item3, depth, pivotEntry.Item1);
+ }
+
+ #region NN
+ /// <summary>
+ /// Finds the element of the tree nearest to <paramref name="pivot"/>. The pivot itself is never returned.
+ /// </summary>
+ /// <param name="pivot">The element whose nearest neighbour is wanted.</param>
+ /// <param name="pivotPointFunc">Selects the point of the pivot that distances are measured from.</param>
+ /// <param name="distanceMeasure">The distance measure between two points.</param>
+ /// <param name="index">Position of the neighbour in the candidates list given to the constructor, or -1 when no neighbour exists.</param>
+ /// <param name="distance">Distance between the pivot's point and the neighbour's point, or <see cref="double.NaN"/> when no neighbour exists.</param>
+ /// <returns>The nearest element, or the default value of T when the tree is empty or holds nothing but the pivot.</returns>
+ public T FindNearestNeighbours(T pivot, Func pivotPointFunc, Func distanceMeasure, out int index, out double distance)
+ {
+     var result = FindNearestNeighbours(Root, pivot, pivotPointFunc, distanceMeasure);
+     var nearest = result.Item1;
+
+     if (nearest == null)
+     {
+         // Empty tree, or the pivot is the only element: report "not found" rather than dereferencing null.
+         // NaN makes every "distance < threshold" comparison in callers evaluate to false.
+         index = -1;
+         distance = double.NaN;
+         return default(T);
+     }
+
+     index = nearest.Index;
+     distance = result.Item2.Value;
+     return nearest.Element;
+ }
+
+ /// <summary>
+ /// Recursive nearest-neighbour search below <paramref name="node"/>.
+ /// Returns (null, null) when the subtree contains no element other than the pivot.
+ /// </summary>
+ private static (KdTreeNode, double?) FindNearestNeighbours(KdTreeNode node, T pivot, Func pivotPointFunc, Func distance)
+ {
+     if (node == null)
+     {
+         return (null, null);
+     }
+
+     var point = pivotPointFunc(pivot);
+
+     // The current node is a candidate only if it is not the pivot. Previously an internal node
+     // was always taken as the initial best, so a pivot stored in an internal node was returned
+     // as its own neighbour and its (near-)zero distance pruned the rest of the search.
+     KdTreeNode best = null;
+     double? bestDistance = null;
+     if (!node.Element.Equals(pivot))
+     {
+         best = node;
+         bestDistance = distance(node.Value, point);
+     }
+
+     if (node.IsLeaf)
+     {
+         return (best, bestDistance);
+     }
+
+     // Coordinate of the pivot along this node's splitting axis (X when Depth is 0, Y otherwise).
+     var pointValue = node.Depth == 0 ? point.X : point.Y;
+
+     // Descend first into the side of the split that contains the pivot's point.
+     var startLeft = pointValue < node.L;
+     var nearSide = startLeft ? node.LeftChild : node.RightChild;
+     var farSide = startLeft ? node.RightChild : node.LeftChild;
+
+     var (nearNode, nearDistance) = FindNearestNeighbours(nearSide, pivot, pivotPointFunc, distance);
+     if (nearDistance.HasValue && (!bestDistance.HasValue || nearDistance.Value <= bestDistance.Value))
+     {
+         best = nearNode;
+         bestDistance = nearDistance;
+     }
+
+     // The other side is searched when nothing has been found yet, or when the best distance so far
+     // reaches across the splitting line.
+     // NOTE(review): comparing the distance measure with a coordinate gap presumably assumes the
+     // measure is never smaller than the gap along one axis (true for Euclidean) - confirm for other measures.
+     if (farSide != null && (!bestDistance.HasValue || Math.Abs(pointValue - node.L) <= bestDistance.Value))
+     {
+         var (farNode, farDistance) = FindNearestNeighbours(farSide, pivot, pivotPointFunc, distance);
+         if (farDistance.HasValue && (!bestDistance.HasValue || farDistance.Value <= bestDistance.Value))
+         {
+             best = farNode;
+             bestDistance = farDistance;
+         }
+     }
+
+     return (best, bestDistance);
+ }
+ #endregion
+
+ /// <summary>
+ /// Terminal node of the tree: it is constructed without children and reports <see cref="IsLeaf"/> as true.
+ /// </summary>
+ private class KdTreeLeaf : KdTreeNode
+ {
+     /// <summary>
+     /// Creates a leaf; both child references passed to the base constructor are null.
+     /// </summary>
+     public KdTreeLeaf(PdfPoint l, Q element, int depth, int index)
+         : base(null, null, l, element, depth, index)
+     { }
+
+     /// <summary>
+     /// Always true for a leaf.
+     /// </summary>
+     public override bool IsLeaf => true;
+
+     /// <summary>
+     /// Text form of the leaf: the prefix "Leaf->" followed by the string form of its point.
+     /// </summary>
+     public override string ToString() => "Leaf->" + Value.ToString();
+ }
+
+ /// <summary>
+ /// A node of the K-D tree: holds one element, the point it was placed by, and up to two children.
+ /// </summary>
+ private class KdTreeNode
+ {
+ /// <summary>
+ /// Split value: the X coordinate of <see cref="Value"/> when <see cref="Depth"/> is 0, its Y coordinate otherwise.
+ /// </summary>
+ public double L => Depth == 0 ? Value.X : Value.Y;
+
+ /// <summary>
+ /// The point this node was placed by.
+ /// </summary>
+ public PdfPoint Value { get; }
+
+ /// <summary>
+ /// Left child, or null if there is none.
+ /// </summary>
+ public KdTreeNode LeftChild { get; internal set; }
+
+ /// <summary>
+ /// Right child, or null if there is none.
+ /// </summary>
+ public KdTreeNode RightChild { get; internal set; }
+
+ /// <summary>
+ /// The element stored in this node.
+ /// </summary>
+ public Q Element { get; }
+
+ /// <summary>
+ /// Parity of the depth given to the constructor: 0 is even (x), 1 is odd (y).
+ /// </summary>
+ public int Depth { get; }
+
+ /// <summary>
+ /// False for a plain node; overridable by derived node types.
+ /// </summary>
+ public virtual bool IsLeaf => false;
+
+ /// <summary>
+ /// The index value given to the constructor for this node's element.
+ /// </summary>
+ public int Index { get; }
+
+ /// <summary>
+ /// Creates a node from its children, point, element, depth and index.
+ /// Only the parity of <paramref name="depth"/> is kept.
+ /// </summary>
+ public KdTreeNode(KdTreeNode leftChild, KdTreeNode rightChild, PdfPoint l, Q element, int depth, int index)
+ {
+ LeftChild = leftChild;
+ RightChild = rightChild;
+ Value = l;
+ Element = element;
+ // Store the parity only: it is all that is needed to pick the splitting axis.
+ Depth = depth % 2;
+ Index = index;
+ }
+
+ /// <summary>
+ /// Collects the leaves found below this node, left subtree first, then right subtree.
+ /// The node itself is not examined, only its children.
+ /// </summary>
+ public IEnumerable> GetLeaves()
+ {
+ var leafs = new List>();
+ RecursiveGetLeaves(LeftChild, ref leafs);
+ RecursiveGetLeaves(RightChild, ref leafs);
+ return leafs;
+ }
+
+ // Depth-first walk: adds the given node to the list if it is a leaf, otherwise descends into its children.
+ private void RecursiveGetLeaves(KdTreeNode leaf, ref List> leafs)
+ {
+ // A missing child ends this branch of the walk.
+ if (leaf == null) return;
+ if (leaf is KdTreeLeaf lLeaf)
+ {
+ leafs.Add(lLeaf);
+ }
+ else
+ {
+ RecursiveGetLeaves(leaf.LeftChild, ref leafs);
+ RecursiveGetLeaves(leaf.RightChild, ref leafs);
+ }
+ }
+
+ /// <summary>
+ /// Text form of the node: the prefix "Node->" followed by the string form of its point.
+ /// </summary>
+ public override string ToString()
+ {
+ return "Node->" + Value.ToString();
+ }
+ }
+ }
+}
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
index 34df2ade..dbb2ad8a 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
@@ -230,7 +230,7 @@
return null;
}
- var closestWordIndex = pointR.FindIndexNearest(wordsWithinAngleBoundDistancePoints, p => p,
+ var closestWordIndex = Distances.FindIndexNearest(pointR, wordsWithinAngleBoundDistancePoints, p => p,
p => p, Distances.Euclidean, out _);
if (closestWordIndex < 0 || closestWordIndex >= wordsWithinAngleBoundDistancePoints.Count)