mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-19 10:47:56 +08:00
Add kd tree and improve clustering
This commit is contained in:
@@ -1,11 +1,10 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
using Core;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using Geometry;
|
||||
using Core;
|
||||
|
||||
/// <summary>
|
||||
/// Clustering Algorithms.
|
||||
@@ -16,7 +15,7 @@
|
||||
/// Algorithm to group elements using nearest neighbours.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
|
||||
/// <param name="elements">List of elements to group.</param>
|
||||
/// <param name="elements">Elements to group.</param>
|
||||
/// <param name="distMeasure">The distance measure between two points.</param>
|
||||
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
|
||||
/// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
|
||||
@@ -26,7 +25,7 @@
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(List<T> elements,
|
||||
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(IReadOnlyList<T> elements,
|
||||
Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
Func<T, T, double> maxDistanceFunction,
|
||||
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
|
||||
@@ -51,7 +50,7 @@
|
||||
*************************************************************************************/
|
||||
|
||||
int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();
|
||||
var candidatesPoints = elements.Select(candidatesPoint).ToList();
|
||||
KdTree<T> kdTree = new KdTree<T>(elements, candidatesPoint);
|
||||
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
@@ -62,92 +61,17 @@
|
||||
|
||||
if (filterPivot(pivot))
|
||||
{
|
||||
int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist);
|
||||
var paired = kdTree.FindNearestNeighbours(pivot, pivotPoint, distMeasure, out int index, out double dist);
|
||||
|
||||
if (index != -1)
|
||||
{
|
||||
var paired = elements[index];
|
||||
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
|
||||
{
|
||||
indexes[e] = index;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// 2. Group indexes
|
||||
var groupedIndexes = GroupIndexes(indexes);
|
||||
|
||||
return groupedIndexes;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Algorithm to group elements using nearest neighbours.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
|
||||
/// <param name="elements">Array of elements to group.</param>
|
||||
/// <param name="distMeasure">The distance measure between two points.</param>
|
||||
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
|
||||
/// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
|
||||
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(T[] elements,
|
||||
Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
Func<T, T, double> maxDistanceFunction,
|
||||
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
|
||||
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
|
||||
int maxDegreeOfParallelism)
|
||||
{
|
||||
/*************************************************************************************
|
||||
* Algorithm steps
|
||||
* 1. Find nearest neighbours indexes (done in parallel)
|
||||
* Iterate every point (pivot) and put its nearest neighbour's index in an array
|
||||
* e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
|
||||
* Only considers a neighbour if it is within the maximum distance.
|
||||
* If not within the maximum distance, index will be set to -1.
|
||||
* Each element has only one connected neighbour.
|
||||
* NB: Given the possible asymmetry in the relationship, it is possible
|
||||
* that if indexes[i] = j then indexes[j] != i.
|
||||
*
|
||||
* 2. Group indexes
|
||||
* Group indexes if share neighbours in common - Depth-first search
|
||||
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
|
||||
* (i,j,k) will form a group and (m,n) will form another group.
|
||||
*************************************************************************************/
|
||||
|
||||
int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();
|
||||
var candidatesPoints = elements.Select(candidatesPoint).ToList();
|
||||
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
// 1. Find nearest neighbours indexes
|
||||
Parallel.For(0, elements.Length, parallelOptions, e =>
|
||||
{
|
||||
var pivot = elements[e];
|
||||
|
||||
if (filterPivot(pivot))
|
||||
{
|
||||
int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist);
|
||||
|
||||
if (index != -1)
|
||||
{
|
||||
var paired = elements[index];
|
||||
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
|
||||
{
|
||||
indexes[e] = index;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// 2. Group indexes
|
||||
var groupedIndexes = GroupIndexes(indexes);
|
||||
|
||||
return groupedIndexes;
|
||||
return GroupIndexes(indexes);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -189,7 +113,6 @@
|
||||
*************************************************************************************/
|
||||
|
||||
int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();
|
||||
var candidatesLines = elements.Select(x => candidatesLine(x)).ToList();
|
||||
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
@@ -200,7 +123,7 @@
|
||||
|
||||
if (filterPivot(pivot))
|
||||
{
|
||||
int index = pivot.FindIndexNearest(elements, candidatesLine, pivotLine, distMeasure, out double dist);
|
||||
int index = Distances.FindIndexNearest(pivot, elements, candidatesLine, pivotLine, distMeasure, out double dist);
|
||||
|
||||
if (index != -1)
|
||||
{
|
||||
@@ -214,9 +137,7 @@
|
||||
});
|
||||
|
||||
// 2. Group indexes
|
||||
var groupedIndexes = GroupIndexes(indexes);
|
||||
|
||||
return groupedIndexes;
|
||||
return GroupIndexes(indexes);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@@ -130,7 +130,7 @@
|
||||
/// <param name="pivotPoint"></param>
|
||||
/// <param name="distanceMeasure">The distance measure to use.</param>
|
||||
/// <param name="distance">The distance between reference point, and its nearest neighbour.</param>
|
||||
internal static int FindIndexNearest<T>(this T element, IReadOnlyList<T> candidates,
|
||||
internal static int FindIndexNearest<T>(T element, IReadOnlyList<T> candidates,
|
||||
Func<T, PdfPoint> candidatesPoint, Func<T, PdfPoint> pivotPoint,
|
||||
Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
|
||||
{
|
||||
@@ -172,7 +172,7 @@
|
||||
/// <param name="pivotLine"></param>
|
||||
/// <param name="distanceMeasure">The distance measure between two lines to use.</param>
|
||||
/// <param name="distance">The distance between reference line, and its nearest neighbour.</param>
|
||||
internal static int FindIndexNearest<T>(this T element, IReadOnlyList<T> candidates,
|
||||
internal static int FindIndexNearest<T>(T element, IReadOnlyList<T> candidates,
|
||||
Func<T, PdfLine> candidatesLine, Func<T, PdfLine> pivotLine,
|
||||
Func<PdfLine, PdfLine, double> distanceMeasure, out double distance)
|
||||
{
|
||||
|
230
src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs
Normal file
230
src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs
Normal file
@@ -0,0 +1,230 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.Core;
|
||||
|
||||
// for kd-tree with line segments, see https://stackoverflow.com/questions/14376679/how-to-represent-line-segments-in-kd-tree
|
||||
|
||||
/// <summary>
/// K-d tree specialised for <see cref="PdfPoint"/>s: every candidate point is its own element,
/// so the point-selector passed to the generic tree is the identity function.
/// </summary>
internal class KdTree : KdTree<PdfPoint>
{
    /// <summary>
    /// Builds the tree from the given points.
    /// </summary>
    /// <param name="candidates">The points to index.</param>
    public KdTree(PdfPoint[] candidates)
        : base(candidates, point => point)
    {
    }

    /// <summary>
    /// Searches the tree for the neighbour of <paramref name="pivot"/>, using the point itself as the pivot point.
    /// </summary>
    /// <param name="pivot">The point to find a neighbour for.</param>
    /// <param name="distanceMeasure">The distance measure between two points.</param>
    /// <param name="index">Index reported by the generic search for the returned point.</param>
    /// <param name="distance">Distance reported by the generic search for the returned point.</param>
    public PdfPoint FindNearestNeighbours(PdfPoint pivot, Func<PdfPoint, PdfPoint, double> distanceMeasure, out int index, out double distance)
        => FindNearestNeighbours(pivot, point => point, distanceMeasure, out index, out distance);
}
|
||||
|
||||
internal class KdTree<T>
|
||||
{
|
||||
private KdTreeNode<T> Root;
|
||||
|
||||
/// <summary>
/// Builds a k-d tree over the candidates, keyed by the point returned by <paramref name="candidatesPointFunc"/>.
/// <para>If <paramref name="candidates"/> is null or empty, no tree is built and the root stays null.</para>
/// </summary>
/// <param name="candidates">The elements to index. Each element is stored together with its position in this list.</param>
/// <param name="candidatesPointFunc">Returns the point used to place an element in the tree.</param>
public KdTree(IReadOnlyList<T> candidates, Func<T, PdfPoint> candidatesPointFunc)
{
    // The null check must come before the first use of 'candidates';
    // previously 'candidates.Count' was read first, so a null list threw a NullReferenceException.
    if (candidates == null || candidates.Count == 0)
    {
        return;
    }

    // Pair every element with its original index and its point, so the index survives the sorting done while building.
    var pointsIndex = new (int, PdfPoint, T)[candidates.Count];
    for (int i = 0; i < pointsIndex.Length; i++)
    {
        T candidate = candidates[i];
        pointsIndex[i] = (i, candidatesPointFunc(candidate), candidate);
    }

    Root = BuildTree(pointsIndex, 0);
}
|
||||
|
||||
/// <summary>
/// Recursively builds the (sub)tree holding the given points.
/// <para>The points are sorted on X when <paramref name="depth"/> is even and on Y when it is odd. The element at
/// position Count / 2 of the sorted points becomes the node; the elements before it form the left child and the
/// elements after it form the right child.</para>
/// </summary>
/// <param name="P">Tuples of (original index, point, element) to store in this subtree.</param>
/// <param name="depth">Depth of the node being built.</param>
private KdTreeNode<T> BuildTree(IReadOnlyList<(int, PdfPoint, T)> P, int depth)
{
    bool splitOnX = depth % 2 == 0;
    (int, PdfPoint, T)[] ordered = (splitOnX
        ? P.OrderBy(p => p.Item2.X)
        : P.OrderBy(p => p.Item2.Y)).ToArray();

    int medianIndex = P.Count / 2;

    // Left side is built completely before the right side.
    KdTreeNode<T> left = BuildChild(ordered.Take(medianIndex).ToArray(), depth);
    KdTreeNode<T> right = BuildChild(ordered.Skip(medianIndex + 1).ToArray(), depth);

    var (index, point, element) = ordered[medianIndex];
    return new KdTreeNode<T>(left, right, point, element, depth, index);
}

/// <summary>
/// Builds one child of a node: nothing for no points, a leaf for a single point, a subtree otherwise.
/// </summary>
/// <param name="points">The points belonging to that side of the parent.</param>
/// <param name="parentDepth">Depth of the parent node. A leaf is created with this same depth; a subtree with depth + 1.</param>
private KdTreeNode<T> BuildChild((int, PdfPoint, T)[] points, int parentDepth)
{
    switch (points.Length)
    {
        case 0:
            return null;

        case 1:
            var only = points[0];
            return new KdTreeLeaf<T>(only.Item2, only.Item3, parentDepth, only.Item1);

        default:
            return BuildTree(points, parentDepth + 1);
    }
}
|
||||
|
||||
#region NN
|
||||
/// <summary>
/// Finds the element of the tree nearest to <paramref name="pivot"/>, ignoring elements equal to the pivot itself.
/// </summary>
/// <param name="pivot">The element to find a neighbour for.</param>
/// <param name="pivotPointFunc">Returns the point of the pivot to measure from.</param>
/// <param name="distanceMeasure">The distance measure between two points.</param>
/// <param name="index">Index of the neighbour in the list the tree was built from, or -1 if none was found.</param>
/// <param name="distance">Distance between the pivot and the neighbour, or <see cref="double.NaN"/> if none was found.</param>
/// <returns>The nearest element, or the default value of <typeparamref name="T"/> if none was found.</returns>
public T FindNearestNeighbours(T pivot, Func<T, PdfPoint> pivotPointFunc, Func<PdfPoint, PdfPoint, double> distanceMeasure, out int index, out double distance)
{
    var (nearest, nearestDistance) = FindNearestNeighbours(Root, pivot, pivotPointFunc, distanceMeasure);

    // The recursive search yields (null, null) when the tree is empty or holds nothing but the pivot.
    // Report that with the -1 sentinel instead of dereferencing the null node / empty nullable.
    if (nearest == null || !nearestDistance.HasValue)
    {
        index = -1;
        distance = double.NaN;
        return default(T);
    }

    index = nearest.Index;
    distance = nearestDistance.Value;
    return nearest.Element;
}

/// <summary>
/// Recursive nearest neighbour search in the subtree rooted at <paramref name="node"/>.
/// </summary>
/// <param name="node">Root of the subtree to search; may be null.</param>
/// <param name="pivot">The element to find a neighbour for. Nodes whose element equals it are never returned.</param>
/// <param name="pivotPointFunc">Returns the point of the pivot to measure from.</param>
/// <param name="distance">The distance measure between two points.</param>
/// <returns>The nearest node and its distance to the pivot, or (null, null) if the subtree has no eligible node.</returns>
private static (KdTreeNode<T>, double?) FindNearestNeighbours(KdTreeNode<T> node, T pivot, Func<T, PdfPoint> pivotPointFunc, Func<PdfPoint, PdfPoint, double> distance)
{
    if (node == null)
    {
        return (null, null);
    }

    if (node.IsLeaf)
    {
        if (node.Element.Equals(pivot))
        {
            return (null, null);
        }

        return (node, distance(node.Value, pivotPointFunc(pivot)));
    }

    var point = pivotPointFunc(pivot);

    // Start from this node, unless this node *is* the pivot: seeding the search with the pivot
    // (distance 0 to itself) would reject every real neighbour below it and could return the pivot itself.
    KdTreeNode<T> currentNearestNode = null;
    double currentDistance = double.PositiveInfinity;
    if (!node.Element.Equals(pivot))
    {
        currentNearestNode = node;
        currentDistance = distance(node.Value, point);
    }

    var pointValue = node.Depth == 0 ? point.X : point.Y;
    bool startLeft = pointValue < node.L;

    // Search first the side of the split the pivot falls in.
    var nearChild = startLeft ? node.LeftChild : node.RightChild;
    var (nearNode, nearDist) = FindNearestNeighbours(nearChild, pivot, pivotPointFunc, distance);
    if (nearNode != null && nearDist.HasValue && nearDist.Value <= currentDistance && !nearNode.Element.Equals(pivot))
    {
        currentDistance = nearDist.Value;
        currentNearestNode = nearNode;
    }

    // Search the other side only if the best distance so far reaches across the split value.
    // NOTE(review): this pruning presumes the distance measure is never smaller than the difference
    // along the split axis (as for Euclidean distance) - confirm for custom measures.
    var farChild = startLeft ? node.RightChild : node.LeftChild;
    bool reachesFarSide = startLeft
        ? pointValue + currentDistance >= node.L
        : pointValue - currentDistance <= node.L;

    if (farChild != null && reachesFarSide)
    {
        var (farNode, farDist) = FindNearestNeighbours(farChild, pivot, pivotPointFunc, distance);
        if (farNode != null && farDist.HasValue && farDist.Value <= currentDistance && !farNode.Element.Equals(pivot))
        {
            currentDistance = farDist.Value;
            currentNearestNode = farNode;
        }
    }

    if (currentNearestNode == null)
    {
        return (null, null);
    }

    return (currentNearestNode, currentDistance);
}
|
||||
#endregion
|
||||
|
||||
/// <summary>
/// A tree node created without children, which reports <see cref="IsLeaf"/> as true.
/// </summary>
private class KdTreeLeaf<Q> : KdTreeNode<Q>
{
    /// <summary>
    /// Creates a leaf holding the given point and element; both children are null.
    /// </summary>
    public KdTreeLeaf(PdfPoint l, Q element, int depth, int index)
        : base(null, null, l, element, depth, index)
    {
    }

    /// <summary>
    /// Always true for a leaf.
    /// </summary>
    public override bool IsLeaf
    {
        get { return true; }
    }

    public override string ToString() => string.Concat("Leaf->", Value.ToString());
}
|
||||
|
||||
/// <summary>
/// A node of the tree: a point, the element it belongs to, the element's index and up to two children.
/// </summary>
private class KdTreeNode<Q>
{
    /// <summary>
    /// Creates a node. Only the parity of <paramref name="depth"/> is kept (see <see cref="Depth"/>).
    /// </summary>
    public KdTreeNode(KdTreeNode<Q> leftChild, KdTreeNode<Q> rightChild, PdfPoint l, Q element, int depth, int index)
    {
        Value = l;
        Element = element;
        Index = index;
        Depth = depth % 2;
        LeftChild = leftChild;
        RightChild = rightChild;
    }

    /// <summary>
    /// The point stored in this node.
    /// </summary>
    public PdfPoint Value { get; }

    /// <summary>
    /// The element stored in this node.
    /// </summary>
    public Q Element { get; }

    /// <summary>
    /// The index passed in at construction.
    /// </summary>
    public int Index { get; }

    /// <summary>
    /// 0 is even (x), 1 is odd (y).
    /// </summary>
    public int Depth { get; }

    public KdTreeNode<Q> LeftChild { get; internal set; }

    public KdTreeNode<Q> RightChild { get; internal set; }

    public virtual bool IsLeaf => false;

    /// <summary>
    /// Split value: the X coordinate of <see cref="Value"/> when <see cref="Depth"/> is 0, its Y coordinate otherwise.
    /// </summary>
    public double L
    {
        get
        {
            if (Depth == 0)
            {
                return Value.X;
            }

            return Value.Y;
        }
    }

    /// <summary>
    /// Collects the leaves found below this node, left subtree first.
    /// </summary>
    public IEnumerable<KdTreeLeaf<Q>> GetLeaves()
    {
        var collected = new List<KdTreeLeaf<Q>>();
        CollectLeaves(LeftChild, collected);
        CollectLeaves(RightChild, collected);
        return collected;
    }

    /// <summary>
    /// Adds <paramref name="current"/> to <paramref name="sink"/> if it is a leaf, otherwise descends into its children.
    /// </summary>
    private void CollectLeaves(KdTreeNode<Q> current, List<KdTreeLeaf<Q>> sink)
    {
        if (current == null)
        {
            return;
        }

        if (current is KdTreeLeaf<Q> found)
        {
            sink.Add(found);
            return;
        }

        CollectLeaves(current.LeftChild, sink);
        CollectLeaves(current.RightChild, sink);
    }

    public override string ToString() => string.Concat("Node->", Value.ToString());
}
|
||||
}
|
||||
}
|
@@ -230,7 +230,7 @@
|
||||
return null;
|
||||
}
|
||||
|
||||
var closestWordIndex = pointR.FindIndexNearest(wordsWithinAngleBoundDistancePoints, p => p,
|
||||
var closestWordIndex = Distances.FindIndexNearest(pointR, wordsWithinAngleBoundDistancePoints, p => p,
|
||||
p => p, Distances.Euclidean, out _);
|
||||
|
||||
if (closestWordIndex < 0 || closestWordIndex >= wordsWithinAngleBoundDistancePoints.Count)
|
||||
|
Reference in New Issue
Block a user