2020-02-12 16:02:47 +00:00
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
using System ;
using System.Collections.Generic ;
using System.Linq ;
using UglyToad.PdfPig.Core ;
// for kd-tree with line segments, see https://stackoverflow.com/questions/14376679/how-to-represent-line-segments-in-kd-tree
2020-03-18 00:52:35 +00:00
/// <summary>
/// K-D tree of <see cref="PdfPoint"/> in which every point is its own tree element.
/// </summary>
public class KdTree : KdTree<PdfPoint>
{
    /// <summary>
    /// Builds a K-D tree directly from a list of points.
    /// </summary>
    /// <param name="points">The points used to build the tree.</param>
    public KdTree(IReadOnlyList<PdfPoint> points)
        : base(points, point => point)
    {
    }

    /// <summary>
    /// Finds the single nearest neighbour of the pivot point.
    /// Exactly one neighbour is reported, even when several points are equidistant.
    /// </summary>
    /// <param name="pivot">The point for which to find the nearest neighbour.</param>
    /// <param name="distanceMeasure">The distance measure used, e.g. the Euclidian distance.</param>
    /// <param name="index">The nearest neighbour's index (-1 if not found).</param>
    /// <param name="distance">The distance between the pivot and the nearest neighbour (<see cref="double.NaN"/> if not found).</param>
    /// <returns>The nearest neighbour's point.</returns>
    public PdfPoint FindNearestNeighbour(PdfPoint pivot, Func<PdfPoint, PdfPoint, double> distanceMeasure, out int index, out double distance)
        => FindNearestNeighbour(pivot, point => point, distanceMeasure, out index, out distance);

    /// <summary>
    /// Finds the k nearest neighbours of the pivot point.
    /// More than k neighbours may be returned when points are equidistant.
    /// <para>Use <see cref="FindNearestNeighbour(PdfPoint, Func{PdfPoint, PdfPoint, double}, out int, out double)"/> if only looking for the (single) closest point.</para>
    /// </summary>
    /// <param name="pivot">The point for which to find the nearest neighbours.</param>
    /// <param name="k">The number of neighbours to return. More than k may be returned when points are equidistant.</param>
    /// <param name="distanceMeasure">The distance measure used, e.g. the Euclidian distance.</param>
    /// <returns>A list of tuples of the k nearest neighbours. Tuples are (element, index, distance).</returns>
    public IReadOnlyList<(PdfPoint, int, double)> FindNearestNeighbours(PdfPoint pivot, int k, Func<PdfPoint, PdfPoint, double> distanceMeasure)
        => FindNearestNeighbours(pivot, k, point => point, distanceMeasure);
}
2020-03-18 00:52:35 +00:00
/// <summary>
/// K-D tree data structure.
/// </summary>
/// <typeparam name="T"></typeparam>
public class KdTree < T >
2020-02-12 16:02:47 +00:00
{
2020-03-18 00:52:35 +00:00
/// <summary>
/// The root of the tree.
/// </summary>
public readonly KdTreeNode<T> Root;

/// <summary>
/// Number of elements in the tree.
/// </summary>
public readonly int Count;

/// <summary>
/// K-D tree data structure.
/// </summary>
/// <param name="elements">The elements used to build the tree.</param>
/// <param name="elementsPointFunc">The function that converts the candidate elements into a <see cref="PdfPoint"/>.</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="elements"/> is null or empty.</exception>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="elementsPointFunc"/> is null.</exception>
public KdTree(IReadOnlyList<T> elements, Func<T, PdfPoint> elementsPointFunc)
{
    if (elements == null || elements.Count == 0)
    {
        throw new ArgumentException("KdTree(): candidates cannot be null or empty.", nameof(elements));
    }

    // Fail fast with a descriptive error instead of a NullReferenceException raised from inside the LINQ projection below.
    if (elementsPointFunc == null)
    {
        throw new ArgumentNullException(nameof(elementsPointFunc));
    }

    Count = elements.Count;

    // Each entry is (index in the original list, point of the element, element).
    var entries = Enumerable.Range(0, elements.Count)
                            .Zip(elements, (e, p) => (e, elementsPointFunc(p), p))
                            .ToArray();

    Root = BuildTree(entries, 0);
}
2020-03-04 10:53:25 +00:00
/// <summary>
/// Recursively builds the (sub)tree holding the given entries.
/// The split axis alternates with the depth: X on even depths, Y on odd depths.
/// </summary>
/// <param name="P">Entries as (original index, point, element). The array is sorted in place.</param>
/// <param name="depth">Depth of the node being built.</param>
/// <returns>The subtree's root, or <c>null</c> when there are no entries.</returns>
private KdTreeNode<T> BuildTree((int, PdfPoint, T)[] P, int depth)
{
    switch (P.Length)
    {
        case 0:
            return null;

        case 1:
            return new KdTreeLeaf<T>(P[0], depth);
    }

    // Order the entries along the axis this level splits on.
    Comparison<(int, PdfPoint, T)> compareOnAxis;
    if (depth % 2 == 0)
    {
        compareOnAxis = (a, b) => a.Item2.X.CompareTo(b.Item2.X);
    }
    else
    {
        compareOnAxis = (a, b) => a.Item2.Y.CompareTo(b.Item2.Y);
    }

    Array.Sort(P, compareOnAxis);

    int childDepth = depth + 1;

    if (P.Length == 2)
    {
        // Two entries: the larger one becomes the node, the smaller one its only (left) child.
        var onlyChild = new KdTreeLeaf<T>(P[0], childDepth);
        return new KdTreeNode<T>(onlyChild, null, P[1], depth);
    }

    int median = P.Length / 2;

    // Entries before the median go to the left subtree.
    var lowerHalf = new (int, PdfPoint, T)[median];
    Array.Copy(P, 0, lowerHalf, 0, lowerHalf.Length);

    // Entries after the median go to the right subtree.
    var upperHalf = new (int, PdfPoint, T)[P.Length - median - 1];
    Array.Copy(P, median + 1, upperHalf, 0, upperHalf.Length);

    KdTreeNode<T> left = BuildTree(lowerHalf, childDepth);
    KdTreeNode<T> right = BuildTree(upperHalf, childDepth);

    return new KdTreeNode<T>(left, right, P[median], depth);
}
#region NN
2020-03-09 11:10:33 +00:00
/// <summary>
/// Finds the single nearest neighbour of the pivot element.
/// Exactly one neighbour is reported, even when several points are equidistant.
/// </summary>
/// <param name="pivot">The element for which to find the nearest neighbour.</param>
/// <param name="pivotPointFunc">The function that converts the pivot element into a <see cref="PdfPoint"/>.</param>
/// <param name="distanceMeasure">The distance measure used, e.g. the Euclidian distance.</param>
/// <param name="index">The nearest neighbour's index (-1 if not found).</param>
/// <param name="distance">The distance between the pivot and the nearest neighbour (<see cref="double.NaN"/> if not found).</param>
/// <returns>The nearest neighbour's element, or the default value of <typeparamref name="T"/> if not found.</returns>
public T FindNearestNeighbour(T pivot, Func<T, PdfPoint> pivotPointFunc, Func<PdfPoint, PdfPoint, double> distanceMeasure, out int index, out double distance)
{
    (KdTreeNode<T> nearest, double? nearestDistance) = FindNearestNeighbour(Root, pivot, pivotPointFunc, distanceMeasure);

    distance = nearestDistance ?? double.NaN;

    if (nearest == null)
    {
        // Nothing found: report the sentinel values.
        index = -1;
        return default;
    }

    index = nearest.Index;
    return nearest.Element;
}
2020-03-10 13:36:44 +00:00
/// <summary>
/// Recursively searches the subtree rooted at <paramref name="node"/> for the element closest to the pivot.
/// Elements equal to the pivot are never reported, whether they sit in a leaf or in an internal node.
/// </summary>
/// <param name="node">Root of the subtree to search (may be null).</param>
/// <param name="pivot">The element for which to find the nearest neighbour.</param>
/// <param name="pivotPointFunc">The function that converts the pivot element into a <see cref="PdfPoint"/>.</param>
/// <param name="distance">The distance measure.</param>
/// <returns>
/// The nearest node and its distance to the pivot, or <c>(null, null)</c> when the subtree
/// contains no element other than the pivot.
/// </returns>
private static (KdTreeNode<T>, double?) FindNearestNeighbour(KdTreeNode<T> node, T pivot, Func<T, PdfPoint> pivotPointFunc, Func<PdfPoint, PdfPoint, double> distance)
{
    if (node == null)
    {
        return (null, null);
    }

    if (node.IsLeaf)
    {
        if (node.Element.Equals(pivot))
        {
            return (null, null);
        }

        return (node, distance(node.Value, pivotPointFunc(pivot)));
    }

    var point = pivotPointFunc(pivot);

    // Seed the search with this node unless it is the pivot itself. Seeding with the pivot
    // would make it win every comparison below, so either the pivot would be returned as its
    // own neighbour or the caller would discard this whole subtree's result.
    KdTreeNode<T> currentNearestNode = null;
    double currentDistance = double.PositiveInfinity;
    if (!node.Element.Equals(pivot))
    {
        currentNearestNode = node;
        currentDistance = distance(node.Value, point);
    }

    var pointValue = node.IsAxisCutX ? point.X : point.Y;

    // Search first the side of the split the pivot falls on.
    bool startLeft = pointValue < node.L;
    KdTreeNode<T> nearChild = startLeft ? node.LeftChild : node.RightChild;
    KdTreeNode<T> farChild = startLeft ? node.RightChild : node.LeftChild;

    (KdTreeNode<T> newNode, double? newDist) = FindNearestNeighbour(nearChild, pivot, pivotPointFunc, distance);
    if (newDist.HasValue && newDist.Value <= currentDistance && !newNode.Element.Equals(pivot))
    {
        currentDistance = newDist.Value;
        currentNearestNode = newNode;
    }

    // The other side is only worth visiting if the split line lies within the current best distance.
    // While no candidate has been found the distance is infinite, so the other side is always visited.
    bool farSideInReach = startLeft
        ? pointValue + currentDistance >= node.L
        : pointValue - currentDistance <= node.L;

    if (farChild != null && farSideInReach)
    {
        (newNode, newDist) = FindNearestNeighbour(farChild, pivot, pivotPointFunc, distance);
        if (newDist.HasValue && newDist.Value <= currentDistance && !newNode.Element.Equals(pivot))
        {
            currentDistance = newDist.Value;
            currentNearestNode = newNode;
        }
    }

    if (currentNearestNode == null)
    {
        return (null, null);
    }

    return (currentNearestNode, currentDistance);
}
#endregion
2020-03-10 13:36:44 +00:00
#region k - NN
/// <summary>
/// Get the k nearest neighbours to the pivot element.
/// Might return more than k neighbours if points are equidistant.
/// <para>Use <see cref="FindNearestNeighbour(T, Func{T, PdfPoint}, Func{PdfPoint, PdfPoint, double}, out int, out double)"/> if only looking for the (single) closest point.</para>
/// </summary>
/// <param name="pivot">The element for which to find the k nearest neighbours.</param>
/// <param name="k">The number of neighbours to return. Might return more than k neighbours if points are equidistant.
/// Zero yields an empty list.</param>
/// <param name="pivotPointFunc">The function that converts the pivot element into a <see cref="PdfPoint"/>.</param>
/// <param name="distanceMeasure">The distance measure used, e.g. the Euclidian distance.</param>
/// <returns>Returns a list of tuples of the k nearest neighbours. Tuples are (element, index, distance).</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="k"/> is negative.</exception>
public IReadOnlyList<(T, int, double)> FindNearestNeighbours(T pivot, int k, Func<T, PdfPoint> pivotPointFunc, Func<PdfPoint, PdfPoint, double> distanceMeasure)
{
    if (k < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(k), "The number of neighbours cannot be negative.");
    }

    if (k == 0)
    {
        // A zero-capacity queue cannot hold any candidate (the search would fail with a
        // KeyNotFoundException on the first one), and no neighbours were asked for anyway.
        return new List<(T, int, double)>();
    }

    var kdTreeNodes = new KNearestNeighboursQueue(k);
    FindNearestNeighbours(Root, pivot, k, pivotPointFunc, distanceMeasure, kdTreeNodes);

    // The queue groups nodes by distance; flatten it to (element, index, distance) tuples.
    return kdTreeNodes
        .SelectMany(n => n.Value.Select(e => (e.Element, e.Index, n.Key)))
        .ToList();
}
/// <summary>
/// Recursively searches the subtree rooted at <paramref name="node"/> and records the closest
/// elements found in <paramref name="queue"/>. Elements equal to the pivot are never recorded.
/// </summary>
/// <param name="node">Root of the subtree to search (may be null).</param>
/// <param name="pivot">The element for which to find the nearest neighbours.</param>
/// <param name="k">The number of neighbours requested; only forwarded to the recursive calls.</param>
/// <param name="pivotPointFunc">The function that converts the pivot element into a <see cref="PdfPoint"/>.</param>
/// <param name="distance">The distance measure.</param>
/// <param name="queue">The queue collecting the nearest neighbours found so far.</param>
/// <returns>
/// A (node, distance) pair the caller uses as its current bound: the queue's last entry when this
/// node was recorded, otherwise the node's own distance. <c>(null, NaN)</c> for an empty subtree
/// or a leaf equal to the pivot.
/// </returns>
private static (KdTreeNode<T>, double) FindNearestNeighbours(KdTreeNode<T> node, T pivot, int k,
    Func<T, PdfPoint> pivotPointFunc, Func<PdfPoint, PdfPoint, double> distance, KNearestNeighboursQueue queue)
{
    if (node == null)
    {
        return (null, double.NaN);
    }
    else if (node.IsLeaf)
    {
        if (node.Element.Equals(pivot))
        {
            return (null, double.NaN);
        }

        var currentDistance = distance(node.Value, pivotPointFunc(pivot));
        var currentNearestNode = node;

        if (!queue.IsFull || currentDistance <= queue.LastDistance)
        {
            queue.Add(currentDistance, currentNearestNode);
            currentDistance = queue.LastDistance;
            currentNearestNode = queue.LastElement;
        }

        return (currentNearestNode, currentDistance);
    }
    else
    {
        var point = pivotPointFunc(pivot);
        var currentNearestNode = node;
        var currentDistance = distance(node.Value, point);

        // Record this node unless it is the pivot itself.
        if ((!queue.IsFull || currentDistance <= queue.LastDistance) && !node.Element.Equals(pivot))
        {
            queue.Add(currentDistance, currentNearestNode);
            currentDistance = queue.LastDistance;
            currentNearestNode = queue.LastElement;
        }

        KdTreeNode<T> newNode = null;
        double newDist = double.NaN;

        var pointValue = node.IsAxisCutX ? point.X : point.Y;

        if (pointValue < node.L)
        {
            // start left
            (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, k, pivotPointFunc, distance, queue);

            if (!double.IsNaN(newDist) && newDist <= currentDistance && !newNode.Element.Equals(pivot))
            {
                queue.Add(newDist, newNode);
                currentDistance = queue.LastDistance;
                currentNearestNode = queue.LastElement;
            }

            // Only prune the other side once the queue is full: while it still has room, every
            // element is a valid candidate regardless of how far the split line is.
            if (node.RightChild != null && (!queue.IsFull || pointValue + currentDistance >= node.L))
            {
                (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, k, pivotPointFunc, distance, queue);
            }
        }
        else
        {
            // start right
            (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, k, pivotPointFunc, distance, queue);

            if (!double.IsNaN(newDist) && newDist <= currentDistance && !newNode.Element.Equals(pivot))
            {
                queue.Add(newDist, newNode);
                currentDistance = queue.LastDistance;
                currentNearestNode = queue.LastElement;
            }

            // Same rule as above: pruning is only valid against a full queue.
            if (node.LeftChild != null && (!queue.IsFull || pointValue - currentDistance <= node.L))
            {
                (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, k, pivotPointFunc, distance, queue);
            }
        }

        // Takes the result of the last subtree searched into account (the queue ignores repeats).
        if (!double.IsNaN(newDist) && newDist <= currentDistance && !newNode.Element.Equals(pivot))
        {
            queue.Add(newDist, newNode);
            currentDistance = queue.LastDistance;
            currentNearestNode = queue.LastElement;
        }

        return (currentNearestNode, currentDistance);
    }
}
/// <summary>
/// Sorted collection of the nearest neighbours found so far, keyed by distance.
/// Nodes at the same distance share one entry, and the capacity counts distinct distances.
/// </summary>
private class KNearestNeighboursQueue : SortedList<double, HashSet<KdTreeNode<T>>>
{
    /// <summary>
    /// Maximum number of distinct distances kept.
    /// </summary>
    public readonly int K;

    /// <summary>
    /// A node from the entry with the largest distance, refreshed each time a node is inserted.
    /// </summary>
    public KdTreeNode<T> LastElement { get; private set; }

    /// <summary>
    /// The largest distance held, refreshed each time a node is inserted.
    /// Positive infinity until the first insertion.
    /// </summary>
    public double LastDistance { get; private set; }

    /// <summary>
    /// True once K distinct distances are held.
    /// </summary>
    public bool IsFull => Count >= K;

    public KNearestNeighboursQueue(int k)
        : base(k)
    {
        K = k;
        LastDistance = double.PositiveInfinity;
    }

    /// <summary>
    /// Records a node at the given distance, evicting the farthest entry when the capacity is exceeded.
    /// Nodes farther than the current last distance are ignored once the queue is full.
    /// </summary>
    public void Add(double key, KdTreeNode<T> value)
    {
        bool beyondLast = IsFull && key > LastDistance;
        if (beyondLast)
        {
            return;
        }

        if (!ContainsKey(key))
        {
            base.Add(key, new HashSet<KdTreeNode<T>>());

            // A new distance may push the queue over capacity: drop the farthest entry.
            if (Count > K)
            {
                RemoveAt(Count - 1);
            }
        }

        bool inserted = this[key].Add(value);
        if (!inserted)
        {
            // The node was already recorded at this distance: nothing changed.
            return;
        }

        int lastIndex = Count - 1;
        LastElement = Values[lastIndex].Last();
        LastDistance = Keys[lastIndex];
    }
}
#endregion
2020-03-18 00:52:35 +00:00
/// <summary>
/// K-D tree leaf: a node constructed with no children.
/// </summary>
/// <typeparam name="Q">The type of the element stored in the leaf.</typeparam>
public class KdTreeLeaf<Q> : KdTreeNode<Q>
{
    /// <summary>
    /// Always true for a leaf.
    /// </summary>
    public override bool IsLeaf => true;

    internal KdTreeLeaf((int, PdfPoint, Q) point, int depth)
        : base(null, null, point, depth)
    {
    }

    /// <inheritdoc />
    public override string ToString() => "Leaf->" + Value.ToString();
}
2020-03-18 00:52:35 +00:00
/// <summary>
/// K-D tree node.
/// </summary>
/// <typeparam name="Q">The type of the element stored in the node.</typeparam>
public class KdTreeNode<Q>
{
    /// <summary>
    /// Split value: the X coordinate of <see cref="Value"/> when the node cuts on X, its Y coordinate otherwise.
    /// </summary>
    public double L => IsAxisCutX ? Value.X : Value.Y;

    /// <summary>
    /// Split point.
    /// </summary>
    public PdfPoint Value { get; }

    /// <summary>
    /// Left child.
    /// </summary>
    public KdTreeNode<Q> LeftChild { get; internal set; }

    /// <summary>
    /// Right child.
    /// </summary>
    public KdTreeNode<Q> RightChild { get; internal set; }

    /// <summary>
    /// The node's element.
    /// </summary>
    public Q Element { get; }

    /// <summary>
    /// True if this cuts with X axis, false if cuts with Y axis.
    /// </summary>
    public bool IsAxisCutX { get; }

    /// <summary>
    /// The element's depth in the tree.
    /// </summary>
    public int Depth { get; }

    /// <summary>
    /// Return true if leaf.
    /// </summary>
    public virtual bool IsLeaf => false;

    /// <summary>
    /// The index of the element in the original array.
    /// </summary>
    public int Index { get; }

    internal KdTreeNode(KdTreeNode<Q> leftChild, KdTreeNode<Q> rightChild, (int, PdfPoint, Q) point, int depth)
    {
        // The entry is (original index, point, element).
        Index = point.Item1;
        Value = point.Item2;
        Element = point.Item3;

        LeftChild = leftChild;
        RightChild = rightChild;

        Depth = depth;

        // Even depths cut on X, odd depths on Y.
        IsAxisCutX = depth % 2 == 0;
    }

    /// <summary>
    /// Get the leaves found below this node, left subtree first.
    /// The node itself is not part of the result.
    /// </summary>
    public IEnumerable<KdTreeLeaf<Q>> GetLeaves()
    {
        var collected = new List<KdTreeLeaf<Q>>();
        CollectLeaves(LeftChild, collected);
        CollectLeaves(RightChild, collected);
        return collected;
    }

    /// <summary>
    /// Appends every leaf of the subtree rooted at <paramref name="subtree"/> to <paramref name="sink"/>, left to right.
    /// </summary>
    private void CollectLeaves(KdTreeNode<Q> subtree, List<KdTreeLeaf<Q>> sink)
    {
        if (subtree == null)
        {
            return;
        }

        if (subtree is KdTreeLeaf<Q> leaf)
        {
            sink.Add(leaf);
            return;
        }

        CollectLeaves(subtree.LeftChild, sink);
        CollectLeaves(subtree.RightChild, sink);
    }

    /// <inheritdoc />
    public override string ToString() => "Node->" + Value.ToString();
}
}
}