namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
using Core;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
/// <summary>
/// Clustering Algorithms.
/// </summary>
public static class Clustering
{
/// <summary>
/// Algorithm to group elements using nearest neighbours.
/// <para>Uses the nearest neighbour as candidate.</para>
/// </summary>
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
/// <param name="elements">Elements to group.</param>
/// <param name="distMeasure">The distance measure between two points.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
/// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>One list of elements per group. This is an iterator: no work is done until the
/// result is enumerated, and the whole search runs again on every enumeration.</returns>
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements,
    Func<PdfPoint, PdfPoint, double> distMeasure,
    Func<T, T, double> maxDistanceFunction,
    Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
    Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
    int maxDegreeOfParallelism)
{
    /*************************************************************************************
     * Algorithm steps
     * 1. Find nearest neighbours indexes (done in parallel)
     *  Iterate every point (pivot) and put its nearest neighbour's index in an array
     *  e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
     *  Only considers a neighbour if it is within the maximum distance.
     *  If not within the maximum distance, index will be set to -1.
     *  Each element has only one connected neighbour.
     *  NB: Given the possible asymmetry in the relationship, it is possible
     *  that if indexes[i] = j then indexes[j] != i.
     *
     * 2. Group indexes
     *  Group indexes if share neighbours in common - Depth-first search
     *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
     *  (i,j,k) will form a group and (m,n) will form another group.
     *************************************************************************************/

    // -1 means "no neighbour"; every slot starts unpaired.
    int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();
    KdTree<T> kdTree = new KdTree<T>(elements, candidatesPoint);

    ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };

    // 1. Find nearest neighbours indexes
    // Each iteration writes only to its own slot indexes[e].
    Parallel.For(0, elements.Count, parallelOptions, e =>
    {
        var pivot = elements[e];

        if (filterPivot(pivot))
        {
            var paired = kdTree.FindNearestNeighbour(pivot, pivotPoint, distMeasure, out int index, out double dist);

            // An index of -1 is treated as "nothing found"; the pair is kept only if it passes
            // the final filter and is strictly closer than the allowed maximum distance.
            if (index != -1 && filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
            {
                indexes[e] = index;
            }
        }
    });

    // 2. Group indexes
    foreach (var group in GroupIndexes(indexes))
    {
        yield return group.Select(i => elements[i]).ToList();
    }
}
/// <summary>
/// Algorithm to group elements using nearest neighbours.
/// <para>Uses the k-nearest neighbours as candidates.</para>
/// </summary>
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
/// <param name="elements">Elements to group.</param>
/// <param name="k">The k-nearest neighbours to consider as candidates.</param>
/// <param name="distMeasure">The distance measure between two points.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
/// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>One list of elements per group. This is an iterator: no work is done until the
/// result is enumerated, and the whole search runs again on every enumeration.</returns>
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements, int k,
    Func<PdfPoint, PdfPoint, double> distMeasure,
    Func<T, T, double> maxDistanceFunction,
    Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
    Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
    int maxDegreeOfParallelism)
{
    /*************************************************************************************
     * Algorithm steps
     * 1. Find nearest neighbours indexes (done in parallel)
     *  Iterate every point (pivot), look at its k candidate neighbours and put the index
     *  of the first candidate that is accepted in an array
     *  e.g. if the accepted neighbour of point i is point j, then indexes[i] = j.
     *  Only considers a neighbour if it is within the maximum distance.
     *  If no candidate is accepted, index will be left at -1.
     *  Each element has only one connected neighbour.
     *  NB: Given the possible asymmetry in the relationship, it is possible
     *  that if indexes[i] = j then indexes[j] != i.
     *
     * 2. Group indexes
     *  Group indexes if share neighbours in common - Depth-first search
     *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
     *  (i,j,k) will form a group and (m,n) will form another group.
     *************************************************************************************/

    // -1 means "no neighbour"; every slot starts unpaired.
    int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();
    KdTree<T> kdTree = new KdTree<T>(elements, candidatesPoint);

    ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };

    // 1. Find nearest neighbours indexes
    // Each iteration writes only to its own slot indexes[e].
    Parallel.For(0, elements.Count, parallelOptions, e =>
    {
        var pivot = elements[e];

        if (filterPivot(pivot))
        {
            // Candidates are used as (Item1 = element, Item2 = index, Item3 = distance).
            // NOTE(review): presumably returned nearest-first - confirm against KdTree.
            foreach (var c in kdTree.FindNearestNeighbours(pivot, k, pivotPoint, distMeasure))
            {
                if (filterFinal(pivot, c.Item1) && c.Item3 < maxDistanceFunction(pivot, c.Item1))
                {
                    // Keep the first accepted candidate only.
                    indexes[e] = c.Item2;
                    break;
                }
            }
        }
    });

    // 2. Group indexes
    foreach (var group in GroupIndexes(indexes))
    {
        yield return group.Select(i => elements[i]).ToList();
    }
}
/// <summary>
/// Algorithm to group elements using nearest neighbours.
/// </summary>
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
/// <param name="elements">Array of elements to group.</param>
/// <param name="distMeasure">The distance measure between two lines.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
/// <param name="pivotLine">The pivot's line to use for pairing.</param>
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>One list of elements per group. This is an iterator: no work is done until the
/// result is enumerated, and the whole search runs again on every enumeration.</returns>
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements,
    Func<PdfLine, PdfLine, double> distMeasure,
    Func<T, T, double> maxDistanceFunction,
    Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
    Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
    int maxDegreeOfParallelism)
{
    /*************************************************************************************
     * Algorithm steps
     * 1. Find nearest neighbours indexes (done in parallel)
     *  Iterate every element (pivot) and put its nearest neighbour's index in an array
     *  e.g. if nearest neighbour of element i is element j, then indexes[i] = j.
     *  Only considers a neighbour if it is within the maximum distance.
     *  If not within the maximum distance, index will be set to -1.
     *  Each element has only one connected neighbour.
     *  NB: Given the possible asymmetry in the relationship, it is possible
     *  that if indexes[i] = j then indexes[j] != i.
     *
     * 2. Group indexes
     *  Group indexes if share neighbours in common - Depth-first search
     *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
     *  (i,j,k) will form a group and (m,n) will form another group.
     *************************************************************************************/

    // -1 means "no neighbour"; every slot starts unpaired.
    int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();

    ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };

    // 1. Find nearest neighbours indexes
    // Each iteration writes only to its own slot indexes[e].
    Parallel.For(0, elements.Count, parallelOptions, e =>
    {
        var pivot = elements[e];

        if (filterPivot(pivot))
        {
            int index = Distances.FindIndexNearest(pivot, elements, pivotLine, candidatesLine, distMeasure, out double dist);

            // An index of -1 is treated as "nothing found".
            if (index != -1)
            {
                var paired = elements[index];

                // Keep the pair only if it passes the final filter and is strictly closer
                // than the allowed maximum distance.
                if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
                {
                    indexes[e] = index;
                }
            }
        }
    });

    // 2. Group indexes
    foreach (var group in GroupIndexes(indexes))
    {
        yield return group.Select(i => elements[i]).ToList();
    }
}
/// <summary>
/// Group elements using Depth-first search.
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="edges">The graph. edges[i] = j indicates that there is an edge between i and j.
/// A value of -1 means that i has no outgoing edge.</param>
/// <returns>A List of HashSets containing the grouped indexes.</returns>
internal static List<HashSet<int>> GroupIndexes(int[] edges)
{
    // Build the undirected adjacency sets in two linear passes instead of rescanning the
    // whole array for every node. Neighbours are added outgoing-first, then incoming in
    // ascending source order.
    HashSet<int>[] neighbours = new HashSet<int>[edges.Length];

    // Pass 1: the outgoing edge of each node (if any).
    for (int i = 0; i < edges.Length; i++)
    {
        neighbours[i] = new HashSet<int>();
        if (edges[i] != -1)
        {
            neighbours[i].Add(edges[i]);
        }
    }

    // Pass 2: register each node j as a neighbour of the node it points to.
    for (int j = 0; j < edges.Length; j++)
    {
        if (edges[j] != -1)
        {
            neighbours[edges[j]].Add(j);
        }
    }

    int[][] adjacency = new int[edges.Length][];
    for (int i = 0; i < edges.Length; i++)
    {
        adjacency[i] = neighbours[i].ToArray();
    }

    // Every node not yet visited starts a new group made of all nodes reachable from it.
    List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
    bool[] isDone = new bool[edges.Length];

    for (int p = 0; p < edges.Length; p++)
    {
        if (isDone[p])
        {
            continue;
        }

        groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
    }

    return groupedIndexes;
}
/// <summary>
/// Group elements using Depth-first search.
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="edges">The graph. edges[i] = [j, k, l, ...] indicates that there is an edge between i and each element j, k, l, ...
/// Entries equal to -1 are ignored.</param>
/// <returns>A List of HashSets containing the grouped indexes.</returns>
internal static List<HashSet<int>> GroupIndexes(int[][] edges)
{
    // Build the undirected adjacency sets in two passes over the edge lists instead of
    // rescanning every edge list for every node. Neighbours are added outgoing-first,
    // then incoming in ascending source order.
    HashSet<int>[] neighbours = new HashSet<int>[edges.Length];

    // Pass 1: the outgoing edges of each node.
    for (int i = 0; i < edges.Length; i++)
    {
        neighbours[i] = new HashSet<int>();
        for (int j = 0; j < edges[i].Length; j++)
        {
            if (edges[i][j] != -1)
            {
                neighbours[i].Add(edges[i][j]);
            }
        }
    }

    // Pass 2: register each node j as a neighbour of every node it points to.
    for (int j = 0; j < edges.Length; j++)
    {
        for (int k = 0; k < edges[j].Length; k++)
        {
            int target = edges[j][k];
            if (target != -1)
            {
                neighbours[target].Add(j);
            }
        }
    }

    int[][] adjacency = new int[edges.Length][];
    for (int i = 0; i < edges.Length; i++)
    {
        adjacency[i] = neighbours[i].ToArray();
    }

    // Every node not yet visited starts a new group made of all nodes reachable from it.
    List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
    bool[] isDone = new bool[edges.Length];

    for (int p = 0; p < edges.Length; p++)
    {
        if (isDone[p])
        {
            continue;
        }

        groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
    }

    return groupedIndexes;
}
/// <summary>
/// Depth-first search
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="s">Index of the node to start from.</param>
/// <param name="adj">Adjacency lists: adj[u] holds the neighbours of node u.</param>
/// <param name="isDone">Visited flags indexed by node. Every node collected by this call is set to true.</param>
/// <returns>The nodes reached from <paramref name="s"/> that were not already flagged as done.
/// Empty if <paramref name="s"/> itself was already done.</returns>
private static HashSet<int> DfsIterative(int s, int[][] adj, ref bool[] isDone)
{
    HashSet<int> group = new HashSet<int>();
    Stack<int> stack = new Stack<int>();
    stack.Push(s);

    while (stack.Count > 0)
    {
        int u = stack.Pop();
        if (isDone[u])
        {
            // Reached again through another neighbour after it was pushed.
            continue;
        }

        group.Add(u);
        isDone[u] = true;

        foreach (int v in adj[u])
        {
            // A node that is already done would be discarded when popped, so not pushing
            // it leaves the visit order unchanged and keeps the stack smaller.
            if (!isDone[v])
            {
                stack.Push(v);
            }
        }
    }

    return group;
}
}
}