namespace UglyToad.PdfPig.DocumentLayoutAnalysis { using Core; using System; using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; ///

/// <summary>Clustering Algorithms.</summary>

public static class Clustering { ///

/// <summary>
/// Algorithm to group elements using nearest neighbours.
/// <para>Uses the nearest neighbour as candidate.</para>
/// </summary>
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
/// <param name="elements">Elements to group.</param>
/// <param name="distMeasure">The distance measure between two points.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
/// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>
/// A lazily evaluated sequence with one list of elements per group. Every element of
/// <paramref name="elements"/> ends up in exactly one group; an element that is linked to nothing
/// forms a group of its own. Because this is an iterator, no work is done until the result is
/// enumerated, and each enumeration repeats the whole computation.
/// </returns>
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements,
    Func<PdfPoint, PdfPoint, double> distMeasure,
    Func<T, T, double> maxDistanceFunction,
    Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
    Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
    int maxDegreeOfParallelism)
{
    /*************************************************************************************
     * Algorithm steps
     * 1. Find nearest neighbours indexes (done in parallel)
     *  Iterate every point (pivot) and put its nearest neighbour's index in an array
     *  e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
     *  Only considers a neighbour if it is within the maximum distance.
     *  If not within the maximum distance, index will be set to -1.
     *  Each element has only one connected neighbour.
     *  NB: Given the possible asymmetry in the relationship, it is possible
     *  that if indexes[i] = j then indexes[j] != i.
     *
     * 2. Group indexes
     *  Group indexes if share neighbours in common - Depth-first search
     *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
     *  (i,j,k) will form a group and (m,n) will form another group.
     *************************************************************************************/

    // -1 means "no neighbour"; entries are only overwritten when a valid pairing is found.
    int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();
    KdTree<T> kdTree = new KdTree<T>(elements, candidatesPoint);

    ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };

    // 1. Find nearest neighbours indexes
    // Each iteration writes only to its own slot indexes[e], so no locking is needed.
    Parallel.For(0, elements.Count, parallelOptions, e =>
    {
        var pivot = elements[e];

        if (filterPivot(pivot))
        {
            var paired = kdTree.FindNearestNeighbour(pivot, pivotPoint, distMeasure, out int index, out double dist);

            // The distance test is strict: a neighbour exactly at the maximum distance is rejected.
            if (index != -1 && filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
            {
                indexes[e] = index;
            }
        }
    });

    // 2. Group indexes
    foreach (var group in GroupIndexes(indexes))
    {
        yield return group.Select(i => elements[i]).ToList();
    }
}

/// <summary>
/// Algorithm to group elements using nearest neighbours.
/// <para>Uses the k-nearest neighbours as candidates.</para>
/// </summary>
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
/// <param name="elements">Elements to group.</param>
/// <param name="k">The k-nearest neighbours to consider as candidates.</param>
/// <param name="distMeasure">The distance measure between two points.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
/// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>
/// A lazily evaluated sequence with one list of elements per group. Every element of
/// <paramref name="elements"/> ends up in exactly one group; an element that is linked to nothing
/// forms a group of its own. Because this is an iterator, no work is done until the result is
/// enumerated, and each enumeration repeats the whole computation.
/// </returns>
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements, int k,
    Func<PdfPoint, PdfPoint, double> distMeasure,
    Func<T, T, double> maxDistanceFunction,
    Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
    Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
    int maxDegreeOfParallelism)
{
    /*************************************************************************************
     * Algorithm steps
     * 1. Find nearest neighbours indexes (done in parallel)
     *  Iterate every point (pivot) and walk its k candidate neighbours in the order
     *  returned by the kd-tree. The first candidate that passes the final filter and
     *  is within the maximum distance is kept: if that candidate is point j for
     *  pivot i, then indexes[i] = j.
     *  If no candidate qualifies, the index stays at -1.
     *  Each element has only one connected neighbour.
     *  NB: Given the possible asymmetry in the relationship, it is possible
     *  that if indexes[i] = j then indexes[j] != i.
     *
     * 2. Group indexes
     *  Group indexes if share neighbours in common - Depth-first search
     *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
     *  (i,j,k) will form a group and (m,n) will form another group.
     *************************************************************************************/

    // -1 means "no neighbour"; entries are only overwritten when a valid pairing is found.
    int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();
    KdTree<T> kdTree = new KdTree<T>(elements, candidatesPoint);

    ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };

    // 1. Find nearest neighbours indexes
    // Each iteration writes only to its own slot indexes[e], so no locking is needed.
    Parallel.For(0, elements.Count, parallelOptions, e =>
    {
        var pivot = elements[e];

        if (filterPivot(pivot))
        {
            // Each candidate c is used as (Item1: element, Item2: its index, Item3: its distance to the pivot).
            foreach (var c in kdTree.FindNearestNeighbours(pivot, k, pivotPoint, distMeasure))
            {
                // The distance test is strict: a candidate exactly at the maximum distance is rejected.
                if (filterFinal(pivot, c.Item1) && c.Item3 < maxDistanceFunction(pivot, c.Item1))
                {
                    indexes[e] = c.Item2;
                    break; // only the first qualifying candidate is linked
                }
            }
        }
    });

    // 2. Group indexes
    foreach (var group in GroupIndexes(indexes))
    {
        yield return group.Select(i => elements[i]).ToList();
    }
}

/// <summary>
/// Algorithm to group elements using nearest neighbours.
/// </summary>
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
/// <param name="elements">Array of elements to group.</param>
/// <param name="distMeasure">The distance measure between two lines.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
/// <param name="pivotLine">The pivot's line to use for pairing.</param>
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>
/// A lazily evaluated sequence with one list of elements per group. Every element of
/// <paramref name="elements"/> ends up in exactly one group; an element that is linked to nothing
/// forms a group of its own. Because this is an iterator, no work is done until the result is
/// enumerated, and each enumeration repeats the whole computation.
/// </returns>
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements,
    Func<PdfLine, PdfLine, double> distMeasure,
    Func<T, T, double> maxDistanceFunction,
    Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
    Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
    int maxDegreeOfParallelism)
{
    /*************************************************************************************
     * Algorithm steps
     * 1. Find nearest neighbours indexes (done in parallel)
     *  Iterate every element (pivot) and put its nearest neighbour's index in an array
     *  e.g. if nearest neighbour of element i is element j, then indexes[i] = j.
     *  Only considers a neighbour if it is within the maximum distance.
     *  If not within the maximum distance, index will be set to -1.
     *  Each element has only one connected neighbour.
     *  NB: Given the possible asymmetry in the relationship, it is possible
     *  that if indexes[i] = j then indexes[j] != i.
     *
     * 2. Group indexes
     *  Group indexes if share neighbours in common - Depth-first search
     *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
     *  (i,j,k) will form a group and (m,n) will form another group.
     *************************************************************************************/

    // -1 means "no neighbour"; entries are only overwritten when a valid pairing is found.
    int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();

    ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };

    // 1. Find nearest neighbours indexes
    // Each iteration writes only to its own slot indexes[e], so no locking is needed.
    Parallel.For(0, elements.Count, parallelOptions, e =>
    {
        var pivot = elements[e];

        if (filterPivot(pivot))
        {
            int index = Distances.FindIndexNearest(pivot, elements, pivotLine, candidatesLine, distMeasure, out double dist);

            if (index != -1)
            {
                var paired = elements[index];

                // The distance test is strict: a neighbour exactly at the maximum distance is rejected.
                if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
                {
                    indexes[e] = index;
                }
            }
        }
    });

    // 2. Group indexes
    foreach (var group in GroupIndexes(indexes))
    {
        yield return group.Select(i => elements[i]).ToList();
    }
}

/// <summary>
/// Group elements using Depth-first search.
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="edges">The graph. edges[i] = j indicates that there is an edge between i and j.
/// A value of -1 means that i has no outgoing edge.</param>
/// <returns>A List of HashSets containing the grouped indexes. Every index of
/// <paramref name="edges"/> belongs to exactly one set.</returns>
internal static List<HashSet<int>> GroupIndexes(int[] edges)
{
    // Build the undirected neighbour sets in two linear passes instead of rescanning
    // the whole array for every node.
    // Pass 1 records the outgoing edge of each node, pass 2 records the incoming ones,
    // so each set is filled in the order "own target first, then sources in ascending order".
    HashSet<int>[] neighbours = new HashSet<int>[edges.Length];
    for (int i = 0; i < edges.Length; i++)
    {
        neighbours[i] = new HashSet<int>();
        if (edges[i] != -1)
        {
            neighbours[i].Add(edges[i]);
        }
    }

    for (int j = 0; j < edges.Length; j++)
    {
        if (edges[j] != -1)
        {
            // The set silently ignores the duplicate when the edge is mutual (edges[edges[j]] == j).
            neighbours[edges[j]].Add(j);
        }
    }

    int[][] adjacency = new int[edges.Length][];
    for (int i = 0; i < edges.Length; i++)
    {
        adjacency[i] = neighbours[i].ToArray();
    }

    // Start one depth-first search from every node that no earlier search has reached.
    List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
    bool[] isDone = new bool[edges.Length];

    for (int p = 0; p < edges.Length; p++)
    {
        if (isDone[p])
        {
            continue;
        }

        groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
    }

    return groupedIndexes;
}

/// <summary>
/// Group elements using Depth-first search.
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="edges">The graph. edges[i] = [j, k, l, ...] indicates that there is an edge between i and each element j, k, l, ...
/// Entries equal to -1 are ignored.</param>
/// <returns>A List of HashSets containing the grouped indexes. Every index of
/// <paramref name="edges"/> belongs to exactly one set.</returns>
internal static List<HashSet<int>> GroupIndexes(int[][] edges)
{
    // Build the undirected neighbour sets with two passes over the edges instead of
    // rescanning every edge of the graph for every node.
    // Pass 1 records the outgoing edges of each node, pass 2 records the incoming ones,
    // so each set is filled in the order "own targets first, then sources in ascending order".
    HashSet<int>[] neighbours = new HashSet<int>[edges.Length];
    for (int i = 0; i < edges.Length; i++)
    {
        neighbours[i] = new HashSet<int>();
        foreach (int target in edges[i])
        {
            if (target != -1)
            {
                neighbours[i].Add(target);
            }
        }
    }

    for (int j = 0; j < edges.Length; j++)
    {
        foreach (int target in edges[j])
        {
            if (target != -1)
            {
                // The set silently ignores duplicates (mutual or repeated edges).
                neighbours[target].Add(j);
            }
        }
    }

    int[][] adjacency = new int[edges.Length][];
    for (int i = 0; i < edges.Length; i++)
    {
        adjacency[i] = neighbours[i].ToArray();
    }

    // Start one depth-first search from every node that no earlier search has reached.
    List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
    bool[] isDone = new bool[edges.Length];

    for (int p = 0; p < edges.Length; p++)
    {
        if (isDone[p])
        {
            continue;
        }

        groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
    }

    return groupedIndexes;
}

/// <summary>
/// Depth-first search
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="s">The index of the node to start the search from.</param>
/// <param name="adj">The adjacency lists: adj[u] holds the indexes of the nodes connected to u.</param>
/// <param name="isDone">Visited flags, indexed like <paramref name="adj"/>. Nodes already flagged are
/// skipped, and every node reached by this search is flagged on return.</param>
/// <returns>The set of not previously visited nodes reachable from <paramref name="s"/>
/// (empty if <paramref name="s"/> itself was already visited).</returns>
private static HashSet<int> DfsIterative(int s, int[][] adj, ref bool[] isDone)
{
    HashSet<int> group = new HashSet<int>();
    Stack<int> pending = new Stack<int>();
    pending.Push(s);

    while (pending.Count > 0)
    {
        int u = pending.Pop();

        // A node can be pushed several times before it is first popped, so re-check here.
        if (isDone[u])
        {
            continue;
        }

        group.Add(u);
        isDone[u] = true;

        foreach (int v in adj[u])
        {
            // Already visited neighbours would be discarded when popped; do not push them at all.
            if (!isDone[v])
            {
                pending.Push(v);
            }
        }
    }

    return group;
}
}
}