using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using UglyToad.PdfPig.Geometry;

namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
    /// <summary>
    /// Clustering Algorithms.
    /// </summary>
    internal class ClusteringAlgorithms
    {
        /// <summary>
        /// Finds the index of the nearest neighbour of <paramref name="pivot"/>.
        /// A return value of -1 is treated by the caller as "no neighbour found".
        /// </summary>
        /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
        /// <param name="pivot">The element whose nearest neighbour is searched.</param>
        /// <param name="distance">The distance between the pivot and the returned neighbour.</param>
        private delegate int NearestNeighbourFinder<T>(T pivot, out double distance);

        /// <summary>
        /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
        /// https://en.wikipedia.org/wiki/Transitive_closure
        /// </summary>
        /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
        /// <param name="elements">List of elements to group.</param>
        /// <param name="distMeasure">The distance measure between two points.</param>
        /// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
        /// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
        /// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
        /// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
        /// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
        /// <returns>The groups of element indexes; every index of <paramref name="elements"/> belongs to exactly one group.</returns>
        /// <remarks>All delegates are invoked from inside a <see cref="Parallel"/> loop and may run concurrently.</remarks>
        internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(List<T> elements,
            Func<PdfPoint, PdfPoint, double> distMeasure,
            Func<T, T, double> maxDistanceFunction,
            Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
            Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
        {
            // The lambda keeps the static type of 'elements' so the same FindIndexNearest overload is bound.
            return ClusterNearestNeighboursCore<T>(
                elements,
                (T pivot, out double distance) =>
                    pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out distance),
                maxDistanceFunction,
                filterPivot,
                filterFinal);
        }

        /// <summary>
        /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
        /// https://en.wikipedia.org/wiki/Transitive_closure
        /// </summary>
        /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
        /// <param name="elements">Array of elements to group.</param>
        /// <param name="distMeasure">The distance measure between two points.</param>
        /// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
        /// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
        /// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
        /// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
        /// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
        /// <returns>The groups of element indexes; every index of <paramref name="elements"/> belongs to exactly one group.</returns>
        /// <remarks>All delegates are invoked from inside a <see cref="Parallel"/> loop and may run concurrently.</remarks>
        internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(T[] elements,
            Func<PdfPoint, PdfPoint, double> distMeasure,
            Func<T, T, double> maxDistanceFunction,
            Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
            Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
        {
            // The lambda keeps the static type of 'elements' so the same FindIndexNearest overload is bound.
            return ClusterNearestNeighboursCore<T>(
                elements,
                (T pivot, out double distance) =>
                    pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out distance),
                maxDistanceFunction,
                filterPivot,
                filterFinal);
        }

        /// <summary>
        /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
        /// https://en.wikipedia.org/wiki/Transitive_closure
        /// </summary>
        /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
        /// <param name="elements">Array of elements to group.</param>
        /// <param name="distMeasure">The distance measure between two lines.</param>
        /// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
        /// <param name="pivotLine">The pivot's line to use for pairing.</param>
        /// <param name="candidatesLine">The candidates' line to use for pairing.</param>
        /// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
        /// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
        /// <returns>The groups of element indexes; every index of <paramref name="elements"/> belongs to exactly one group.</returns>
        /// <remarks>All delegates are invoked from inside a <see cref="Parallel"/> loop and may run concurrently.</remarks>
        internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(T[] elements,
            Func<PdfLine, PdfLine, double> distMeasure,
            Func<T, T, double> maxDistanceFunction,
            Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
            Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
        {
            // The lambda keeps the static type of 'elements' so the same FindIndexNearest overload is bound.
            return ClusterNearestNeighboursCore<T>(
                elements,
                (T pivot, out double distance) =>
                    pivot.FindIndexNearest(elements, candidatesLine, pivotLine, distMeasure, out distance),
                maxDistanceFunction,
                filterPivot,
                filterFinal);
        }

        /// <summary>
        /// Shared implementation of the <c>ClusterNearestNeighbours</c> overloads.
        /// </summary>
        /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
        /// <param name="elements">The elements to group.</param>
        /// <param name="findNearest">Returns the index of the pivot's nearest neighbour (or -1) and its distance.</param>
        /// <param name="maxDistanceFunction">The function that determines the maximum distance between two elements in the same cluster.</param>
        /// <param name="filterPivot">If false for an element, that element is not paired with any neighbour.</param>
        /// <param name="filterFinal">If false for a (pivot, neighbour) pair, the pair is not connected.</param>
        /// <returns>The groups of element indexes.</returns>
        private static List<HashSet<int>> ClusterNearestNeighboursCore<T>(
            IReadOnlyList<T> elements,
            NearestNeighbourFinder<T> findNearest,
            Func<T, T, double> maxDistanceFunction,
            Func<T, bool> filterPivot,
            Func<T, T, bool> filterFinal)
        {
            /*************************************************************************************
             * Algorithm steps
             * 1. Find nearest neighbours indexes (done in parallel)
             *  Iterate every point (pivot) and put its nearest neighbour's index in an array
             *  e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
             *  Only considers a neighbour if it is within the maximum distance.
             *  If not within the maximum distance, index will be set to -1.
             *  Each element has only one connected neighbour.
             *  NB: Given the possible asymmetry in the relationship, it is possible
             *  that if indexes[i] = j then indexes[j] != i.
             *
             * 2. Group indexes
             *  Group indexes if share neighbours in common - Depth-first search
             *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
             *  (i,j,k) will form a group and (m,n) will form another group.
             *************************************************************************************/

            int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();

            // 1. Find nearest neighbours indexes.
            // Each iteration writes only to its own slot indexes[e], so no locking is needed.
            Parallel.For(0, elements.Count, e =>
            {
                var pivot = elements[e];
                if (!filterPivot(pivot))
                {
                    return;
                }

                int index = findNearest(pivot, out double dist);
                if (index == -1)
                {
                    return;
                }

                var paired = elements[index];

                // filterFinal is evaluated first; the distance test is strict (<).
                if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
                {
                    indexes[e] = index;
                }
            });

            // 2. Group indexes
            return GroupIndexes(indexes);
        }

        /// <summary>
        /// Group elements using Depth-first search.
        /// https://en.wikipedia.org/wiki/Depth-first_search
        /// </summary>
        /// <param name="edges">The graph. edges[i] = j indicates that there is an edge between i and j.
        /// A value of -1 means no edge; any other value must be a valid index into <paramref name="edges"/>.</param>
        /// <returns>A List of HashSets containing the grouped indexes.</returns>
        internal static List<HashSet<int>> GroupIndexes(int[] edges)
        {
            var adjacency = new HashSet<int>[edges.Length];

            // Outgoing edge first, so each node's neighbour order is: own edge, then incoming edges ascending.
            for (int i = 0; i < edges.Length; i++)
            {
                adjacency[i] = new HashSet<int>();
                if (edges[i] != -1)
                {
                    adjacency[i].Add(edges[i]);
                }
            }

            // Incoming edges, registered in a single pass instead of rescanning all edges for every node.
            for (int j = 0; j < edges.Length; j++)
            {
                if (edges[j] != -1)
                {
                    adjacency[edges[j]].Add(j);
                }
            }

            return CollectGroups(adjacency);
        }

        /// <summary>
        /// Group elements using Depth-first search.
        /// https://en.wikipedia.org/wiki/Depth-first_search
        /// </summary>
        /// <param name="edges">The graph. edges[i] = [j, k, l, ...] indicates that there is an edge between i and each element j, k, l, ...
        /// Entries equal to -1 are ignored; any other entry must be a valid index into <paramref name="edges"/>.</param>
        /// <returns>A List of HashSets containing the grouped indexes.</returns>
        internal static List<HashSet<int>> GroupIndexes(int[][] edges)
        {
            var adjacency = new HashSet<int>[edges.Length];

            // Outgoing edges first, so each node's neighbour order is: own edges, then incoming edges ascending.
            for (int i = 0; i < edges.Length; i++)
            {
                adjacency[i] = new HashSet<int>();
                foreach (int target in edges[i])
                {
                    if (target != -1)
                    {
                        adjacency[i].Add(target);
                    }
                }
            }

            // Incoming edges, registered in a single pass over all edges.
            for (int j = 0; j < edges.Length; j++)
            {
                foreach (int target in edges[j])
                {
                    if (target != -1)
                    {
                        adjacency[target].Add(j);
                    }
                }
            }

            return CollectGroups(adjacency);
        }

        /// <summary>
        /// Runs a depth-first search from every node not yet visited and returns one group per search.
        /// </summary>
        /// <param name="adjacency">adjacency[i] holds the nodes connected to node i.</param>
        /// <returns>One HashSet of node indexes per connected group, in order of each group's lowest index.</returns>
        private static List<HashSet<int>> CollectGroups(HashSet<int>[] adjacency)
        {
            var groupedIndexes = new List<HashSet<int>>();
            bool[] isDone = new bool[adjacency.Length];

            for (int p = 0; p < adjacency.Length; p++)
            {
                if (isDone[p])
                {
                    continue;
                }

                groupedIndexes.Add(DfsIterative(p, adjacency, isDone));
            }

            return groupedIndexes;
        }

        /// <summary>
        /// Depth-first search
        /// https://en.wikipedia.org/wiki/Depth-first_search
        /// </summary>
        /// <param name="c">The node to start the search from.</param>
        /// <param name="adj">adj[i] holds the nodes connected to node i.</param>
        /// <param name="isDone">Visited flags shared between searches; updated in place.</param>
        /// <returns>All nodes reachable from <paramref name="c"/> that were not already visited.</returns>
        private static HashSet<int> DfsIterative(int c, HashSet<int>[] adj, bool[] isDone)
        {
            var group = new HashSet<int>();
            var pending = new Stack<int>();
            pending.Push(c);

            while (pending.Count > 0)
            {
                int v = pending.Pop();
                if (isDone[v])
                {
                    continue;
                }

                group.Add(v);
                isDone[v] = true;

                foreach (int w in adj[v])
                {
                    // Visited nodes would be discarded when popped, so do not push them at all.
                    if (!isDone[w])
                    {
                        pending.Push(w);
                    }
                }
            }

            return group;
        }
    }
}