using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using UglyToad.PdfPig.Geometry;

namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
    /// <summary>
    /// Clustering Algorithms.
    /// </summary>
    internal class ClusteringAlgorithms
    {
        /// <summary>
        /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
        /// https://en.wikipedia.org/wiki/Transitive_closure
        /// <para>
        /// Steps:
        /// 1. Find nearest neighbours (done in parallel). For every element (the pivot) the index of its
        ///    nearest neighbour is stored, i.e. if the nearest neighbour of element i is element j then
        ///    indexes[i] = j. A neighbour is only considered if it passes the filters and is strictly
        ///    closer than the maximum distance; otherwise indexes[i] stays at -1. Each element has at most
        ///    one connected neighbour. The relationship may be asymmetric: indexes[i] = j does not imply
        ///    indexes[j] = i.
        /// 2. Group indexes that are linked, directly or transitively. E.g. with indexes[i] = j,
        ///    indexes[j] = k, indexes[m] = n and indexes[n] = -1, (i,j,k) form one group and (m,n) another.
        /// </para>
        /// </summary>
        /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
        /// <param name="elements">Array of elements to group.</param>
        /// <param name="distMeasure">The distance measure between two points.</param>
        /// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
        /// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
        /// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
        /// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
        /// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
        /// <returns>The groups, each one being the set of indexes (into <paramref name="elements"/>) of its members.</returns>
        internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
            Func<PdfPoint, PdfPoint, double> distMeasure,
            Func<T, T, double> maxDistanceFunction,
            Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
            Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
        {
            // -1 means "no neighbour"; every slot starts unpaired.
            int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();

            // Project the candidates once so the parallel loop does not recompute them per pivot.
            var candidatesPoints = elements.Select(x => candidatesPoint(x)).ToList();

            // 1. Find nearest neighbours indexes.
            // Each iteration writes only to its own slot indexes[e], so no locking is required.
            Parallel.For(0, elements.Length, e =>
            {
                var pivot = elements[e];
                if (!filterPivot(pivot))
                {
                    return;
                }

                int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);

                // NOTE(review): FindIndexNearest is declared outside this file; a negative (or otherwise
                // out-of-range) index is assumed to mean "no neighbour found" - confirm. Leaving the
                // element unpaired avoids an IndexOutOfRangeException on elements[index].
                if (index < 0 || index >= elements.Length)
                {
                    return;
                }

                var paired = elements[index];
                if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
                {
                    indexes[e] = index;
                }
            });

            // 2. Group indexes
            return GroupIndexes(indexes);
        }

        /// <summary>
        /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
        /// https://en.wikipedia.org/wiki/Transitive_closure
        /// <para>
        /// Steps:
        /// 1. Find nearest neighbours (done in parallel). For every element (the pivot) the index of its
        ///    nearest neighbour is stored, i.e. if the nearest neighbour of element i is element j then
        ///    indexes[i] = j. A neighbour is only considered if it passes the filters and is strictly
        ///    closer than the maximum distance; otherwise indexes[i] stays at -1. Each element has at most
        ///    one connected neighbour. The relationship may be asymmetric: indexes[i] = j does not imply
        ///    indexes[j] = i.
        /// 2. Group indexes that are linked, directly or transitively. E.g. with indexes[i] = j,
        ///    indexes[j] = k, indexes[m] = n and indexes[n] = -1, (i,j,k) form one group and (m,n) another.
        /// </para>
        /// </summary>
        /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
        /// <param name="elements">Array of elements to group.</param>
        /// <param name="distMeasure">The distance measure between two lines.</param>
        /// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
        /// <param name="pivotLine">The pivot's line to use for pairing.</param>
        /// <param name="candidatesLine">The candidates' line to use for pairing.</param>
        /// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
        /// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
        /// <returns>The groups, each one being the set of indexes (into <paramref name="elements"/>) of its members.</returns>
        internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
            Func<PdfLine, PdfLine, double> distMeasure,
            Func<T, T, double> maxDistanceFunction,
            Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
            Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
        {
            // -1 means "no neighbour"; every slot starts unpaired.
            int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();

            // Project the candidates once so the parallel loop does not recompute them per pivot.
            var candidatesLines = elements.Select(x => candidatesLine(x)).ToList();

            // 1. Find nearest neighbours indexes.
            // Each iteration writes only to its own slot indexes[e], so no locking is required.
            Parallel.For(0, elements.Length, e =>
            {
                var pivot = elements[e];
                if (!filterPivot(pivot))
                {
                    return;
                }

                int index = pivotLine(pivot).FindIndexNearest(candidatesLines, distMeasure, out double dist);

                // NOTE(review): FindIndexNearest is declared outside this file; a negative (or otherwise
                // out-of-range) index is assumed to mean "no neighbour found" - confirm. Leaving the
                // element unpaired avoids an IndexOutOfRangeException on elements[index].
                if (index < 0 || index >= elements.Length)
                {
                    return;
                }

                var paired = elements[index];
                if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
                {
                    indexes[e] = index;
                }
            });

            // 2. Group indexes
            return GroupIndexes(indexes);
        }

        /// <summary>
        /// Group elements via transitive closure. Each element has only one connected neighbour.
        /// https://en.wikipedia.org/wiki/Transitive_closure
        /// <para>
        /// The links are treated as undirected: i and indexes[i] always end up in the same group, and so
        /// does everything reachable from either of them. E.g. for indexes = [2, 2, -1, 1] the single
        /// group (0, 1, 2, 3) is returned, because 3 is linked to 1 and 1 is linked to 2.
        /// </para>
        /// </summary>
        /// <param name="indexes">Array of paired elements index: indexes[i] is the index of the element
        /// paired with element i, or -1 if element i is not paired.</param>
        /// <returns>One set of indexes per group. Every index appears in exactly one group; groups are
        /// listed in increasing order of their smallest member.</returns>
        private static List<HashSet<int>> GroupIndexes(int[] indexes)
        {
            int count = indexes.Length;

            // Reverse links, built in a single pass: incoming[i] lists every j with indexes[j] == i
            // (null when nothing points at i).
            var incoming = new List<int>[count];
            for (int j = 0; j < count; j++)
            {
                int target = indexes[j];
                if (target < 0)
                {
                    continue;
                }

                if (incoming[target] == null)
                {
                    incoming[target] = new List<int>();
                }

                incoming[target].Add(j);
            }

            var groupedIndexes = new List<HashSet<int>>();
            bool[] isDone = new bool[count];
            var pending = new Queue<int>();

            for (int start = 0; start < count; start++)
            {
                if (isDone[start])
                {
                    continue;
                }

                // Breadth-first walk over both link directions, collecting the whole connected group.
                // An index is flagged when enqueued so that it is never queued twice.
                var grouped = new HashSet<int>();
                isDone[start] = true;
                pending.Enqueue(start);

                while (pending.Count > 0)
                {
                    int current = pending.Dequeue();
                    grouped.Add(current);

                    // Outgoing link: the element paired with 'current'.
                    int target = indexes[current];
                    if (target >= 0 && !isDone[target])
                    {
                        isDone[target] = true;
                        pending.Enqueue(target);
                    }

                    // Incoming links: the elements that are paired with 'current'.
                    var sources = incoming[current];
                    if (sources == null)
                    {
                        continue;
                    }

                    foreach (int source in sources)
                    {
                        if (!isDone[source])
                        {
                            isDone[source] = true;
                            pending.Enqueue(source);
                        }
                    }
                }

                groupedIndexes.Add(grouped);
            }

            return groupedIndexes;
        }
    }
}