diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
index 4e9e6182..248ee1da 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
@@ -36,6 +36,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
* e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
* Only conciders a neighbour if it is within the maximum distance.
* If not within the maximum distance, index will be set to -1.
+ * Each element points to at most one neighbour (its nearest within the maximum distance, or -1 if none).
* NB: Given the possible asymmetry in the relationship, it is possible
* that if indexes[i] = j then indexes[j] != i.
*
@@ -43,10 +44,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
* Group indexes if share neighbours in common - Transitive closure
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
* (i,j,k) will form a group and (m,n) will form another group.
- *
- * 3. Merge groups that have indexes in common - If any
- * If there are group with indexes in common, merge them.
- * (Could be improved and put in step 2)
*************************************************************************************/
int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
@@ -70,8 +67,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
});
// 2. Group indexes
- // 3. Merge groups that have indexes in common
- var groupedIndexes = GroupMergeIndexes(indexes);
+ var groupedIndexes = GroupIndexes(indexes);
return groupedIndexes;
}
@@ -101,6 +97,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
* e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
* Only conciders a neighbour if it is within the maximum distance.
* If not within the maximum distance, index will be set to -1.
+ * Each element points to at most one neighbour (its nearest within the maximum distance, or -1 if none).
* NB: Given the possible asymmetry in the relationship, it is possible
* that if indexes[i] = j then indexes[j] != i.
*
@@ -108,10 +105,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
* Group indexes if share neighbours in common - Transitive closure
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
* (i,j,k) will form a group and (m,n) will form another group.
- *
- * 3. Merge groups that have indexes in common - If any
- * If there are group with indexes in common, merge them.
- * (Could be improved and put in step 2)
*************************************************************************************/
int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
@@ -135,108 +128,109 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
});
// 2. Group indexes
- // 3. Merge groups that have indexes in common
- var groupedIndexes = GroupMergeIndexes(indexes);
+ var groupedIndexes = GroupIndexes(indexes);
return groupedIndexes;
}
/// <summary>
- /// Group elements via transitive closure.
+ /// Group elements via transitive closure. Each element points to at most one neighbour (-1 when it has none).
/// https://en.wikipedia.org/wiki/Transitive_closure
/// </summary>
/// <param name="indexes">Array of paired elements index.</param>
/// <returns></returns>
- internal static List> GroupMergeIndexes(int[] indexes)
+ private static List> GroupIndexes(int[] indexes)
{
- // 2. Group indexes
+ int[][] adjacency = new int[indexes.Length][];
+ for (int i = 0; i < indexes.Length; i++)
+ {
+ HashSet matches = new HashSet();
+ for (int j = 0; j < indexes.Length; ++j)
+ {
+ if (indexes[j] == i) matches.Add(j);
+ }
+ adjacency[i] = matches.ToArray();
+ }
+
List> groupedIndexes = new List>();
- HashSet indexDone = new HashSet();
+ bool[] isDone = new bool[indexes.Length];
- for (int e = 0; e < indexes.Length; e++)
+ for (int p = 0; p < indexes.Length; p++)
{
- int index = indexes[e];
+ if (isDone[p]) continue;
- if (index == -1) // This element is not connected
+ LinkedList L = new LinkedList();
+ HashSet grouped = new HashSet();
+ L.AddLast(new[] { p, indexes[p] });
+
+ while (L.Any())
{
- // Check if another element's index is connected to this element (nb: distance measure is asymmetric)
- if (!indexes.Contains(e))
- {
- // If no other element is connected to this element, add it as a standalone element
- groupedIndexes.Add(new HashSet() { e });
- indexDone.Add(e);
- }
- continue;
- }
+ var current = L.First.Value;
+ L.RemoveFirst();
+ var current0 = current[0];
+ var current1 = current[1];
- bool isDoneC = indexDone.Contains(e);
- bool isDoneI = indexDone.Contains(index);
- if (isDoneC || isDoneI)
- {
- if (isDoneC && !isDoneI)
+ if (current0 != -1 && !isDone[current0])
{
- foreach (var pair in groupedIndexes.Where(x => x.Contains(e)))
+ var adjs = adjacency[current0];
+ foreach (var k in adjs)
{
- pair.Add(index);
- }
- indexDone.Add(index);
- }
- else if (!isDoneC && isDoneI)
- {
- foreach (var pair in groupedIndexes.Where(x => x.Contains(index)))
- {
- pair.Add(e);
- }
- indexDone.Add(e);
- }
- else // isDoneC && isDoneI
- {
- foreach (var pair in groupedIndexes.Where(x => x.Contains(index)))
- {
- if (!pair.Contains(e)) pair.Add(e);
+ if (isDone[k]) continue;
+ L.AddLast(new[] { k, current0 });
}
- foreach (var pair in groupedIndexes.Where(x => x.Contains(e)))
+ int current0P = indexes[current0];
+ if (current0P != -1)
{
- if (!pair.Contains(index)) pair.Add(index);
+ var adjsP = adjacency[current0P];
+ foreach (var k in adjsP)
+ {
+ if (isDone[k]) continue;
+ L.AddLast(new[] { k, current0P });
+ isDone[k] = true;
+ grouped.Add(k);
+ }
+ }
+ else
+ {
+ L.AddLast(new[] { current0, current0P });
+ isDone[current0] = true;
+ grouped.Add(current0);
+ }
+ }
+
+ if (current1 != -1 && !isDone[current1])
+ {
+ var adjs = adjacency[current1];
+ foreach (var k in adjs)
+ {
+ if (isDone[k]) continue;
+ L.AddLast(new[] { k, current1 });
+ }
+
+ int current1P = indexes[current1];
+ if (current1P != -1)
+ {
+ var adjsP = adjacency[current1P];
+ foreach (var k in adjsP)
+ {
+ if (isDone[k]) continue;
+ L.AddLast(new[] { k, current1P });
+ isDone[k] = true;
+ grouped.Add(k);
+ }
+ }
+ else
+ {
+ L.AddLast(new[] { current1, current1P });
+ isDone[current1] = true;
+ grouped.Add(current1);
}
}
}
- else
- {
- groupedIndexes.Add(new HashSet() { e, index });
- indexDone.Add(e);
- indexDone.Add(index);
- }
+ groupedIndexes.Add(grouped);
}
- // Check that all elements are done
- if (indexes.Length != indexDone.Count)
- {
- throw new Exception("ClusteringAlgorithms.GetNNGroupedIndexes(): Some elements were not done.");
- }
-
- // 3. Merge groups that have indexes in common
- // Check if duplicates (if duplicates, then same index in different groups)
- if (indexDone.Count != groupedIndexes.SelectMany(x => x).Count())
- {
- for (int e = 0; e < indexes.Length; e++)
- {
- List> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList();
- int count = candidates.Count();
- if (count < 2) continue; // Only one group with this index
-
- HashSet merged = candidates.First();
- groupedIndexes.Remove(merged);
- for (int i = 1; i < count; i++)
- {
- var current = candidates.ElementAt(i);
- merged.UnionWith(current);
- groupedIndexes.Remove(current);
- }
- groupedIndexes.Add(merged);
- }
- }
return groupedIndexes;
}
}
diff --git a/src/UglyToad.PdfPig/Geometry/PdfLine.cs b/src/UglyToad.PdfPig/Geometry/PdfLine.cs
index 81395325..dc281f0b 100644
--- a/src/UglyToad.PdfPig/Geometry/PdfLine.cs
+++ b/src/UglyToad.PdfPig/Geometry/PdfLine.cs
@@ -52,5 +52,28 @@
Point1 = point1;
Point2 = point2;
}
+
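+ /// <summary>
+ /// Returns a value indicating whether this <see cref="PdfLine"/> is equal to a specified object.
+ /// </summary>
+ /// <param name="obj">The object to compare with this line.</param>
+ /// <returns>true if <paramref name="obj"/> is a <see cref="PdfLine"/> whose Point1 and Point2 both equal this line's; otherwise false.</returns>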
+ public override bool Equals(object obj)
+ {
+ if (obj is PdfLine line)
+ {
+ return line.Point1.Equals(this.Point1) && line.Point2.Equals(this.Point2);
+ }
+ return false;
+ }
+
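+ /// <summary>
+ /// Returns the hash code for this <see cref="PdfLine"/>.
+ /// </summary>
+ /// <returns>A hash code computed from the (Point1, Point2) pair.</returns>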
+ public override int GetHashCode()
+ {
+ return (Point1, Point2).GetHashCode();
+ }
}
}