From ffba176060d811dadf7c61f98559ab252d5a1a77 Mon Sep 17 00:00:00 2001
From: BobLd <38405645+BobLd@users.noreply.github.com>
Date: Sun, 12 Oct 2025 20:17:34 +0100
Subject: [PATCH] Improve GroupIndexes() performance with #1178
---
.../Clustering.cs | 74 +++++--------------
1 file changed, 20 insertions(+), 54 deletions(-)
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs
index bd11c7b5..cc39470e 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs
@@ -216,25 +216,26 @@
yield return group.Select(i => elements[i]).ToList();
}
}
-
- ///
- /// Group elements using Depth-first search.
- /// https://en.wikipedia.org/wiki/Depth-first_search
- ///
- /// The graph. edges[i] = j indicates that there is an edge between i and j.
- /// A List of HashSets containing the grouped indexes.
+
internal static List> GroupIndexes(int[] edges)
{
- int[][] adjacency = new int[edges.Length][];
+ // Adjacency lists are built in a single pass over edges instead of a nested scan; see https://github.com/UglyToad/PdfPig/issues/1178
+ var adjacency = new List[edges.Length];
for (int i = 0; i < edges.Length; i++)
{
- HashSet matches = new HashSet();
- if (edges[i] != -1) matches.Add(edges[i]);
- for (int j = 0; j < edges.Length; j++)
+ adjacency[i] = new List();
+ }
+
+ // Single O(n) pass over edges: every entry edges[i] = j with j != -1 links i and j in both directions.
+ for (int i = 0; i < edges.Length; i++)
+ {
+ int j = edges[i];
+ if (j != -1)
{
- if (edges[j] == i) matches.Add(j);
+ // Undirected edge i <-> j: record j as a neighbour of i, and i as a neighbour of j.
+ adjacency[i].Add(j);
+ adjacency[j].Add(i);
}
- adjacency[i] = matches.ToArray();
}
List> groupedIndexes = new List>();
@@ -242,55 +243,20 @@
for (int p = 0; p < edges.Length; p++)
{
- if (isDone[p]) continue;
+ if (isDone[p])
+ {
+ continue;
+ }
groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
}
return groupedIndexes;
}
-
- ///
- /// Group elements using Depth-first search.
- /// https://en.wikipedia.org/wiki/Depth-first_search
- ///
- /// The graph. edges[i] = [j, k, l, ...] indicates that there is an edge between i and each element j, k, l, ...
- /// A List of HashSets containing the grouped indexes.
- internal static List> GroupIndexes(int[][] edges)
- {
- int[][] adjacency = new int[edges.Length][];
- for (int i = 0; i < edges.Length; i++)
- {
- HashSet matches = new HashSet();
- for (int j = 0; j < edges[i].Length; j++)
- {
- if (edges[i][j] != -1) matches.Add(edges[i][j]);
- }
-
- for (int j = 0; j < edges.Length; j++)
- {
- for (int k = 0; k < edges[j].Length; k++)
- {
- if (edges[j][k] == i) matches.Add(j);
- }
- }
- adjacency[i] = matches.ToArray();
- }
-
- List> groupedIndexes = new List>();
- bool[] isDone = new bool[edges.Length];
-
- for (int p = 0; p < edges.Length; p++)
- {
- if (isDone[p]) continue;
- groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
- }
- return groupedIndexes;
- }
-
+
///
/// Depth-first search
/// https://en.wikipedia.org/wiki/Depth-first_search
///
- private static HashSet DfsIterative(int s, int[][] adj, ref bool[] isDone)
+ private static HashSet DfsIterative(int s, List[] adj, ref bool[] isDone)
{
HashSet group = new HashSet();
Stack S = new Stack();