From ffba176060d811dadf7c61f98559ab252d5a1a77 Mon Sep 17 00:00:00 2001 From: BobLd <38405645+BobLd@users.noreply.github.com> Date: Sun, 12 Oct 2025 20:17:34 +0100 Subject: [PATCH] Improve GroupIndexes() performance with #1178 --- .../Clustering.cs | 74 +++++-------------- 1 file changed, 20 insertions(+), 54 deletions(-) diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs index bd11c7b5..cc39470e 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs @@ -216,25 +216,26 @@ yield return group.Select(i => elements[i]).ToList(); } } - - /// - /// Group elements using Depth-first search. - /// https://en.wikipedia.org/wiki/Depth-first_search - /// - /// The graph. edges[i] = j indicates that there is an edge between i and j. - /// A List of HashSets containing the grouped indexes. + internal static List> GroupIndexes(int[] edges) { - int[][] adjacency = new int[edges.Length][]; + // Improved thanks to https://github.com/UglyToad/PdfPig/issues/1178 + var adjacency = new List[edges.Length]; for (int i = 0; i < edges.Length; i++) { - HashSet matches = new HashSet(); - if (edges[i] != -1) matches.Add(edges[i]); - for (int j = 0; j < edges.Length; j++) + adjacency[i] = new List(); + } + + // one pass O(n) + for (int i = 0; i < edges.Length; i++) + { + int j = edges[i]; + if (j != -1) { - if (edges[j] == i) matches.Add(j); + // i <-> j + adjacency[i].Add(j); + adjacency[j].Add(i); } - adjacency[i] = matches.ToArray(); } List> groupedIndexes = new List>(); @@ -242,55 +243,20 @@ for (int p = 0; p < edges.Length; p++) { - if (isDone[p]) continue; + if (isDone[p]) + { + continue; + } groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone)); } return groupedIndexes; } - - /// - /// Group elements using Depth-first search. - /// https://en.wikipedia.org/wiki/Depth-first_search - /// - /// The graph. edges[i] = [j, k, l, ...] indicates that there is an edge between i and each element j, k, l, ... - /// A List of HashSets containing the grouped indexes. - internal static List> GroupIndexes(int[][] edges) - { - int[][] adjacency = new int[edges.Length][]; - for (int i = 0; i < edges.Length; i++) - { - HashSet matches = new HashSet(); - for (int j = 0; j < edges[i].Length; j++) - { - if (edges[i][j] != -1) matches.Add(edges[i][j]); - } - - for (int j = 0; j < edges.Length; j++) - { - for (int k = 0; k < edges[j].Length; k++) - { - if (edges[j][k] == i) matches.Add(j); - } - } - adjacency[i] = matches.ToArray(); - } - - List> groupedIndexes = new List>(); - bool[] isDone = new bool[edges.Length]; - - for (int p = 0; p < edges.Length; p++) - { - if (isDone[p]) continue; - groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone)); - } - return groupedIndexes; - } - + /// /// Depth-first search /// https://en.wikipedia.org/wiki/Depth-first_search /// - private static HashSet DfsIterative(int s, int[][] adj, ref bool[] isDone) + private static HashSet DfsIterative(int s, List[] adj, ref bool[] isDone) { HashSet group = new HashSet(); Stack S = new Stack();