diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
index de016723..0f60779c 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
@@ -23,7 +23,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.
/// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.
/// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.
- internal static IEnumerable> SimpleTransitiveClosure(List elements,
+ internal static IEnumerable> ClusterNearestNeighbours(List elements,
Func distMeasure,
Func maxDistanceFunction,
Func pivotPoint, Func candidatesPoint,
@@ -41,7 +41,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
* that if indexes[i] = j then indexes[j] != i.
*
* 2. Group indexes
- * Group indexes if share neighbours in common - Transitive closure
+ * Group indexes if they share neighbours in common - Depth-first search
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
* (i,j,k) will form a group and (m,n) will form another group.
*************************************************************************************/
@@ -56,12 +56,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
if (filterPivot(pivot))
{
- int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);
- var paired = elements[index];
+ int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist);
- if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
+ if (index != -1)
{
- indexes[e] = index;
+ var paired = elements[index];
+ if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
+ {
+ indexes[e] = index;
+ }
}
}
});
@@ -84,7 +87,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.
/// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.
/// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.
- internal static IEnumerable> SimpleTransitiveClosure(T[] elements,
+ internal static IEnumerable> ClusterNearestNeighbours(T[] elements,
Func distMeasure,
Func maxDistanceFunction,
Func pivotPoint, Func candidatesPoint,
@@ -102,7 +105,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
* that if indexes[i] = j then indexes[j] != i.
*
* 2. Group indexes
- * Group indexes if share neighbours in common - Transitive closure
+ * Group indexes if they share neighbours in common - Depth-first search
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
* (i,j,k) will form a group and (m,n) will form another group.
*************************************************************************************/
@@ -117,12 +120,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
if (filterPivot(pivot))
{
- int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);
- var paired = elements[index];
+ int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist);
- if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
+ if (index != -1)
{
- indexes[e] = index;
+ var paired = elements[index];
+ if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
+ {
+ indexes[e] = index;
+ }
}
}
});
@@ -145,7 +151,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// The candidates' line to use for pairing.
/// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.
/// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.
- internal static IEnumerable> SimpleTransitiveClosure(T[] elements,
+ internal static IEnumerable> ClusterNearestNeighbours(T[] elements,
Func distMeasure,
Func maxDistanceFunction,
Func pivotLine, Func candidatesLine,
@@ -163,7 +169,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
* that if indexes[i] = j then indexes[j] != i.
*
* 2. Group indexes
- * Group indexes if share neighbours in common - Transitive closure
+ * Group indexes if they share neighbours in common - Depth-first search
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
* (i,j,k) will form a group and (m,n) will form another group.
*************************************************************************************/
@@ -178,12 +184,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
if (filterPivot(pivot))
{
- int index = pivotLine(pivot).FindIndexNearest(candidatesLines, distMeasure, out double dist);
- var paired = elements[index];
+ int index = pivot.FindIndexNearest(elements, candidatesLine, pivotLine, distMeasure, out double dist);
- if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
+ if (index != -1)
{
- indexes[e] = index;
+ var paired = elements[index];
+ if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
+ {
+ indexes[e] = index;
+ }
}
}
});
@@ -195,104 +204,98 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
}
///
- /// Group elements via transitive closure. Each element has only one connected neighbour.
- /// https://en.wikipedia.org/wiki/Transitive_closure
+ /// Group elements using Depth-first search.
+ /// https://en.wikipedia.org/wiki/Depth-first_search
///
- /// Array of paired elements index.
- ///
- private static List> GroupIndexes(int[] indexes)
+ /// The graph. edges[i] = j indicates that there is an edge between i and j.
+ /// A List of HashSets containing the grouped indexes.
+ internal static List> GroupIndexes(int[] edges)
{
- int[][] adjacency = new int[indexes.Length][];
- for (int i = 0; i < indexes.Length; i++)
+ int[][] adjacency = new int[edges.Length][];
+ for (int i = 0; i < edges.Length; i++)
{
HashSet matches = new HashSet();
- for (int j = 0; j < indexes.Length; ++j)
+ if (edges[i] != -1) matches.Add(edges[i]);
+ for (int j = 0; j < edges.Length; j++)
{
- if (indexes[j] == i) matches.Add(j);
+ if (edges[j] == i) matches.Add(j);
}
adjacency[i] = matches.ToArray();
}
List> groupedIndexes = new List>();
- bool[] isDone = new bool[indexes.Length];
+ bool[] isDone = new bool[edges.Length];
- for (int p = 0; p < indexes.Length; p++)
+ for (int p = 0; p < edges.Length; p++)
{
if (isDone[p]) continue;
+ groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
+ }
+ return groupedIndexes;
+ }
- LinkedList L = new LinkedList();
- HashSet grouped = new HashSet();
- L.AddLast(new[] { p, indexes[p] });
-
- while (L.Any())
+ ///
+ /// Group elements using Depth-first search.
+ /// https://en.wikipedia.org/wiki/Depth-first_search
+ ///
+ /// The graph. edges[i] = [j, k, l, ...] indicates that there is an edge between i and each element j, k, l, ...
+ /// A List of HashSets containing the grouped indexes.
+ internal static List> GroupIndexes(int[][] edges)
+ {
+ int[][] adjacency = new int[edges.Length][];
+ for (int i = 0; i < edges.Length; i++)
+ {
+ HashSet matches = new HashSet();
+ for (int j = 0; j < edges[i].Length; j++)
{
- var current = L.First.Value;
- L.RemoveFirst();
- var current0 = current[0];
- var current1 = current[1];
+ if (edges[i][j] != -1) matches.Add(edges[i][j]);
+ }
- if (current0 != -1 && !isDone[current0])
+ for (int j = 0; j < edges.Length; j++)
+ {
+ for (int k = 0; k < edges[j].Length; k++)
{
- var adjs = adjacency[current0];
- foreach (var k in adjs)
- {
- if (isDone[k]) continue;
- L.AddLast(new[] { k, current0 });
- }
-
- int current0P = indexes[current0];
- if (current0P != -1)
- {
- var adjsP = adjacency[current0P];
- foreach (var k in adjsP)
- {
- if (isDone[k]) continue;
- L.AddLast(new[] { k, current0P });
- isDone[k] = true;
- grouped.Add(k);
- }
- }
- else
- {
- L.AddLast(new[] { current0, current0P });
- isDone[current0] = true;
- grouped.Add(current0);
- }
- }
-
- if (current1 != -1 && !isDone[current1])
- {
- var adjs = adjacency[current1];
- foreach (var k in adjs)
- {
- if (isDone[k]) continue;
- L.AddLast(new[] { k, current1 });
- }
-
- int current1P = indexes[current1];
- if (current1P != -1)
- {
- var adjsP = adjacency[current1P];
- foreach (var k in adjsP)
- {
- if (isDone[k]) continue;
- L.AddLast(new[] { k, current1P });
- isDone[k] = true;
- grouped.Add(k);
- }
- }
- else
- {
- L.AddLast(new[] { current1, current1P });
- isDone[current1] = true;
- grouped.Add(current1);
- }
+ if (edges[j][k] == i) matches.Add(j);
}
}
- groupedIndexes.Add(grouped);
+ adjacency[i] = matches.ToArray();
}
+ List> groupedIndexes = new List>();
+ bool[] isDone = new bool[edges.Length];
+
+ for (int p = 0; p < edges.Length; p++)
+ {
+ if (isDone[p]) continue;
+ groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
+ }
return groupedIndexes;
}
+
+ ///
+ /// Depth-first search
+ /// https://en.wikipedia.org/wiki/Depth-first_search
+ ///
+ private static HashSet DfsIterative(int c, int[][] adj, ref bool[] isDone)
+ {
+ HashSet group = new HashSet();
+ Stack S = new Stack();
+ S.Push(c);
+
+ while (S.Any())
+ {
+ var v = S.Pop();
+ if (!isDone[v])
+ {
+ group.Add(v);
+ isDone[v] = true;
+ foreach (var w in adj[v])
+ {
+ S.Push(w);
+ }
+ }
+ }
+ return group;
+ }
}
}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
index f099c175..84535e62 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
@@ -81,52 +81,21 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
}
///
- /// Find the nearest point.
+ /// Find the index of the nearest point, excluding itself.
///
- /// The reference point, for which to find the nearest neighbour.
- /// The list of neighbours candidates.
+ ///
+ /// The reference point, for which to find the nearest neighbour.
+ /// The list of neighbours candidates.
+ ///
+ ///
/// The distance measure to use.
/// The distance between reference point, and its nearest neighbour.
- public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList points,
+ ///
+ public static int FindIndexNearest(this T element, IReadOnlyList candidates,
+ Func candidatesPoint, Func pivotPoint,
Func distanceMeasure, out double distance)
{
- if (points == null || points.Count == 0)
- {
- throw new ArgumentException("Distances.FindNearest(): The list of neighbours candidates is either null or empty.", "points");
- }
-
- if (distanceMeasure == null)
- {
- throw new ArgumentException("Distances.FindNearest(): The distance measure must not be null.", "distanceMeasure");
- }
-
- distance = double.MaxValue;
- PdfPoint closestPoint = default;
-
- for (var i = 0; i < points.Count; i++)
- {
- double currentDistance = distanceMeasure(points[i], pdfPoint);
- if (currentDistance < distance)
- {
- distance = currentDistance;
- closestPoint = points[i];
- }
- }
-
- return closestPoint;
- }
-
- ///
- /// Find the index of the nearest point.
- ///
- /// The reference point, for which to find the nearest neighbour.
- /// The list of neighbours candidates.
- /// The distance measure to use.
- /// The distance between reference point, and its nearest neighbour.
- public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList points,
- Func distanceMeasure, out double distance)
- {
- if (points == null || points.Count == 0)
+ if (candidates == null || candidates.Count == 0)
{
throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", "points");
}
@@ -138,11 +107,13 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
distance = double.MaxValue;
int closestPointIndex = -1;
+ var candidatesPoints = candidates.Select(candidatesPoint).ToList();
+ var pivot = pivotPoint(element);
- for (var i = 0; i < points.Count; i++)
+ for (var i = 0; i < candidates.Count; i++)
{
- double currentDistance = distanceMeasure(points[i], pdfPoint);
- if (currentDistance < distance)
+ double currentDistance = distanceMeasure(candidatesPoints[i], pivot);
+ if (currentDistance < distance && !candidates[i].Equals(element))
{
distance = currentDistance;
closestPointIndex = i;
@@ -153,16 +124,20 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
}
///
- /// Find the index of the nearest line.
+ /// Find the index of the nearest line, excluding itself.
///
- /// The reference line, for which to find the nearest neighbour.
- /// The list of neighbours candidates.
+ ///
+ /// The reference line, for which to find the nearest neighbour.
+ /// The list of neighbours candidates.
+ ///
+ ///
/// The distance measure between two lines to use.
/// The distance between reference line, and its nearest neighbour.
- public static int FindIndexNearest(this PdfLine pdfLine, IReadOnlyList lines,
+ public static int FindIndexNearest(this T element, IReadOnlyList candidates,
+ Func candidatesLine, Func pivotLine,
Func distanceMeasure, out double distance)
{
- if (lines == null || lines.Count == 0)
+ if (candidates == null || candidates.Count == 0)
{
throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", "lines");
}
@@ -174,11 +149,13 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
distance = double.MaxValue;
int closestLineIndex = -1;
+ var candidatesLines = candidates.Select(candidatesLine).ToList();
+ var pivot = pivotLine(element);
- for (var i = 0; i < lines.Count; i++)
+ for (var i = 0; i < candidates.Count; i++)
{
- double currentDistance = distanceMeasure(lines[i], pdfLine);
- if (currentDistance < distance)
+ double currentDistance = distanceMeasure(candidatesLines[i], pivot);
+ if (currentDistance < distance && !candidates[i].Equals(element))
{
distance = currentDistance;
closestLineIndex = i;
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs
index fd80cc25..05168b30 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs
@@ -126,6 +126,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
continue;
}
+ // Merge all lines (words)
+ blocks[b] = new TextBlock(GetLines(blocks[b].TextLines.SelectMany(l => l.Words).ToList(),
+ double.MaxValue, withinLine).ToList());
+
for (var c = 0; c < blocks.Count; c++)
{
if (b == c || blocks[c] == null)
@@ -142,8 +146,9 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
// 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
// same block. Filtering will still be done based on angle.
- var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine);
- blocks[b] = new TextBlock(mergedLines.ToList());
+ // Merge all lines (words) sharing same bottom (baseline)
+ var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine).ToList();
+ blocks[b] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList());
// Remove
blocks[c] = null;
@@ -191,7 +196,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
return null;
}
- var closestWordIndex = pointR.FindIndexNearest(wordsWithinAngleBoundDistancePoints, Distances.Euclidean, out _);
+ var closestWordIndex = pointR.FindIndexNearest(wordsWithinAngleBoundDistancePoints, p => p,
+ p => p, Distances.Euclidean, out _);
if (closestWordIndex < 0 || closestWordIndex >= wordsWithinAngleBoundDistancePoints.Count)
{
@@ -206,15 +212,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
///
private static IEnumerable GetLines(List words, double maxDist, AngleBounds withinLine)
{
- /***************************************************************************************************
- * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not
- * work as the FindIndexNearest() function might pair the pivot with itself (the pivot's right point
- * (distance = width) is closer than other words' left point).
- * -> Solution would be to find more than one nearest neighbours. Use KDTree?
- ***************************************************************************************************/
-
TextDirection textDirection = words[0].TextDirection;
- var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean,
+ var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, Distances.Euclidean,
(pivot, candidate) => maxDist,
pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
pivot => true,
@@ -257,11 +256,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
* If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area.
* We finally compute the Euclidean distance between these two middle points.
* If the two lines are not overlapping, the distance is set to the max distance.
- *
- * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'line Height', the algo won't
- * work as the FindIndexNearest() function will always pair the pivot with itself (the pivot's top
- * point (distance = height) is closer than other lines' top point).
- * -> Solution would be to find more than one nearest neighbours. Use KDTree?
**************************************************************************************************/
Func euclidianOverlappingMiddleDistance = (l1, l2) =>
@@ -276,7 +270,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
new PdfPoint(left + d / 2, l2.Point1.Y));
};
- var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines,
+ var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(lines,
euclidianOverlappingMiddleDistance,
(pivot, candidate) => maxDist,
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
index 1038530d..62e212da 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
@@ -102,7 +102,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
Letter[] letters = pageLetters.ToArray();
- var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(letters,
+ var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(letters,
distMeasure, maxDistanceFunction,
l => l.EndBaseLine, l => l.StartBaseLine,
l => !string.IsNullOrWhiteSpace(l.Value),