diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs
index 072469a5..bec3a6da 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs
@@ -13,6 +13,7 @@
{
///
/// Algorithm to group elements using nearest neighbours.
+ /// Uses the nearest neighbour as candidate.
///
/// Letter, Word, TextLine, etc.
/// Elements to group.
@@ -61,7 +62,7 @@
if (filterPivot(pivot))
{
- var paired = kdTree.FindNearestNeighbours(pivot, pivotPoint, distMeasure, out int index, out double dist);
+ var paired = kdTree.FindNearestNeighbour(pivot, pivotPoint, distMeasure, out int index, out double dist);
if (index != -1)
{
@@ -77,6 +78,77 @@
return GroupIndexes(indexes);
}
+ ///
+ /// Algorithm to group elements using nearest neighbours.
+ /// Uses the k-nearest neighbours as candidates.
+ ///
+ /// Letter, Word, TextLine, etc.
+ /// Elements to group.
+ /// The k-nearest neighbours to consider as candidates.
+ /// The distance measure between two points.
+ /// The function that determines the maximum distance between two points in the same cluster.
+ /// The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.
+ /// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.
+ /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.
+ /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.
+ internal static IEnumerable> ClusterNearestNeighbours(IReadOnlyList elements, int k,
+ Func distMeasure,
+ Func maxDistanceFunction,
+ Func pivotPoint, Func candidatesPoint,
+ Func filterPivot, Func filterFinal,
+ int maxDegreeOfParallelism)
+ {
+ /*************************************************************************************
+ * Algorithm steps
+ * 1. Find nearest neighbours indexes (done in parallel)
+ * Iterate every point (pivot) and scan its k nearest neighbours, closest first;
+ * the first accepted one is stored, e.g. if it is point j for pivot i, then indexes[i] = j.
+ * Only considers a neighbour if it passes filterFinal and is within the maximum distance.
+ * If no candidate qualifies (or filterPivot rejects the pivot), the index stays -1.
+ * Each element has only one connected neighbour.
+ * NB: Given the possible asymmetry in the relationship, it is possible
+ * that if indexes[i] = j then indexes[j] != i.
+ *
+ * 2. Group indexes
+ * Group indexes if share neighbours in common - Depth-first search
+ * e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
+ * (i,j,k) will form a group and (m,n) will form another group.
+ *************************************************************************************/
+
+ int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray(); // -1 means "no neighbour linked"
+ KdTree kdTree = new KdTree(elements, candidatesPoint); // the tree is built over the candidates' points
+
+ ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
+
+ // 1. Find nearest neighbours indexes
+ Parallel.For(0, elements.Count, parallelOptions, e => // each iteration writes only to indexes[e]
+ {
+ var pivot = elements[e];
+
+ if (filterPivot(pivot))
+ {
+ var paired = kdTree.FindNearestNeighbours(pivot, k, pivotPoint, distMeasure); // (element, index, distance) tuples
+
+ foreach (var c in paired)
+ {
+ var filter = filterFinal(pivot, c.Item1);
+ var maxDist = maxDistanceFunction(pivot, c.Item1);
+ if (filter && c.Item3 < maxDist) // strict comparison: a candidate exactly at maxDist is rejected
+ {
+ indexes[e] = c.Item2;
+ break; // stop at the first accepted candidate
+ }
+ }
+ }
+ });
+
+ // 2. Group indexes
+ return GroupIndexes(indexes);
+ }
+
///
/// Algorithm to group elements using nearest neighbours.
///
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs
index 50a41bdc..8e75be44 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/KdTree.cs
@@ -14,13 +14,20 @@
public PdfPoint FindNearestNeighbours(PdfPoint pivot, Func distanceMeasure, out int index, out double distance)
{
- return FindNearestNeighbours(pivot, p => p, distanceMeasure, out index, out distance);
+ return FindNearestNeighbour(pivot, p => p, distanceMeasure, out index, out distance);
+ }
+
+ public IReadOnlyList<(PdfPoint, int, double)> FindNearestNeighbours(PdfPoint pivot, int k, Func distanceMeasure)
+ {
+ return FindNearestNeighbours(pivot, k, p => p, distanceMeasure);
}
}
internal class KdTree
{
- private KdTreeNode Root;
+ private readonly KdTreeNode Root;
+
+ public readonly int Count;
public KdTree(IReadOnlyList candidates, Func candidatesPointFunc)
{
@@ -29,6 +36,7 @@
throw new ArgumentException("KdTree(): candidates cannot be null or empty.", nameof(candidates));
}
+ Count = candidates.Count;
Root = BuildTree(Enumerable.Range(0, candidates.Count).Zip(candidates, (e, p) => (e, candidatesPointFunc(p), p)).ToArray(), 0);
}
@@ -67,23 +75,23 @@
#region NN
///
- ///
+ /// Get the nearest neighbour to the pivot element.
///
- ///
+ /// The element for which to find the nearest neighbour.
///
///
/// The nearest neighbour's index (returns -1 if not found).
/// The distance between the pivot and the nearest neighbour (returns if not found).
/// The nearest neighbour's element.
- public T FindNearestNeighbours(T pivot, Func pivotPointFunc, Func distanceMeasure, out int index, out double distance)
+ public T FindNearestNeighbour(T pivot, Func pivotPointFunc, Func distanceMeasure, out int index, out double distance)
{
- var result = FindNearestNeighbours(Root, pivot, pivotPointFunc, distanceMeasure);
+ var result = FindNearestNeighbour(Root, pivot, pivotPointFunc, distanceMeasure);
index = result.Item1 != null ? result.Item1.Index : -1;
- distance = result.Item2.HasValue ? result.Item2.Value : double.NaN;
+ distance = result.Item2 ?? double.NaN;
return result.Item1 != null ? result.Item1.Element : default;
}
- private static (KdTreeNode, double?) FindNearestNeighbours(KdTreeNode node, T pivot, Func pivotPointFunc, Func distance)
+ private static (KdTreeNode, double?) FindNearestNeighbour(KdTreeNode node, T pivot, Func pivotPointFunc, Func distance)
{
if (node == null)
{
@@ -111,7 +119,7 @@
if (pointValue < node.L)
{
// start left
- (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, pivotPointFunc, distance);
+ (newNode, newDist) = FindNearestNeighbour(node.LeftChild, pivot, pivotPointFunc, distance);
if (newDist.HasValue && newDist <= currentDistance && !newNode.Element.Equals(pivot))
{
@@ -121,13 +129,13 @@
if (node.RightChild != null && pointValue + currentDistance >= node.L)
{
- (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, pivotPointFunc, distance);
+ (newNode, newDist) = FindNearestNeighbour(node.RightChild, pivot, pivotPointFunc, distance);
}
}
else
{
// start right
- (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, pivotPointFunc, distance);
+ (newNode, newDist) = FindNearestNeighbour(node.RightChild, pivot, pivotPointFunc, distance);
if (newDist.HasValue && newDist <= currentDistance && !newNode.Element.Equals(pivot))
{
@@ -137,7 +145,7 @@
if (node.LeftChild != null && pointValue - currentDistance <= node.L)
{
- (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, pivotPointFunc, distance);
+ (newNode, newDist) = FindNearestNeighbour(node.LeftChild, pivot, pivotPointFunc, distance);
}
}
@@ -152,6 +160,170 @@
}
#endregion
+ #region k-NN
+ /*****************************************************************************
+ * WARNING: k-nearest neighbours algo will need more checks and tests.
+ *****************************************************************************/
+
+ ///
+ /// Get the k nearest neighbours to the pivot element. If elements are equidistant, they are counted as one.
+ ///
+ /// The element for which to find the k nearest neighbours.
+ /// The number of neighbours to return. If elements are equidistant, they are counted as one.
+ ///
+ ///
+ /// Returns a list of tuples of the k nearest neighbours. Tuples are (element, index, distance).
+ public IReadOnlyList<(T, int, double)> FindNearestNeighbours(T pivot, int k, Func pivotPointFunc, Func distanceMeasure)
+ {
+ if (k == 1)
+ {
+ // k == 1: delegate to the single nearest-neighbour search so that no KNearestNeighboursQueue is created
+ var nn = FindNearestNeighbour(pivot, pivotPointFunc, distanceMeasure, out int index, out double distance);
+ if (index == -1) // -1 signals that no neighbour was found
+ {
+ return EmptyArray<(T, int, double)>.Instance;
+ }
+ return new List<(T, int, double)>() { (nn, index, distance) };
+ }
+ else // NOTE(review): k is not validated here; values below 1 reach the queue constructor unchanged
+ {
+ var kdTreeNodes = new KNearestNeighboursQueue(k);
+ FindNearestNeighbours(Root, pivot, k, pivotPointFunc, distanceMeasure, kdTreeNodes); // fills kdTreeNodes; the returned tuple is ignored
+ return kdTreeNodes.SelectMany(n => n.Value.Select(e => (e.Element, e.Index, n.Key))).ToList(); // flatten each distance bucket; n.Key is the distance
+ }
+ }
+
+ private static (KdTreeNode, double) FindNearestNeighbours(KdTreeNode node, T pivot, int k,
+ Func pivotPointFunc, Func distance, KNearestNeighboursQueue queue)
+ { // Recursively visits the subtree under node and records candidates in queue; returns a (node, distance) pair the caller may re-offer, or (null, NaN) when there is none.
+ if (node == null)
+ {
+ return (null, double.NaN);
+ }
+ else if (node.IsLeaf)
+ {
+ if (node.Element.Equals(pivot)) // the pivot is never its own neighbour
+ {
+ return (null, double.NaN);
+ }
+
+ var currentDistance = distance(node.Value, pivotPointFunc(pivot));
+ var currentNearestNode = node;
+
+ if (!queue.IsFull || currentDistance <= queue.LastDistance)
+ {
+ queue.Add(currentDistance, currentNearestNode);
+ currentDistance = queue.LastDistance; // continue with the largest distance currently kept
+ currentNearestNode = queue.LastElement;
+ }
+
+ return (currentNearestNode, currentDistance);
+ }
+ else
+ {
+ var point = pivotPointFunc(pivot);
+ var currentNearestNode = node;
+ var currentDistance = distance(node.Value, point);
+ if ((!queue.IsFull || currentDistance <= queue.LastDistance) && !node.Element.Equals(pivot)) // fix: an inner node may hold the pivot itself; do not record it as its own neighbour
+ {
+ queue.Add(currentDistance, currentNearestNode);
+ currentDistance = queue.LastDistance;
+ currentNearestNode = queue.LastElement;
+ }
+
+ KdTreeNode newNode = null;
+ double newDist = double.NaN;
+
+ var pointValue = node.IsAxisCutX ? point.X : point.Y; // coordinate of the pivot point along this node's cut axis
+
+ if (pointValue < node.L)
+ {
+ // start left
+ (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, k, pivotPointFunc, distance, queue);
+
+ if (!double.IsNaN(newDist) && newDist <= currentDistance && !newNode.Element.Equals(pivot))
+ {
+ queue.Add(newDist, newNode); // the queue ignores a node it already holds at this distance
+ currentDistance = queue.LastDistance;
+ currentNearestNode = queue.LastElement;
+ }
+
+ if (node.RightChild != null && pointValue + currentDistance >= node.L) // the other side is visited only if the current radius reaches the cut
+ {
+ (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, k, pivotPointFunc, distance, queue);
+ }
+ }
+ else
+ {
+ // start right
+ (newNode, newDist) = FindNearestNeighbours(node.RightChild, pivot, k, pivotPointFunc, distance, queue);
+
+ if (!double.IsNaN(newDist) && newDist <= currentDistance && !newNode.Element.Equals(pivot))
+ {
+ queue.Add(newDist, newNode);
+ currentDistance = queue.LastDistance;
+ currentNearestNode = queue.LastElement;
+ }
+
+ if (node.LeftChild != null && pointValue - currentDistance <= node.L) // the other side is visited only if the current radius reaches the cut
+ {
+ (newNode, newDist) = FindNearestNeighbours(node.LeftChild, pivot, k, pivotPointFunc, distance, queue);
+ }
+ }
+
+ if (!double.IsNaN(newDist) && newDist <= currentDistance && !newNode.Element.Equals(pivot)) // result of whichever subtree was searched last
+ {
+ queue.Add(newDist, newNode);
+ currentDistance = queue.LastDistance;
+ currentNearestNode = queue.LastElement;
+ }
+
+ return (currentNearestNode, currentDistance);
+ }
+ }
+
+ private class KNearestNeighboursQueue : SortedList>> // keyed by distance; each key holds the set of nodes found at that distance
+ {
+ public readonly int K; // maximum number of distinct distances kept
+
+ public KdTreeNode LastElement { get; private set; } // a node taken from the bucket with the largest kept distance (set in Add)
+
+ public double LastDistance { get; private set; } // largest kept distance; PositiveInfinity until the first successful Add
+
+ public bool IsFull => Count >= K; // Count is the number of distinct distances, not the number of nodes
+
+ public KNearestNeighboursQueue(int k) : base(k)
+ {
+ K = k;
+ LastDistance = double.PositiveInfinity;
+ }
+
+ public void Add(double key, KdTreeNode value)
+ {
+ if (key > LastDistance && IsFull) // farther than everything kept and no room left: ignore
+ {
+ return;
+ }
+
+ if (!ContainsKey(key)) // first node seen at this distance: open a new bucket
+ {
+ base.Add(key, new HashSet>());
+ if (Count > K)
+ {
+ RemoveAt(Count - 1); // over capacity: drop the bucket with the largest distance. NOTE(review): with K < 1 this removes the bucket just added
+ }
+ }
+
+ if (this[key].Add(value)) // HashSet.Add returns false when the node is already in the bucket
+ {
+ var last = this.Last(); // entry with the largest distance
+ LastElement = last.Value.Last();
+ LastDistance = last.Key;
+ }
+ }
+ }
+ #endregion
+
private class KdTreeLeaf : KdTreeNode
{
public override bool IsLeaf => true;
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
index 92a432db..9a624a45 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
@@ -108,38 +108,39 @@
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
- // 1. Estimate in line and between line spacing
+ // 1. Estimate within line and between line spacing
+ KdTree kdTreeWL = new KdTree(wordsList, w => w.BoundingBox.BottomLeft);
+ KdTree kdTreeBL = new KdTree(wordsList, w => w.BoundingBox.TopLeft);
+
Parallel.For(0, wordsList.Count, parallelOptions, i =>
{
var word = wordsList[i];
// Within-line distance
- var pointsWithinLine = GetNearestPointDistance(wordsList, word,
- bb => bb.BottomRight, bb => bb.BottomRight,
- bb => bb.BottomLeft, bb => bb.BottomLeft,
- withinLine, Distances.Horizontal);
-
- if (pointsWithinLine != null)
+ var neighbourWL = kdTreeWL.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomRight, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 0.5));
+ foreach (var n in neighbourWL)
{
- withinLineDistList.Add(pointsWithinLine.Value);
+ if (withinLine.Contains(Distances.Angle(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft)))
+ {
+ withinLineDistList.Add(Distances.Horizontal(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft));
+ }
}
// Between-line distance
- var pointsBetweenLine = GetNearestPointDistance(wordsList, word,
- bb => bb.BottomLeft, bb => bb.Centroid,
- bb => bb.TopLeft, bb => bb.Centroid,
- betweenLine, Distances.Vertical);
-
- if (pointsBetweenLine != null)
+ var neighbourBL = kdTreeBL.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomLeft, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 50));
+ foreach (var n in neighbourBL)
{
- betweenLineDistList.Add(pointsBetweenLine.Value);
+ if (betweenLine.Contains(Distances.Angle(word.BoundingBox.Centroid, n.Item1.BoundingBox.Centroid)))
+ {
+ betweenLineDistList.Add(Distances.Vertical(word.BoundingBox.BottomLeft, n.Item1.BoundingBox.TopLeft));
+ }
}
});
double? withinLineDistance = GetPeakAverageDistance(withinLineDistList);
double? betweenLineDistance = GetPeakAverageDistance(betweenLineDistList);
- if (withinLineDistance == null || betweenLineDistance == null)
+ if (!withinLineDistance.HasValue || !betweenLineDistance.HasValue)
{
return new[] { new TextBlock(new[] { new TextLine(wordsList) }) };
}
@@ -193,69 +194,15 @@
return blocks.Where(b => b != null).ToList();
}
- ///
- /// Get information on the nearest point, filtered for angle.
- ///
- private double? GetNearestPointDistance(List words, Word pivot, Func funcPivotDist, Func funcPivotAngle,
- Func funcPointsDist, Func funcPointsAngle,
- AngleBounds angleBounds,
- Func finalDistanceMeasure)
- {
- var pointR = funcPivotDist(pivot.BoundingBox);
-
- var pivotPoint = funcPivotAngle(pivot.BoundingBox);
-
- var wordsWithinAngleBoundDistancePoints = new List();
-
- // Filter to words within the angle range.
- foreach (var word in words)
- {
- // Ignore the pivot word.
- if (ReferenceEquals(word, pivot))
- {
- continue;
- }
-
- var angle = Distances.Angle(pivotPoint, funcPointsAngle(word.BoundingBox));
-
- if (angleBounds.Contains(angle))
- {
- wordsWithinAngleBoundDistancePoints.Add(funcPointsDist(word.BoundingBox));
- }
- }
-
- if (wordsWithinAngleBoundDistancePoints.Count == 0)
- {
- return null;
- }
-
- var closestWordIndex = Distances.FindIndexNearest(pointR, wordsWithinAngleBoundDistancePoints, p => p,
- p => p, Distances.Euclidean, out _);
-
- if (closestWordIndex < 0 || closestWordIndex >= wordsWithinAngleBoundDistancePoints.Count)
- {
- return null;
- }
-
- return finalDistanceMeasure(pointR, wordsWithinAngleBoundDistancePoints[closestWordIndex]);
- }
-
private static IEnumerable GetLines(List words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism)
{
TextDirection textDirection = words[0].TextDirection;
- var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, Distances.Euclidean,
- (pivot, candidate) => maxDist,
- pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
- pivot => true,
- (pivot, candidate) =>
- {
- // Compare bottom right with bottom left for angle
- var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
-
- return (withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper);
- },
- maxDegreeOfParallelism).ToList();
+ var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, 2, Distances.Euclidean,
+ (pivot, candidate) => maxDist,
+ pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
+ pivot => true,
+ (pivot, candidate) => withinLine.Contains(Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft)),
+ maxDegreeOfParallelism).ToList();
Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
if (textDirection == TextDirection.Rotate180)
@@ -287,7 +234,7 @@
* If the two lines are not overlapping, the distance is set to the max distance.
**************************************************************************************************/
- Func euclidianOverlappingMiddleDistance = (l1, l2) =>
+ double euclidianOverlappingMiddleDistance(PdfLine l1, PdfLine l2)
{
var left = Math.Max(l1.Point1.X, l2.Point1.X);
var d = (Math.Min(l1.Point2.X, l2.Point2.X) - left);
@@ -295,9 +242,9 @@
if (d < 0) return double.MaxValue; // not overlapping -> max distance
return Distances.Euclidean(
- new PdfPoint(left + d / 2, l1.Point1.Y),
- new PdfPoint(left + d / 2, l2.Point1.Y));
- };
+ new PdfPoint(left + d / 2, l1.Point1.Y),
+ new PdfPoint(left + d / 2, l2.Point1.Y));
+ }
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(lines,
euclidianOverlappingMiddleDistance,
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs
index b08164a0..1788639a 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs
@@ -254,7 +254,7 @@
#region Sorted Queue
private class QueueEntries : SortedSet
{
- int bound;
+ readonly int bound;
public QueueEntries(int maximumBound)
{
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
index 756eafec..05989bfe 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
@@ -31,14 +31,14 @@
/// The letters in the page.
public IEnumerable GetWords(IReadOnlyList letters)
{
- Func baseMaxFunc = (l1, l2) =>
+ double baseMaxFunc(Letter l1, Letter l2)
{
return Math.Max(Math.Max(Math.Max(
Math.Abs(l1.GlyphRectangle.Width),
Math.Abs(l2.GlyphRectangle.Width)),
- Math.Abs(l1.Width)),
+ Math.Abs(l1.Width)),
Math.Abs(l2.Width));
- };
+ }
List wordsH = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Horizontal).ToList(),
diff --git a/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs b/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs
index b557661e..3da754de 100644
--- a/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs
+++ b/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs
@@ -382,9 +382,15 @@
}
else
{
- if (!rectangle.Normalise().IntersectsWith(other.Normalise()))
+ var r1 = rectangle.Normalise();
+ var r2 = other.Normalise();
+ if (Math.Abs(r1.Rotation) < epsilon && Math.Abs(r2.Rotation) < epsilon)
{
- return false;
+ // check rotation to avoid stackoverflow
+ if (!r1.IntersectsWith(r2))
+ {
+ return false;
+ }
}
if (rectangle.Contains(other.BottomLeft)) return true;