mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 11:44:51 +08:00
merge pull request #93 from BobLd/master
improving clustering algorithm
This commit is contained in:
@@ -12,8 +12,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
internal class ClusteringAlgorithms
|
||||
{
|
||||
/// <summary>
|
||||
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
|
||||
/// https://en.wikipedia.org/wiki/Transitive_closure
|
||||
/// Algorithm to group elements using nearest neighbours.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
|
||||
/// <param name="elements">List of elements to group.</param>
|
||||
@@ -23,7 +22,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
|
||||
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> elements,
|
||||
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(List<T> elements,
|
||||
Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
Func<T, T, double> maxDistanceFunction,
|
||||
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
|
||||
@@ -41,7 +40,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
* that if indexes[i] = j then indexes[j] != i.
|
||||
*
|
||||
* 2. Group indexes
|
||||
* Group indexes if share neighbours in common - Transitive closure
|
||||
* Group indexes if share neighbours in common - Depth-first search
|
||||
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
|
||||
* (i,j,k) will form a group and (m,n) will form another group.
|
||||
*************************************************************************************/
|
||||
@@ -56,12 +55,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
if (filterPivot(pivot))
|
||||
{
|
||||
int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);
|
||||
var paired = elements[index];
|
||||
int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist);
|
||||
|
||||
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
|
||||
if (index != -1)
|
||||
{
|
||||
indexes[e] = index;
|
||||
var paired = elements[index];
|
||||
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
|
||||
{
|
||||
indexes[e] = index;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -73,8 +75,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
|
||||
/// https://en.wikipedia.org/wiki/Transitive_closure
|
||||
/// Algorithm to group elements using nearest neighbours.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
|
||||
/// <param name="elements">Array of elements to group.</param>
|
||||
@@ -84,7 +85,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
|
||||
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
|
||||
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(T[] elements,
|
||||
Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
Func<T, T, double> maxDistanceFunction,
|
||||
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
|
||||
@@ -102,7 +103,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
* that if indexes[i] = j then indexes[j] != i.
|
||||
*
|
||||
* 2. Group indexes
|
||||
* Group indexes if share neighbours in common - Transitive closure
|
||||
* Group indexes if share neighbours in common - Depth-first search
|
||||
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
|
||||
* (i,j,k) will form a group and (m,n) will form another group.
|
||||
*************************************************************************************/
|
||||
@@ -117,12 +118,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
if (filterPivot(pivot))
|
||||
{
|
||||
int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);
|
||||
var paired = elements[index];
|
||||
int index = pivot.FindIndexNearest(elements, candidatesPoint, pivotPoint, distMeasure, out double dist);
|
||||
|
||||
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
|
||||
if (index != -1)
|
||||
{
|
||||
indexes[e] = index;
|
||||
var paired = elements[index];
|
||||
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
|
||||
{
|
||||
indexes[e] = index;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -134,8 +138,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
|
||||
/// https://en.wikipedia.org/wiki/Transitive_closure
|
||||
/// Algorithm to group elements using nearest neighbours.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
|
||||
/// <param name="elements">Array of elements to group.</param>
|
||||
@@ -145,7 +148,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
|
||||
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
|
||||
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(T[] elements,
|
||||
Func<PdfLine, PdfLine, double> distMeasure,
|
||||
Func<T, T, double> maxDistanceFunction,
|
||||
Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
|
||||
@@ -163,7 +166,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
* that if indexes[i] = j then indexes[j] != i.
|
||||
*
|
||||
* 2. Group indexes
|
||||
* Group indexes if share neighbours in common - Transitive closure
|
||||
* Group indexes if share neighbours in common - Depth-first search
|
||||
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
|
||||
* (i,j,k) will form a group and (m,n) will form another group.
|
||||
*************************************************************************************/
|
||||
@@ -178,12 +181,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
if (filterPivot(pivot))
|
||||
{
|
||||
int index = pivotLine(pivot).FindIndexNearest(candidatesLines, distMeasure, out double dist);
|
||||
var paired = elements[index];
|
||||
int index = pivot.FindIndexNearest(elements, candidatesLine, pivotLine, distMeasure, out double dist);
|
||||
|
||||
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
|
||||
if (index != -1)
|
||||
{
|
||||
indexes[e] = index;
|
||||
var paired = elements[index];
|
||||
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
|
||||
{
|
||||
indexes[e] = index;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -195,104 +201,98 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Group elements via transitive closure. Each element has only one connected neighbour.
|
||||
/// https://en.wikipedia.org/wiki/Transitive_closure
|
||||
/// Group elements using Depth-first search.
|
||||
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
|
||||
/// </summary>
|
||||
/// <param name="indexes">Array of paired elements index.</param>
|
||||
/// <returns></returns>
|
||||
private static List<HashSet<int>> GroupIndexes(int[] indexes)
|
||||
/// <param name="edges">The graph. edges[i] = j indicates that there is an edge between i and j.</param>
|
||||
/// <returns>A List of HashSets containing the grouped indexes.</returns>
|
||||
internal static List<HashSet<int>> GroupIndexes(int[] edges)
|
||||
{
|
||||
int[][] adjacency = new int[indexes.Length][];
|
||||
for (int i = 0; i < indexes.Length; i++)
|
||||
int[][] adjacency = new int[edges.Length][];
|
||||
for (int i = 0; i < edges.Length; i++)
|
||||
{
|
||||
HashSet<int> matches = new HashSet<int>();
|
||||
for (int j = 0; j < indexes.Length; ++j)
|
||||
if (edges[i] != -1) matches.Add(edges[i]);
|
||||
for (int j = 0; j < edges.Length; j++)
|
||||
{
|
||||
if (indexes[j] == i) matches.Add(j);
|
||||
if (edges[j] == i) matches.Add(j);
|
||||
}
|
||||
adjacency[i] = matches.ToArray();
|
||||
}
|
||||
|
||||
List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
|
||||
bool[] isDone = new bool[indexes.Length];
|
||||
bool[] isDone = new bool[edges.Length];
|
||||
|
||||
for (int p = 0; p < indexes.Length; p++)
|
||||
for (int p = 0; p < edges.Length; p++)
|
||||
{
|
||||
if (isDone[p]) continue;
|
||||
groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
|
||||
}
|
||||
return groupedIndexes;
|
||||
}
|
||||
|
||||
LinkedList<int[]> L = new LinkedList<int[]>();
|
||||
HashSet<int> grouped = new HashSet<int>();
|
||||
L.AddLast(new[] { p, indexes[p] });
|
||||
|
||||
while (L.Any())
|
||||
/// <summary>
|
||||
/// Group elements using Depth-first search.
|
||||
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
|
||||
/// </summary>
|
||||
/// <param name="edges">The graph. edges[i] = [j, k, l, ...] indicates that there is an edge between i and each element j, k, l, ...</param>
|
||||
/// <returns>A List of HashSets containing the grouped indexes.</returns>
|
||||
internal static List<HashSet<int>> GroupIndexes(int[][] edges)
|
||||
{
|
||||
int[][] adjacency = new int[edges.Length][];
|
||||
for (int i = 0; i < edges.Length; i++)
|
||||
{
|
||||
HashSet<int> matches = new HashSet<int>();
|
||||
for (int j = 0; j < edges[i].Length; j++)
|
||||
{
|
||||
var current = L.First.Value;
|
||||
L.RemoveFirst();
|
||||
var current0 = current[0];
|
||||
var current1 = current[1];
|
||||
if (edges[i][j] != -1) matches.Add(edges[i][j]);
|
||||
}
|
||||
|
||||
if (current0 != -1 && !isDone[current0])
|
||||
for (int j = 0; j < edges.Length; j++)
|
||||
{
|
||||
for (int k = 0; k < edges[j].Length; k++)
|
||||
{
|
||||
var adjs = adjacency[current0];
|
||||
foreach (var k in adjs)
|
||||
{
|
||||
if (isDone[k]) continue;
|
||||
L.AddLast(new[] { k, current0 });
|
||||
}
|
||||
|
||||
int current0P = indexes[current0];
|
||||
if (current0P != -1)
|
||||
{
|
||||
var adjsP = adjacency[current0P];
|
||||
foreach (var k in adjsP)
|
||||
{
|
||||
if (isDone[k]) continue;
|
||||
L.AddLast(new[] { k, current0P });
|
||||
isDone[k] = true;
|
||||
grouped.Add(k);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
L.AddLast(new[] { current0, current0P });
|
||||
isDone[current0] = true;
|
||||
grouped.Add(current0);
|
||||
}
|
||||
}
|
||||
|
||||
if (current1 != -1 && !isDone[current1])
|
||||
{
|
||||
var adjs = adjacency[current1];
|
||||
foreach (var k in adjs)
|
||||
{
|
||||
if (isDone[k]) continue;
|
||||
L.AddLast(new[] { k, current1 });
|
||||
}
|
||||
|
||||
int current1P = indexes[current1];
|
||||
if (current1P != -1)
|
||||
{
|
||||
var adjsP = adjacency[current1P];
|
||||
foreach (var k in adjsP)
|
||||
{
|
||||
if (isDone[k]) continue;
|
||||
L.AddLast(new[] { k, current1P });
|
||||
isDone[k] = true;
|
||||
grouped.Add(k);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
L.AddLast(new[] { current1, current1P });
|
||||
isDone[current1] = true;
|
||||
grouped.Add(current1);
|
||||
}
|
||||
if (edges[j][k] == i) matches.Add(j);
|
||||
}
|
||||
}
|
||||
groupedIndexes.Add(grouped);
|
||||
adjacency[i] = matches.ToArray();
|
||||
}
|
||||
|
||||
List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
|
||||
bool[] isDone = new bool[edges.Length];
|
||||
|
||||
for (int p = 0; p < edges.Length; p++)
|
||||
{
|
||||
if (isDone[p]) continue;
|
||||
groupedIndexes.Add(DfsIterative(p, adjacency, ref isDone));
|
||||
}
|
||||
return groupedIndexes;
|
||||
}
|
||||
|
||||
/// <summary>
/// Iterative depth-first search that collects every not-yet-visited vertex reachable from a start vertex.
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="c">Index of the vertex the search starts from.</param>
/// <param name="adj">Adjacency lists: adj[v] holds the indexes of the vertices adjacent to v.</param>
/// <param name="isDone">Visited flags, indexed by vertex. Each vertex added to the returned group is flagged
/// as done; vertices already flagged on entry are never added and never expanded.</param>
/// <returns>The vertices reached from <paramref name="c"/> that were not already done
/// (empty when <paramref name="c"/> itself was already done).</returns>
private static HashSet<int> DfsIterative(int c, int[][] adj, ref bool[] isDone)
{
    // 'ref' is kept only so existing call sites (which pass 'ref isDone') keep compiling;
    // the array reference itself is never reassigned here.
    HashSet<int> group = new HashSet<int>();
    Stack<int> pending = new Stack<int>();
    pending.Push(c);

    while (pending.Count > 0)
    {
        int v = pending.Pop();
        if (isDone[v])
        {
            // A vertex can be pushed several times before it is popped for the first time.
            continue;
        }

        group.Add(v);
        isDone[v] = true;

        foreach (int w in adj[v])
        {
            // Already-visited neighbours would be discarded when popped, so they are not pushed at all.
            if (!isDone[w])
            {
                pending.Push(w);
            }
        }
    }

    return group;
}
|
||||
}
|
||||
}
|
||||
|
@@ -81,52 +81,21 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Find the nearest point.
|
||||
/// Find the index of the nearest point, excluding itself.
|
||||
/// </summary>
|
||||
/// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
|
||||
/// <param name="points">The list of neighbours candidates.</param>
|
||||
/// <typeparam name="T"></typeparam>
|
||||
/// <param name="element">The reference point, for which to find the nearest neighbour.</param>
|
||||
/// <param name="candidates">The list of neighbours candidates.</param>
|
||||
/// <param name="candidatesPoint">Function returning the point of a candidate that is compared against the pivot point.</param>
|
||||
/// <param name="pivotPoint">Function returning the point of the reference element from which distances are measured.</param>
|
||||
/// <param name="distanceMeasure">The distance measure to use.</param>
|
||||
/// <param name="distance">The distance between reference point, and its nearest neighbour.</param>
|
||||
public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
|
||||
/// <returns></returns>
|
||||
internal static int FindIndexNearest<T>(this T element, IReadOnlyList<T> candidates,
|
||||
Func<T, PdfPoint> candidatesPoint, Func<T, PdfPoint> pivotPoint,
|
||||
Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
|
||||
{
|
||||
if (points == null || points.Count == 0)
|
||||
{
|
||||
throw new ArgumentException("Distances.FindNearest(): The list of neighbours candidates is either null or empty.", "points");
|
||||
}
|
||||
|
||||
if (distanceMeasure == null)
|
||||
{
|
||||
throw new ArgumentException("Distances.FindNearest(): The distance measure must not be null.", "distanceMeasure");
|
||||
}
|
||||
|
||||
distance = double.MaxValue;
|
||||
PdfPoint closestPoint = default;
|
||||
|
||||
for (var i = 0; i < points.Count; i++)
|
||||
{
|
||||
double currentDistance = distanceMeasure(points[i], pdfPoint);
|
||||
if (currentDistance < distance)
|
||||
{
|
||||
distance = currentDistance;
|
||||
closestPoint = points[i];
|
||||
}
|
||||
}
|
||||
|
||||
return closestPoint;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Find the index of the nearest point.
|
||||
/// </summary>
|
||||
/// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
|
||||
/// <param name="points">The list of neighbours candidates.</param>
|
||||
/// <param name="distanceMeasure">The distance measure to use.</param>
|
||||
/// <param name="distance">The distance between reference point, and its nearest neighbour.</param>
|
||||
public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
|
||||
Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
|
||||
{
|
||||
if (points == null || points.Count == 0)
|
||||
if (candidates == null || candidates.Count == 0)
|
||||
{
|
||||
throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", "points");
|
||||
}
|
||||
@@ -138,11 +107,13 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
distance = double.MaxValue;
|
||||
int closestPointIndex = -1;
|
||||
var candidatesPoints = candidates.Select(candidatesPoint).ToList();
|
||||
var pivot = pivotPoint(element);
|
||||
|
||||
for (var i = 0; i < points.Count; i++)
|
||||
for (var i = 0; i < candidates.Count; i++)
|
||||
{
|
||||
double currentDistance = distanceMeasure(points[i], pdfPoint);
|
||||
if (currentDistance < distance)
|
||||
double currentDistance = distanceMeasure(candidatesPoints[i], pivot);
|
||||
if (currentDistance < distance && !candidates[i].Equals(element))
|
||||
{
|
||||
distance = currentDistance;
|
||||
closestPointIndex = i;
|
||||
@@ -153,16 +124,20 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Find the index of the nearest line.
|
||||
/// Find the index of the nearest line, excluding itself.
|
||||
/// </summary>
|
||||
/// <param name="pdfLine">The reference line, for which to find the nearest neighbour.</param>
|
||||
/// <param name="lines">The list of neighbours candidates.</param>
|
||||
/// <typeparam name="T"></typeparam>
|
||||
/// <param name="element">The reference line, for which to find the nearest neighbour.</param>
|
||||
/// <param name="candidates">The list of neighbours candidates.</param>
|
||||
/// <param name="candidatesLine">Function returning the line of a candidate that is compared against the pivot line.</param>
|
||||
/// <param name="pivotLine">Function returning the line of the reference element from which distances are measured.</param>
|
||||
/// <param name="distanceMeasure">The distance measure between two lines to use.</param>
|
||||
/// <param name="distance">The distance between reference line, and its nearest neighbour.</param>
|
||||
public static int FindIndexNearest(this PdfLine pdfLine, IReadOnlyList<PdfLine> lines,
|
||||
internal static int FindIndexNearest<T>(this T element, IReadOnlyList<T> candidates,
|
||||
Func<T, PdfLine> candidatesLine, Func<T, PdfLine> pivotLine,
|
||||
Func<PdfLine, PdfLine, double> distanceMeasure, out double distance)
|
||||
{
|
||||
if (lines == null || lines.Count == 0)
|
||||
if (candidates == null || candidates.Count == 0)
|
||||
{
|
||||
throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", "lines");
|
||||
}
|
||||
@@ -174,11 +149,13 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
distance = double.MaxValue;
|
||||
int closestLineIndex = -1;
|
||||
var candidatesLines = candidates.Select(candidatesLine).ToList();
|
||||
var pivot = pivotLine(element);
|
||||
|
||||
for (var i = 0; i < lines.Count; i++)
|
||||
for (var i = 0; i < candidates.Count; i++)
|
||||
{
|
||||
double currentDistance = distanceMeasure(lines[i], pdfLine);
|
||||
if (currentDistance < distance)
|
||||
double currentDistance = distanceMeasure(candidatesLines[i], pivot);
|
||||
if (currentDistance < distance && !candidates[i].Equals(element))
|
||||
{
|
||||
distance = currentDistance;
|
||||
closestLineIndex = i;
|
||||
|
@@ -126,6 +126,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
continue;
|
||||
}
|
||||
|
||||
// Merge all lines (words)
|
||||
blocks[b] = new TextBlock(GetLines(blocks[b].TextLines.SelectMany(l => l.Words).ToList(),
|
||||
double.MaxValue, withinLine).ToList());
|
||||
|
||||
for (var c = 0; c < blocks.Count; c++)
|
||||
{
|
||||
if (b == c || blocks[c] == null)
|
||||
@@ -142,8 +146,9 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
// 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
|
||||
// same block. Filtering will still be done based on angle.
|
||||
var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine);
|
||||
blocks[b] = new TextBlock(mergedLines.ToList());
|
||||
// Merge all lines (words) sharing same bottom (baseline)
|
||||
var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine).ToList();
|
||||
blocks[b] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList());
|
||||
|
||||
// Remove
|
||||
blocks[c] = null;
|
||||
@@ -191,7 +196,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
return null;
|
||||
}
|
||||
|
||||
var closestWordIndex = pointR.FindIndexNearest(wordsWithinAngleBoundDistancePoints, Distances.Euclidean, out _);
|
||||
var closestWordIndex = pointR.FindIndexNearest(wordsWithinAngleBoundDistancePoints, p => p,
|
||||
p => p, Distances.Euclidean, out _);
|
||||
|
||||
if (closestWordIndex < 0 || closestWordIndex >= wordsWithinAngleBoundDistancePoints.Count)
|
||||
{
|
||||
@@ -201,20 +207,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
return finalDistanceMeasure(pointR, wordsWithinAngleBoundDistancePoints[closestWordIndex]);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Build lines via transitive closure.
|
||||
/// </summary>
|
||||
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
|
||||
{
|
||||
/***************************************************************************************************
|
||||
* /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not
|
||||
* work as the FindIndexNearest() function might pair the pivot with itself (the pivot's right point
|
||||
* (distance = width) is closer than other words' left point).
|
||||
* -> Solution would be to find more than one nearest neighbours. Use KDTree?
|
||||
***************************************************************************************************/
|
||||
|
||||
TextDirection textDirection = words[0].TextDirection;
|
||||
var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean,
|
||||
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, Distances.Euclidean,
|
||||
(pivot, candidate) => maxDist,
|
||||
pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
|
||||
pivot => true,
|
||||
@@ -246,9 +242,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Build blocks via transitive closure.
|
||||
/// </summary>
|
||||
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
|
||||
{
|
||||
/**************************************************************************************************
|
||||
@@ -257,11 +250,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
* If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area.
|
||||
* We finally compute the Euclidean distance between these two middle points.
|
||||
* If the two lines are not overlapping, the distance is set to the max distance.
|
||||
*
|
||||
* /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'line Height', the algo won't
|
||||
* work as the FindIndexNearest() function will always pair the pivot with itself (the pivot's top
|
||||
* point (distance = height) is closer than other lines' top point).
|
||||
* -> Solution would be to find more than one nearest neighbours. Use KDTree?
|
||||
**************************************************************************************************/
|
||||
|
||||
Func<PdfLine, PdfLine, double> euclidianOverlappingMiddleDistance = (l1, l2) =>
|
||||
@@ -276,7 +264,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
new PdfPoint(left + d / 2, l2.Point1.Y));
|
||||
};
|
||||
|
||||
var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines,
|
||||
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(lines,
|
||||
euclidianOverlappingMiddleDistance,
|
||||
(pivot, candidate) => maxDist,
|
||||
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
|
||||
|
@@ -102,7 +102,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
Letter[] letters = pageLetters.ToArray();
|
||||
|
||||
var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(letters,
|
||||
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(letters,
|
||||
distMeasure, maxDistanceFunction,
|
||||
l => l.EndBaseLine, l => l.StartBaseLine,
|
||||
l => !string.IsNullOrWhiteSpace(l.Value),
|
||||
|
Reference in New Issue
Block a user