Update DocstrumBB to account for the middle point of the overlapping area distance. This is done using the distance between two lines.

This commit is contained in:
BobLd
2019-08-11 13:45:08 +01:00
parent c14d77e414
commit 7e8b3bdc85
3 changed files with 215 additions and 25 deletions

View File

@@ -18,11 +18,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
/// <param name="elements">Array of elements to group.</param>
/// <param name="distMeasure">The distance measure between two points.</param>
/// <param name="maxDistanceFunction">The function that determines the distance between to points in the same cluster.</param>
/// <param name="pivotPoint">The pivot's point to use.</param>
/// <param name="candidatesPoint">The candidates to pair point to use.</param>
/// <param name="filterPivot">Filter to apply to the pivot point.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
/// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
Func<PdfPoint, PdfPoint, double> distMeasure,
Func<T, T, double> maxDistanceFunction,
@@ -69,17 +69,97 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
}
});
// 2. Group indexes
// 3. Merge groups that have indexes in common
var groupedIndexes = GroupMergeIndexes(indexes);
return groupedIndexes;
}
/// <summary>
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
/// Each element is represented by a line, and neighbours are searched using a line-to-line distance.
/// https://en.wikipedia.org/wiki/Transitive_closure
/// </summary>
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
/// <param name="elements">Array of elements to group.</param>
/// <param name="distMeasure">The distance measure between two lines.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two elements in the same cluster.</param>
/// <param name="pivotLine">The pivot's line to use for pairing.</param>
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
/// <param name="filterPivot">Filter to apply to the pivot element. If false, the element will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired element. If false, the pivot will not be paired with that element.</param>
/// <returns>The groups of element indexes, as built by <c>GroupMergeIndexes</c> from the nearest-neighbour pairs.</returns>
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
    Func<PdfLine, PdfLine, double> distMeasure,
    Func<T, T, double> maxDistanceFunction,
    Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
    Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
{
    /*************************************************************************************
     * Algorithm steps
     * 1. Find nearest neighbours indexes (done in parallel)
     *  Iterate every element (pivot) and put its nearest neighbour's index in an array
     *  e.g. if nearest neighbour of element i is element j, then indexes[i] = j.
     *  Only considers a neighbour if it is within the maximum distance.
     *  If not within the maximum distance, index will be set to -1.
     *  NB: Given the possible asymmetry in the relationship, it is possible
     *  that if indexes[i] = j then indexes[j] != i.
     *
     * 2. Group indexes
     *  Group indexes if share neighbours in common - Transitive closure
     *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
     *  (i,j,k) will form a group and (m,n) will form another group.
     *
     * 3. Merge groups that have indexes in common - If any
     *  If there are group with indexes in common, merge them.
     *  (Could be improved and put in step 2)
     *************************************************************************************/

    // -1 means "not paired"; every slot starts unpaired.
    int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();

    // Compute each candidate line once, up front, instead of once per pivot.
    var candidatesLines = elements.Select(candidatesLine).ToList();

    // 1. Find nearest neighbours indexes
    // Each iteration only writes to its own slot indexes[e], so iterations do not write to the same slot.
    Parallel.For(0, elements.Length, e =>
    {
        var pivot = elements[e];
        if (!filterPivot(pivot))
        {
            return; // pivot is filtered out: stays unpaired (-1)
        }

        int index = pivotLine(pivot).FindIndexNearest(candidatesLines, distMeasure, out double dist);

        // NOTE(review): FindIndexNearest is defined elsewhere. If it reports "no neighbour found"
        // with a negative index (e.g. when there is no other candidate), indexing elements[index]
        // would throw; leave the pivot unpaired instead - TODO confirm against the helper.
        if (index < 0 || index >= elements.Length)
        {
            return;
        }

        var paired = elements[index];
        if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
        {
            indexes[e] = index;
        }
    });

    // 2. Group indexes
    // 3. Merge groups that have indexes in common
    var groupedIndexes = GroupMergeIndexes(indexes);
    return groupedIndexes;
}
/// <summary>
/// Group elements via transitive closure.
/// https://en.wikipedia.org/wiki/Transitive_closure
/// </summary>
/// <param name="indexes">Array of paired elements index.</param>
/// <returns></returns>
internal static List<HashSet<int>> GroupMergeIndexes(int[] indexes)
{
// 2. Group indexes
List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
HashSet<int> indexDone = new HashSet<int>();
for (int e = 0; e < elements.Length; e++)
for (int e = 0; e < indexes.Length; e++)
{
int index = indexes[e];
if (index == -1) // This element is not connected
{
// Check if another element index is connected to this element (nb: distance measure is asymetric)
// Check if another element's index is connected to this element (nb: distance measure is asymmetric)
if (!indexes.Contains(e))
{
// If no other element is connected to this element, add it as a standalone element
@@ -131,7 +211,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
}
// Check that all elements are done
if (elements.Length != indexDone.Count)
if (indexes.Length != indexDone.Count)
{
throw new Exception("ClusteringAlgorithms.GetNNGroupedIndexes(): Some elements were not done.");
}
@@ -140,7 +220,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
// Check if duplicates (if duplicates, then same index in different groups)
if (indexDone.Count != groupedIndexes.SelectMany(x => x).Count())
{
for (int e = 0; e < elements.Length; e++)
for (int e = 0; e < indexes.Length; e++)
{
List<HashSet<int>> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList();
int count = candidates.Count();
@@ -157,7 +237,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
groupedIndexes.Add(merged);
}
}
return groupedIndexes;
}
}