mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-24 13:23:40 +08:00
Document Layout Analysis - IPageSegmenter, Docstrum
- Create a TextBlock class - Creates IPageSegmenter - Add other useful distances: angle, etc. - Update RecursiveXYCut - With IPageSegmenter and TextBlock - Make XYNode and XYLeaf internal - Optimise (faster) NearestNeighbourWordExtractor and isolate the clustering algorithms for use outside of this class - Implement a Docstrum inspired page segmentation algorithm
This commit is contained in:
@@ -0,0 +1,164 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using UglyToad.PdfPig.Geometry;
|
||||
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
/// <summary>
|
||||
/// Clustering Algorithms.
|
||||
/// </summary>
|
||||
internal class ClusteringAlgorithms
|
||||
{
|
||||
/// <summary>
|
||||
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
|
||||
/// https://en.wikipedia.org/wiki/Transitive_closure
|
||||
/// </summary>
|
||||
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
|
||||
/// <param name="elements">Array of elements to group.</param>
|
||||
/// <param name="distMeasure">The distance measure between two points.</param>
|
||||
/// <param name="maxDistanceFunction">The function that determines the distance between to points in the same cluster.</param>
|
||||
/// <param name="pivotPoint">The pivot's point to use.</param>
|
||||
/// <param name="candidatesPoint">The candidates to pair point to use.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point.</param>
|
||||
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
|
||||
Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
Func<T, T, double> maxDistanceFunction,
|
||||
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
|
||||
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
|
||||
{
|
||||
/*************************************************************************************
|
||||
* Algorithm steps
|
||||
* 1. Find nearest neighbours indexes (done in parallel)
|
||||
* Iterate every point (pivot) and put its nearest neighbour's index in an array
|
||||
* e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
|
||||
* Only conciders a neighbour if it is within the maximum distance.
|
||||
* If not within the maximum distance, index will be set to -1.
|
||||
* NB: Given the possible asymmetry in the relationship, it is possible
|
||||
* that if indexes[i] = j then indexes[j] != i.
|
||||
*
|
||||
* 2. Group indexes
|
||||
* Group indexes if share neighbours in common - Transitive closure
|
||||
* e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
|
||||
* (i,j,k) will form a group and (m,n) will form another group.
|
||||
*
|
||||
* 3. Merge groups that have indexes in common - If any
|
||||
* If there are group with indexes in common, merge them.
|
||||
* (Could be improved and put in step 2)
|
||||
*************************************************************************************/
|
||||
|
||||
int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
|
||||
var candidatesPoints = elements.Select(x => candidatesPoint(x)).ToList();
|
||||
|
||||
// 1. Find nearest neighbours indexes
|
||||
Parallel.For(0, elements.Length, e =>
|
||||
{
|
||||
var pivot = elements[e];
|
||||
|
||||
if (filterPivot(pivot))
|
||||
{
|
||||
int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);
|
||||
var paired = elements[index];
|
||||
|
||||
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
|
||||
{
|
||||
indexes[e] = index;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// 2. Group indexes
|
||||
List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
|
||||
HashSet<int> indexDone = new HashSet<int>();
|
||||
|
||||
for (int e = 0; e < elements.Length; e++)
|
||||
{
|
||||
int index = indexes[e];
|
||||
|
||||
if (index == -1) // This element is not connected
|
||||
{
|
||||
// Check if another element index is connected to this element (nb: distance measure is asymetric)
|
||||
if (!indexes.Contains(e))
|
||||
{
|
||||
// If no other element is connected to this element, add it as a standalone element
|
||||
groupedIndexes.Add(new HashSet<int>() { e });
|
||||
indexDone.Add(e);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
bool isDoneC = indexDone.Contains(e);
|
||||
bool isDoneI = indexDone.Contains(index);
|
||||
if (isDoneC || isDoneI)
|
||||
{
|
||||
if (isDoneC && !isDoneI)
|
||||
{
|
||||
foreach (var pair in groupedIndexes.Where(x => x.Contains(e)))
|
||||
{
|
||||
pair.Add(index);
|
||||
}
|
||||
indexDone.Add(index);
|
||||
}
|
||||
else if (!isDoneC && isDoneI)
|
||||
{
|
||||
foreach (var pair in groupedIndexes.Where(x => x.Contains(index)))
|
||||
{
|
||||
pair.Add(e);
|
||||
}
|
||||
indexDone.Add(e);
|
||||
}
|
||||
else // isDoneC && isDoneI
|
||||
{
|
||||
foreach (var pair in groupedIndexes.Where(x => x.Contains(index)))
|
||||
{
|
||||
if (!pair.Contains(e)) pair.Add(e);
|
||||
}
|
||||
|
||||
foreach (var pair in groupedIndexes.Where(x => x.Contains(e)))
|
||||
{
|
||||
if (!pair.Contains(index)) pair.Add(index);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
groupedIndexes.Add(new HashSet<int>() { e, index });
|
||||
indexDone.Add(e);
|
||||
indexDone.Add(index);
|
||||
}
|
||||
}
|
||||
|
||||
// Check that all elements are done
|
||||
if (elements.Length != indexDone.Count)
|
||||
{
|
||||
throw new Exception("ClusteringAlgorithms.GetNNGroupedIndexes(): Some elements were not done.");
|
||||
}
|
||||
|
||||
// 3. Merge groups that have indexes in common
|
||||
// Check if duplicates (if duplicates, then same index in different groups)
|
||||
if (indexDone.Count != groupedIndexes.SelectMany(x => x).Count())
|
||||
{
|
||||
for (int e = 0; e < elements.Length; e++)
|
||||
{
|
||||
List<HashSet<int>> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList();
|
||||
int count = candidates.Count();
|
||||
if (count < 2) continue; // Only one group with this index
|
||||
|
||||
HashSet<int> merged = candidates.First();
|
||||
groupedIndexes.Remove(merged);
|
||||
for (int i = 1; i < count; i++)
|
||||
{
|
||||
var current = candidates.ElementAt(i);
|
||||
merged.UnionWith(current);
|
||||
groupedIndexes.Remove(current);
|
||||
}
|
||||
groupedIndexes.Add(merged);
|
||||
}
|
||||
}
|
||||
|
||||
return groupedIndexes;
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user