mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-23 04:36:44 +08:00
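Update DocstrumBB to measure the distance between text lines at the middle point of their horizontally overlapping area. To do this, the clustering now uses a distance measure between two lines instead of between two points.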
This commit is contained in:
@@ -18,11 +18,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
|
||||
/// <param name="elements">Array of elements to group.</param>
|
||||
/// <param name="distMeasure">The distance measure between two points.</param>
|
||||
/// <param name="maxDistanceFunction">The function that determines the distance between to points in the same cluster.</param>
|
||||
/// <param name="pivotPoint">The pivot's point to use.</param>
|
||||
/// <param name="candidatesPoint">The candidates to pair point to use.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point.</param>
|
||||
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
|
||||
/// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
|
||||
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
|
||||
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
|
||||
Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
Func<T, T, double> maxDistanceFunction,
|
||||
@@ -69,17 +69,97 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
}
|
||||
});
|
||||
|
||||
// 2. Group indexes
|
||||
// 3. Merge groups that have indexes in common
|
||||
var groupedIndexes = GroupMergeIndexes(indexes);
|
||||
|
||||
return groupedIndexes;
|
||||
}
|
||||
|
||||
/// <summary>
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
/// Each element is represented by a line (rather than a point) when looking for its nearest neighbour.
/// https://en.wikipedia.org/wiki/Transitive_closure
/// </summary>
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
/// <param name="elements">Array of elements to group.</param>
/// <param name="distMeasure">The distance measure between two lines.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two elements in the same cluster.</param>
/// <param name="pivotLine">The pivot's line to use for pairing.</param>
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
/// <param name="filterPivot">Filter to apply to the pivot element. If false, the element will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired element. If false, the pivot will not be paired with that element.</param>
/// <returns>The groups of element indexes, as produced by <c>GroupMergeIndexes</c>.</returns>
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
    Func<PdfLine, PdfLine, double> distMeasure,
    Func<T, T, double> maxDistanceFunction,
    Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
    Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
{
    /*************************************************************************************
     * Algorithm steps
     * 1. Find nearest neighbours indexes (done in parallel)
     *  Iterate every element (pivot) and put its nearest neighbour's index in an array
     *  e.g. if nearest neighbour of element i is element j, then indexes[i] = j.
     *  Only considers a neighbour if it is within the maximum distance.
     *  If not within the maximum distance, index will be set to -1.
     *  NB: Given the possible asymmetry in the relationship, it is possible
     *  that if indexes[i] = j then indexes[j] != i.
     *
     * 2. Group indexes
     *  Group indexes if share neighbours in common - Transitive closure
     *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
     *  (i,j,k) will form a group and (m,n) will form another group.
     *
     * 3. Merge groups that have indexes in common - If any
     *  If there are group with indexes in common, merge them.
     *  (Could be improved and put in step 2)
     *************************************************************************************/

    // -1 means "not paired"; every slot starts unpaired.
    int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();

    // Compute each candidate line once, up front, instead of once per pivot.
    var candidatesLines = elements.Select(x => candidatesLine(x)).ToList();

    // 1. Find nearest neighbours indexes
    // Each iteration only writes to its own slot indexes[e].
    Parallel.For(0, elements.Length, e =>
    {
        var pivot = elements[e];

        if (!filterPivot(pivot))
        {
            return;
        }

        int index = pivotLine(pivot).FindIndexNearest(candidatesLines, distMeasure, out double dist);

        // FindIndexNearest returns -1 when no candidate is strictly closer than double.MaxValue
        // (e.g. the distance measure returns double.MaxValue for every candidate).
        // Leave the pivot unpaired instead of indexing the array with -1.
        if (index < 0)
        {
            return;
        }

        var paired = elements[index];

        if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
        {
            indexes[e] = index;
        }
    });

    // 2. Group indexes
    // 3. Merge groups that have indexes in common
    var groupedIndexes = GroupMergeIndexes(indexes);

    return groupedIndexes;
}
|
||||
|
||||
/// <summary>
|
||||
/// Group elements via transitive closure.
|
||||
/// https://en.wikipedia.org/wiki/Transitive_closure
|
||||
/// </summary>
|
||||
/// <param name="indexes">Array of paired elements index.</param>
|
||||
/// <returns></returns>
|
||||
internal static List<HashSet<int>> GroupMergeIndexes(int[] indexes)
|
||||
{
|
||||
// 2. Group indexes
|
||||
List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
|
||||
HashSet<int> indexDone = new HashSet<int>();
|
||||
|
||||
for (int e = 0; e < elements.Length; e++)
|
||||
for (int e = 0; e < indexes.Length; e++)
|
||||
{
|
||||
int index = indexes[e];
|
||||
|
||||
if (index == -1) // This element is not connected
|
||||
{
|
||||
// Check if another element index is connected to this element (nb: distance measure is asymetric)
|
||||
// Check if another element's index is connected to this element (nb: distance measure is asymmetric)
|
||||
if (!indexes.Contains(e))
|
||||
{
|
||||
// If no other element is connected to this element, add it as a standalone element
|
||||
@@ -131,7 +211,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
}
|
||||
|
||||
// Check that all elements are done
|
||||
if (elements.Length != indexDone.Count)
|
||||
if (indexes.Length != indexDone.Count)
|
||||
{
|
||||
throw new Exception("ClusteringAlgorithms.GetNNGroupedIndexes(): Some elements were not done.");
|
||||
}
|
||||
@@ -140,7 +220,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
// Check if duplicates (if duplicates, then same index in different groups)
|
||||
if (indexDone.Count != groupedIndexes.SelectMany(x => x).Count())
|
||||
{
|
||||
for (int e = 0; e < elements.Length; e++)
|
||||
for (int e = 0; e < indexes.Length; e++)
|
||||
{
|
||||
List<HashSet<int>> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList();
|
||||
int count = candidates.Count();
|
||||
@@ -157,7 +237,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
groupedIndexes.Add(merged);
|
||||
}
|
||||
}
|
||||
|
||||
return groupedIndexes;
|
||||
}
|
||||
}
|
||||
|
@@ -86,7 +86,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
|
||||
/// <param name="points">The list of neighbours candidates.</param>
|
||||
/// <param name="distanceMeasure">The distance measure to use.</param>
|
||||
/// <param name="distance">The distance between reference point, and its nearest neighbour</param>
|
||||
/// <param name="distance">The distance between reference point, and its nearest neighbour.</param>
|
||||
public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
|
||||
Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
|
||||
{
|
||||
@@ -122,7 +122,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
|
||||
/// <param name="points">The list of neighbours candidates.</param>
|
||||
/// <param name="distanceMeasure">The distance measure to use.</param>
|
||||
/// <param name="distance">The distance between reference point, and its nearest neighbour</param>
|
||||
/// <param name="distance">The distance between reference point, and its nearest neighbour.</param>
|
||||
public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
|
||||
Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
|
||||
{
|
||||
@@ -151,5 +151,41 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
return closestPointIndex;
|
||||
}
|
||||
|
||||
/// <summary>
/// Find the index of the nearest line.
/// </summary>
/// <param name="pdfLine">The reference line, for which to find the nearest neighbour.</param>
/// <param name="lines">The list of neighbours candidates.</param>
/// <param name="distanceMeasure">The distance measure between two lines to use. It is called with the candidate as first argument and the reference line as second argument.</param>
/// <param name="distance">The distance between reference line, and its nearest neighbour. Left at <see cref="double.MaxValue"/> when no candidate is found.</param>
/// <returns>
/// The index in <paramref name="lines"/> of the nearest candidate (the first one in case of ties),
/// or -1 if no candidate has a distance strictly smaller than <see cref="double.MaxValue"/>.
/// </returns>
/// <exception cref="ArgumentException">
/// <paramref name="lines"/> is null or empty, or <paramref name="distanceMeasure"/> is null.
/// </exception>
public static int FindIndexNearest(this PdfLine pdfLine, IReadOnlyList<PdfLine> lines,
    Func<PdfLine, PdfLine, double> distanceMeasure, out double distance)
{
    if (lines == null || lines.Count == 0)
    {
        throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", nameof(lines));
    }

    if (distanceMeasure == null)
    {
        throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", nameof(distanceMeasure));
    }

    distance = double.MaxValue;
    int closestLineIndex = -1;

    for (var i = 0; i < lines.Count; i++)
    {
        double currentDistance = distanceMeasure(lines[i], pdfLine);

        // Strict comparison: the earliest candidate wins ties, and a candidate whose
        // distance is double.MaxValue (or NaN) is never selected.
        if (currentDistance < distance)
        {
            distance = currentDistance;
            closestLineIndex = i;
        }
    }

    return closestLineIndex;
}
|
||||
}
|
||||
}
|
||||
|
@@ -77,11 +77,48 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
// 2. Find lines of text
|
||||
double maxDistWL = Math.Min(3 * withinLineDistance, Math.Sqrt(2) * betweenLineDistance);
|
||||
var lines = GetLines(pageWordsArr, maxDistWL).ToArray();
|
||||
var lines = GetLines(pageWordsArr, maxDistWL, wlAngleLB, wlAngleUB).ToArray();
|
||||
|
||||
// 3. Find blocks of text
|
||||
double maxDistBL = blMultiplier * betweenLineDistance;
|
||||
return GetLinesGroups(lines, maxDistBL).ToList();
|
||||
var blocks = GetLinesGroups(lines, maxDistBL).ToList();
|
||||
|
||||
// 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
|
||||
for (int b = 0; b < blocks.Count; b++)
|
||||
{
|
||||
if (blocks[b] == null) continue;
|
||||
|
||||
for (int c = 0; c < blocks.Count; c++)
|
||||
{
|
||||
if (b == c) continue;
|
||||
if (blocks[c] == null) continue;
|
||||
|
||||
if (AreRectangleOverlapping(blocks[b].BoundingBox, blocks[c].BoundingBox))
|
||||
{
|
||||
// Merge
|
||||
// 1. Merge all words
|
||||
var mergedWords = new List<Word>(blocks[b].TextLines.SelectMany(l => l.Words));
|
||||
mergedWords.AddRange(blocks[c].TextLines.SelectMany(l => l.Words));
|
||||
|
||||
// 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
|
||||
// same block. Filtering will still be done based on angle.
|
||||
var mergedLines = GetLines(mergedWords.ToArray(), wlAngleLB, wlAngleUB, double.MaxValue);
|
||||
blocks[b] = new TextBlock(mergedLines.ToList());
|
||||
|
||||
// Remove
|
||||
blocks[c] = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return blocks.Where(b => b != null).ToList();
|
||||
}
|
||||
|
||||
/// <summary>
/// Checks whether two rectangles overlap, by comparing their Left/Right and Top/Bottom extents.
/// Rectangles that only touch along an edge are reported as overlapping, as the comparisons are strict.
/// </summary>
/// <param name="rectangle1">The first rectangle.</param>
/// <param name="rectangle2">The second rectangle.</param>
/// <returns>False if one rectangle lies entirely to one side of, above or below the other; true otherwise.</returns>
private bool AreRectangleOverlapping(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    // NOTE(review): presumably Top >= Bottom and Right >= Left for both rectangles - confirm against PdfRectangle.
    // Horizontal separation is tested first; vertical separation is only tested when the
    // rectangles are not horizontally apart.
    return !(rectangle1.Left > rectangle2.Right || rectangle2.Left > rectangle1.Right)
        && !(rectangle1.Top < rectangle2.Bottom || rectangle2.Top < rectangle1.Bottom);
}
|
||||
|
||||
/// <summary>
|
||||
@@ -104,6 +141,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
Func<PdfPoint, PdfPoint, double> finalDistMEasure)
|
||||
{
|
||||
var pointR = funcPivotDist(pivot.BoundingBox);
|
||||
|
||||
// Filter by angle
|
||||
var filtered = words.Where(w =>
|
||||
{
|
||||
var angleWL = Distances.Angle(funcPivotAngle(pivot.BoundingBox), funcPointsAngle(w.BoundingBox));
|
||||
@@ -135,18 +174,27 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// </summary>
|
||||
/// <param name="words"></param>
|
||||
/// <param name="maxDist"></param>
|
||||
/// <param name="wlAngleLB"></param>
|
||||
/// <param name="wlAngleUB"></param>
|
||||
/// <returns></returns>
|
||||
private IEnumerable<TextLine> GetLines(Word[] words, double maxDist)
|
||||
private IEnumerable<TextLine> GetLines(Word[] words, double maxDist, double wlAngleLB, double wlAngleUB)
|
||||
{
|
||||
/***************************************************************************************************
|
||||
* /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not
|
||||
* work as the FindIndexNearest() function might pair the pivot with itself (the pivot's right point
|
||||
* (distance = width) is closer than other words' left point).
|
||||
* -> Solution would be to find more than one nearest neighbours. Use KDTree?
|
||||
***************************************************************************************************/
|
||||
|
||||
TextDirection textDirection = words[0].TextDirection;
|
||||
var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean,
|
||||
(w1, w2) => maxDist,
|
||||
w => w.BoundingBox.BottomRight, w => w.BoundingBox.BottomLeft,
|
||||
w => true,
|
||||
(w1, w2) =>
|
||||
(pivot, candidate) => maxDist,
|
||||
pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
|
||||
pivot => true,
|
||||
(pivot, candidate) =>
|
||||
{
|
||||
var angleWL = Distances.Angle(w1.BoundingBox.BottomRight, w2.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle
|
||||
return (angleWL >= -30 && angleWL <= 30);
|
||||
var angleWL = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle
|
||||
return (angleWL >= wlAngleLB && angleWL <= wlAngleUB);
|
||||
}).ToList();
|
||||
|
||||
Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
|
||||
@@ -177,10 +225,37 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <returns></returns>
|
||||
private IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
|
||||
{
|
||||
var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines, Distances.Euclidean,
|
||||
(l1, l2) => maxDist,
|
||||
l => l.BoundingBox.TopLeft, l => l.BoundingBox.BottomLeft,
|
||||
l => true, (l1, l2) => true).ToList();
|
||||
/**************************************************************************************************
|
||||
* We want to measure the distance between two lines using the following method:
|
||||
* We check if two lines are overlapping horizontally.
|
||||
* If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area.
|
||||
* We finally compute the Euclidean distance between these two middle points.
|
||||
* If the two lines are not overlapping, the distance is set to the max distance.
|
||||
*
|
||||
* /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'line Height', the algo won't
|
||||
* work as the FindIndexNearest() function will always pair the pivot with itself (the pivot's top
|
||||
* point (distance = height) is closer than other lines' top point).
|
||||
* -> Solution would be to find more than one nearest neighbours. Use KDTree?
|
||||
**************************************************************************************************/
|
||||
|
||||
Func<PdfLine, PdfLine, double> euclidianOverlappingMiddleDistance = (l1, l2) =>
|
||||
{
|
||||
var left = Math.Max(l1.Point1.X, l2.Point1.X);
|
||||
var d = (Math.Min(l1.Point2.X, l2.Point2.X) - left);
|
||||
|
||||
if (d < 0) return double.MaxValue; // not overlapping -> max distance
|
||||
|
||||
return Distances.Euclidean(
|
||||
new PdfPoint(left + d / 2, l1.Point1.Y),
|
||||
new PdfPoint(left + d / 2, l2.Point1.Y));
|
||||
};
|
||||
|
||||
var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines,
|
||||
euclidianOverlappingMiddleDistance,
|
||||
(pivot, candidate) => maxDist,
|
||||
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
|
||||
candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
|
||||
pivot => true, (pivot, candidate) => true).ToList();
|
||||
|
||||
for (int a = 0; a < groupedIndexes.Count(); a++)
|
||||
{
|
||||
|
Reference in New Issue
Block a user