mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-01-09 14:04:35 +08:00
completely rework DocstrumBoundingBoxes, now handle rotated text
This commit is contained in:
@@ -2,17 +2,17 @@
|
||||
{
|
||||
using Content;
|
||||
using Core;
|
||||
using Geometry;
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
|
||||
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// The Document Spectrum (Docstrum) algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
|
||||
/// clustering of connected components extracted from the document.
|
||||
/// The Document Spectrum (Docstrum) algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
|
||||
/// clustering of connected components extracted from the document.
|
||||
/// This implementation leverages bounding boxes and does not exactly replicate the original algorithm.
|
||||
/// <para>See 'The document spectrum for page layout analysis.' by L. O'Gorman.</para>
|
||||
/// </summary>
|
||||
@@ -25,274 +25,499 @@
|
||||
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// Get the blocks.
|
||||
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
|
||||
/// Get the blocks using default options values.
|
||||
/// </summary>
|
||||
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
|
||||
{
|
||||
return GetBlocks(words, -1);
|
||||
return GetBlocks(words, new DocstrumBoundingBoxesOptions());
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// Get the blocks.
|
||||
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
|
||||
/// Get the blocks using options values.
|
||||
/// </summary>
|
||||
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <param name="options">The <see cref="DocstrumBoundingBoxesOptions"/> to use.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, int maxDegreeOfParallelism)
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
|
||||
{
|
||||
return GetBlocks(words, new AngleBounds(-30, 30), new AngleBounds(-135, -45), 1.3, maxDegreeOfParallelism);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks. See original paper for more information.
|
||||
/// </summary>
|
||||
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
|
||||
/// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
|
||||
/// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
|
||||
/// text lines for blocking. Maximum distance will be this number times the between-line
|
||||
/// distance found by the analysis.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
|
||||
AngleBounds betweenLine, double betweenLineMultiplier)
|
||||
{
|
||||
return GetBlocks(words, withinLine, betweenLine, betweenLineMultiplier, -1);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks. See original paper for more information.
|
||||
/// </summary>
|
||||
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
|
||||
/// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
|
||||
/// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
|
||||
/// text lines for blocking. Maximum distance will be this number times the between-line
|
||||
/// distance found by the analysis.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
|
||||
AngleBounds betweenLine, double betweenLineMultiplier, int maxDegreeOfParallelism)
|
||||
{
|
||||
if (words == null)
|
||||
if (options is DocstrumBoundingBoxesOptions dbbOptions)
|
||||
{
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
|
||||
var wordsList = new List<Word>();
|
||||
|
||||
foreach (var word in words)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(word.Text))
|
||||
if (words?.Any() != true)
|
||||
{
|
||||
continue;
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
|
||||
wordsList.Add(word);
|
||||
return GetBlocks(words.ToList(),
|
||||
dbbOptions.WithinLineBounds, dbbOptions.WithinLineMultiplier, dbbOptions.WithinLineBinSize,
|
||||
dbbOptions.BetweenLineBounds, dbbOptions.BetweenLineMultiplier, dbbOptions.BetweenLineBinSize,
|
||||
dbbOptions.AngularDifferenceBounds,
|
||||
dbbOptions.Epsilon,
|
||||
dbbOptions.WordSeparator, dbbOptions.LineSeparator,
|
||||
dbbOptions.MaxDegreeOfParallelism);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentException("Options provided must be of type " + nameof(DocstrumBoundingBoxesOptions) + ".", nameof(options));
|
||||
}
|
||||
}
|
||||
|
||||
if (wordsList.Count == 0)
|
||||
/// <summary>
/// Core Docstrum pipeline: drops blank words, estimates the typical spacings,
/// builds text lines and finally groups the lines into structural blocks.
/// See the original paper for more information.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="wlBounds">Angle bounds for words to be considered as neighbours on the same line.</param>
/// <param name="wlMultiplier">Multiplier applied to the estimated within-line distance to obtain the
/// maximum Euclidean distance between two words of the same line.</param>
/// <param name="wlBinSize">The bin size used when building the within-line distances distribution.</param>
/// <param name="blBounds">Angle bounds for words to be considered as neighbours on separate lines.</param>
/// <param name="blMultiplier">Multiplier applied to the estimated between-line distance to obtain the
/// maximum perpendicular distance between two lines of the same block.</param>
/// <param name="blBinSize">The bin size used when building the between-line distances distribution.</param>
/// <param name="angularDifferenceBounds">The angular difference bounds between two lines to be considered
/// in the same block. This defines if two lines are parallel enough.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <param name="wordSeparator">Separator forwarded to the line-building step.</param>
/// <param name="lineSeparator">Separator forwarded to the block-building step.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
private IReadOnlyList<TextBlock> GetBlocks(IReadOnlyList<Word> words,
    AngleBounds wlBounds, double wlMultiplier, int wlBinSize,
    AngleBounds blBounds, double blMultiplier, int blBinSize,
    AngleBounds angularDifferenceBounds,
    double epsilon,
    string wordSeparator, string lineSeparator,
    int maxDegreeOfParallelism)
{
    // Words made only of white space carry no layout information.
    IReadOnlyList<Word> candidates = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();
    if (candidates.Count == 0)
    {
        return EmptyArray<TextBlock>.Instance;
    }

    // Step 1 - spacing estimation (within-line and between-line).
    bool estimated = GetSpacingEstimation(candidates, wlBounds, wlBinSize, blBounds, blBinSize,
        maxDegreeOfParallelism,
        out double withinLineDistance, out double betweenLineDistance);

    if (!estimated)
    {
        // A missing estimate is reported as NaN; fall back to a zero distance.
        if (double.IsNaN(withinLineDistance))
        {
            withinLineDistance = 0;
        }

        if (double.IsNaN(betweenLineDistance))
        {
            betweenLineDistance = 0;
        }
    }

    // Step 2 - text lines.
    double lineThreshold = wlMultiplier * withinLineDistance;
    var textLines = GetLines(candidates, lineThreshold, wlBounds, wordSeparator, maxDegreeOfParallelism).ToArray();

    // Step 3 - structural blocks.
    double blockThreshold = blMultiplier * betweenLineDistance;
    return GetStructuralBlocks(textLines, blockThreshold, angularDifferenceBounds, epsilon, lineSeparator, maxDegreeOfParallelism).ToList();
}
|
||||
|
||||
#region Spacing Estimation
|
||||
/// <summary>
|
||||
/// Estimation of within-line and between-line spacing.
|
||||
/// </summary>
|
||||
/// <returns>False if either 'withinLineDistance' or 'betweenLineDistance' is NaN.</returns>
|
||||
private static bool GetSpacingEstimation(IReadOnlyList<Word> words,
|
||||
AngleBounds wlBounds, int wlBinSize,
|
||||
AngleBounds blBounds, int blBinSize,
|
||||
int maxDegreeOfParallelism,
|
||||
out double withinLineDistance, out double betweenLineDistance)
|
||||
{
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
var withinLineDistList = new ConcurrentBag<double>();
|
||||
var betweenLineDistList = new ConcurrentBag<double>();
|
||||
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
// 1. Estimate within line and between line spacing
|
||||
KdTree<Word> kdTreeWL = new KdTree<Word>(wordsList, w => w.BoundingBox.BottomLeft);
|
||||
KdTree<Word> kdTreeBL = new KdTree<Word>(wordsList, w => w.BoundingBox.TopLeft);
|
||||
KdTree<Word> kdTreeBottomLeft = new KdTree<Word>(words, w => w.BoundingBox.BottomLeft);
|
||||
|
||||
Parallel.For(0, wordsList.Count, parallelOptions, i =>
|
||||
Parallel.For(0, words.Count, parallelOptions, i =>
|
||||
{
|
||||
var word = wordsList[i];
|
||||
var word = words[i];
|
||||
|
||||
// Within-line distance
|
||||
var neighbourWL = kdTreeWL.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomRight, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 0.5));
|
||||
foreach (var n in neighbourWL)
|
||||
// 1.1.1 Find the 2 closest neighbours words to the candidate, using euclidean distance.
|
||||
foreach (var n in kdTreeBottomLeft.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomRight, Distances.Euclidean))
|
||||
{
|
||||
if (withinLine.Contains(Distances.Angle(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft)))
|
||||
// 1.1.2 Check if the neighbour word is within the angle of the candidate
|
||||
if (wlBounds.Contains(AngleWL(word, n.Item1)))
|
||||
{
|
||||
withinLineDistList.Add(Distances.Horizontal(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft));
|
||||
withinLineDistList.Add(Distances.Euclidean(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft));
|
||||
}
|
||||
}
|
||||
|
||||
// Between-line distance
|
||||
var neighbourBL = kdTreeBL.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomLeft, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 50));
|
||||
foreach (var n in neighbourBL)
|
||||
// 1.2.1 Find the 2 closest neighbours words to the candidate, using euclidean distance.
|
||||
foreach (var n in kdTreeBottomLeft.FindNearestNeighbours(word, 2, w => w.BoundingBox.TopLeft, Distances.Euclidean))
|
||||
{
|
||||
if (betweenLine.Contains(Distances.Angle(word.BoundingBox.Centroid, n.Item1.BoundingBox.Centroid)))
|
||||
// 1.2.2 Check if the candidate words is within the angle
|
||||
var angle = AngleBL(word, n.Item1);
|
||||
if (blBounds.Contains(angle))
|
||||
{
|
||||
betweenLineDistList.Add(Distances.Vertical(word.BoundingBox.BottomLeft, n.Item1.BoundingBox.TopLeft));
|
||||
// 1.2.3 Compute the vertical (between-line) distance between the candidate
|
||||
// and the neighbour and add it to the between-line distances list
|
||||
double hypotenuse = Distances.Euclidean(word.BoundingBox.Centroid, n.Item1.BoundingBox.Centroid);
|
||||
|
||||
// Angle is kept within [-90, 90]
|
||||
if (angle > 90)
|
||||
{
|
||||
angle -= 180;
|
||||
}
|
||||
|
||||
var dist = Math.Abs(hypotenuse * Math.Cos((90 - angle) * Math.PI / 180))
|
||||
- word.BoundingBox.Height / 2.0 - n.Item1.BoundingBox.Height / 2.0;
|
||||
|
||||
// The perpendicular distance can be negative because of the subtractions.
|
||||
// Could occur when words are overlapping, we ignore that.
|
||||
if (dist >= 0) betweenLineDistList.Add(dist);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
double? withinLineDistance = GetPeakAverageDistance(withinLineDistList);
|
||||
double? betweenLineDistance = GetPeakAverageDistance(betweenLineDistList);
|
||||
// Compute average peak value of distribution
|
||||
double? withinLinePeak = GetPeakAverageDistance(withinLineDistList, wlBinSize);
|
||||
double? betweenLinePeak = GetPeakAverageDistance(betweenLineDistList, blBinSize);
|
||||
|
||||
if (!withinLineDistance.HasValue || !betweenLineDistance.HasValue)
|
||||
{
|
||||
return new[] { new TextBlock(new[] { new TextLine(wordsList) }) };
|
||||
}
|
||||
withinLineDistance = withinLinePeak ?? double.NaN;
|
||||
betweenLineDistance = betweenLinePeak ?? double.NaN;
|
||||
|
||||
// 2. Find lines of text
|
||||
double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
|
||||
var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine, maxDegreeOfParallelism).ToArray();
|
||||
|
||||
// 3. Find blocks of text
|
||||
double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
|
||||
var blocks = GetLinesGroups(lines, maxDistanceBetweenLine, maxDegreeOfParallelism).ToList();
|
||||
|
||||
// 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
|
||||
for (var b = 0; b < blocks.Count; b++)
|
||||
{
|
||||
if (blocks[b] == null)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Merge all lines (words)
|
||||
blocks[b] = new TextBlock(GetLines(blocks[b].TextLines.SelectMany(l => l.Words).ToList(),
|
||||
double.MaxValue, withinLine, maxDegreeOfParallelism).ToList());
|
||||
|
||||
for (var c = 0; c < blocks.Count; c++)
|
||||
{
|
||||
if (b == c || blocks[c] == null)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (blocks[b].BoundingBox.IntersectsWith(blocks[c].BoundingBox))
|
||||
{
|
||||
// Merge
|
||||
// 1. Merge all words
|
||||
var mergedWords = new List<Word>(blocks[b].TextLines.SelectMany(l => l.Words));
|
||||
mergedWords.AddRange(blocks[c].TextLines.SelectMany(l => l.Words));
|
||||
|
||||
// 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
|
||||
// same block. Filtering will still be done based on angle.
|
||||
// Merge all lines (words) sharing same bottom (baseline)
|
||||
var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList();
|
||||
blocks[b] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList());
|
||||
|
||||
// Remove
|
||||
blocks[c] = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return blocks.Where(b => b != null).ToList();
|
||||
}
|
||||
|
||||
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism)
|
||||
{
|
||||
TextOrientation TextOrientation = words[0].TextOrientation;
|
||||
var groupedIndexes = Clustering.NearestNeighbours(words, 2, Distances.Euclidean,
|
||||
(pivot, candidate) => maxDist,
|
||||
pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
|
||||
pivot => true,
|
||||
(pivot, candidate) => withinLine.Contains(Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft)),
|
||||
maxDegreeOfParallelism).ToList();
|
||||
|
||||
Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
|
||||
if (TextOrientation == TextOrientation.Rotate180)
|
||||
{
|
||||
orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
|
||||
}
|
||||
else if (TextOrientation == TextOrientation.Rotate90)
|
||||
{
|
||||
orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
|
||||
}
|
||||
else if (TextOrientation == TextOrientation.Rotate270)
|
||||
{
|
||||
orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
|
||||
}
|
||||
|
||||
for (var a = 0; a < groupedIndexes.Count; a++)
|
||||
{
|
||||
yield return new TextLine(orderFunc(groupedIndexes[a].Select(i => words[i])));
|
||||
}
|
||||
}
|
||||
|
||||
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism)
|
||||
{
|
||||
/**************************************************************************************************
|
||||
* We want to measure the distance between two lines using the following method:
|
||||
* We check if two lines are overlapping horizontally.
|
||||
* If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area.
|
||||
* We finally compute the Euclidean distance between these two middle points.
|
||||
* If the two lines are not overlapping, the distance is set to the max distance.
|
||||
**************************************************************************************************/
|
||||
|
||||
double euclidianOverlappingMiddleDistance(PdfLine l1, PdfLine l2)
|
||||
{
|
||||
var left = Math.Max(l1.Point1.X, l2.Point1.X);
|
||||
var d = (Math.Min(l1.Point2.X, l2.Point2.X) - left);
|
||||
|
||||
if (d < 0) return double.MaxValue; // not overlapping -> max distance
|
||||
|
||||
return Distances.Euclidean(
|
||||
new PdfPoint(left + d / 2, l1.Point1.Y),
|
||||
new PdfPoint(left + d / 2, l2.Point1.Y));
|
||||
}
|
||||
|
||||
var groupedIndexes = Clustering.NearestNeighbours(lines,
|
||||
euclidianOverlappingMiddleDistance,
|
||||
(pivot, candidate) => maxDist,
|
||||
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
|
||||
candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
|
||||
pivot => true, (pivot, candidate) => true,
|
||||
maxDegreeOfParallelism).ToList();
|
||||
|
||||
for (int a = 0; a < groupedIndexes.Count; a++)
|
||||
{
|
||||
yield return new TextBlock(groupedIndexes[a].Select(i => lines[i]).ToList());
|
||||
}
|
||||
return withinLinePeak.HasValue && betweenLinePeak.HasValue;
|
||||
}
|
||||
|
||||
/// <summary>
/// Get the average distance value of the peak bucket of the histogram.
/// <para>Distances are distributed into consecutive bins of width <paramref name="binLength"/>,
/// starting at 0. The average of the values in the most populated bin is returned; when several
/// bins share the highest count, the one covering the smallest distances wins.</para>
/// </summary>
/// <param name="distances">The set of distances to average. Values must not be negative.</param>
/// <param name="binLength">The width of a histogram bin. Must be strictly positive. It is reduced to
/// the (rounded up) largest distance when it exceeds it.</param>
/// <returns>The average of the peak bin, or <c>null</c> if <paramref name="distances"/> is empty.</returns>
/// <exception cref="ArgumentException">If <paramref name="binLength"/> is not strictly positive.</exception>
/// <exception cref="ArgumentOutOfRangeException">If a negative distance is found.</exception>
private static double? GetPeakAverageDistance(IEnumerable<double> distances, int binLength = 1)
{
    // Materialise once: the source is enumerated several times below.
    List<double> values = distances.ToList();
    if (values.Count == 0)
    {
        return null;
    }

    if (binLength <= 0)
    {
        throw new ArgumentException("DocstrumBoundingBoxes: the bin length must be positive when commputing peak average distance.", nameof(binLength));
    }

    int max = (int)Math.Ceiling(values.Max());
    if (max <= 0)
    {
        // Every distance is 0 (negative values are rejected in the loop below):
        // keep the requested bin length and make sure there is at least one full bin.
        max = binLength;
    }
    else if (binLength > max)
    {
        // A bin wider than the whole range would be meaningless.
        binLength = max;
    }

    // Bins are addressed directly by index; bin k covers [k * binLength, (k + 1) * binLength).
    int binCount = (int)Math.Ceiling(max / (double)binLength) + 1;
    var bins = new List<double>[binCount];
    for (int b = 0; b < binCount; b++)
    {
        bins[b] = new List<double>();
    }

    foreach (double distance in values)
    {
        int index = (int)Math.Floor(distance / binLength);
        if (index < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(distances), "DocstrumBoundingBoxes: Negative distance found while commputing peak average distance.");
        }

        bins[index].Add(distance);
    }

    // Strict comparison keeps the first (lowest) bin in case of a tie.
    List<double> best = bins[0];
    for (int b = 1; b < binCount; b++)
    {
        if (bins[b].Count > best.Count)
        {
            best = bins[b];
        }
    }

    return best.Average();
}
|
||||
#endregion
|
||||
|
||||
#region Text Lines
|
||||
/// <summary>
/// Builds text lines by clustering words whose bottom-right / bottom-left corners are close
/// enough and whose within-line angle (see <see cref="AngleWL"/>) falls inside the given bounds.
/// </summary>
/// <param name="words">The words to group.</param>
/// <param name="maxWLDistance">Maximum distance allowed between two neighbouring words of a line.</param>
/// <param name="withinLine">Angle bounds for two words to be considered on the same line.</param>
/// <param name="wordSeparator">Separator handed to each created <see cref="TextLine"/>.</param>
/// <param name="maxDegreeOfParallelism">Maximum number of concurrent tasks; forwarded to the clustering.</param>
/// <returns>One <see cref="TextLine"/> per cluster, its words sorted in reading order.</returns>
private static IEnumerable<TextLine> GetLines(IReadOnlyList<Word> words, double maxWLDistance, AngleBounds withinLine,
    string wordSeparator, int maxDegreeOfParallelism)
{
    var clusters = Clustering.NearestNeighbours(words,
        2,
        Distances.Euclidean,
        (_, __) => maxWLDistance,
        left => left.BoundingBox.BottomRight,
        right => right.BoundingBox.BottomLeft,
        _ => true,
        (left, right) => withinLine.Contains(AngleWL(left, right)),
        maxDegreeOfParallelism).ToList();

    foreach (var cluster in clusters)
    {
        var orderedWords = cluster.OrderByReadingOrder();
        yield return new TextLine(orderedWords, wordSeparator);
    }
}
|
||||
|
||||
/// <summary>
/// Within-line angle helper: angle from the pivot's bottom right corner to the candidate's
/// bottom left corner, expressed relative to the pivot's own rotation.
/// <para>-90 ≤ θ ≤ 90.</para>
/// </summary>
/// <param name="pivot">The reference word.</param>
/// <param name="candidate">The word tested against the pivot.</param>
private static double AngleWL(Word pivot, Word candidate)
{
    var absolute = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
    var relative = Distances.BoundAngle180(absolute - pivot.BoundingBox.Rotation);

    // Fold the angle into [-90;90] so that overlapping words are handled too.
    if (relative > 90)
    {
        return relative - 180;
    }

    if (relative < -90)
    {
        return relative + 180;
    }

    return relative;
}
|
||||
#endregion
|
||||
|
||||
#region Blocking
|
||||
/// <summary>
/// Groups text lines into structural blocks.
/// <para>Each line is represented by the bottom edge of its bounding box when it acts as pivot and by
/// the top edge when it acts as candidate. The distance between two edges is given by
/// <see cref="PerpendicularOverlappingDistance"/>: it is infinite when the edges do not overlap or are
/// not parallel enough, so such lines are never clustered together.</para>
/// </summary>
/// <param name="lines">The text lines to group.</param>
/// <param name="maxBLDistance">Maximum perpendicular distance between two lines of the same block.</param>
/// <param name="angularDifference">Angle bounds within which two lines are considered parallel.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <param name="lineSeparator">Separator handed to each created <see cref="TextBlock"/>.</param>
/// <param name="maxDegreeOfParallelism">Maximum number of concurrent tasks; forwarded to the clustering.</param>
/// <returns>One <see cref="TextBlock"/> per cluster, its lines sorted in reading order.</returns>
private static IEnumerable<TextBlock> GetStructuralBlocks(IReadOnlyList<TextLine> lines,
    double maxBLDistance, AngleBounds angularDifference, double epsilon, string lineSeparator, int maxDegreeOfParallelism)
{
    var clusters = Clustering.NearestNeighbours(
        lines,
        (first, second) => PerpendicularOverlappingDistance(first, second, angularDifference, epsilon),
        (_, __) => maxBLDistance,
        line => new PdfLine(line.BoundingBox.BottomLeft, line.BoundingBox.BottomRight),
        line => new PdfLine(line.BoundingBox.TopLeft, line.BoundingBox.TopRight),
        _ => true,
        (_, __) => true,
        maxDegreeOfParallelism).ToList();

    foreach (var cluster in clusters)
    {
        var orderedLines = cluster.OrderByReadingOrder();
        yield return new TextBlock(orderedLines, lineSeparator);
    }
}
|
||||
|
||||
/// <summary>
/// Perpendicular overlapping distance between two lines.
/// <para>Returns the absolute perpendicular distance computed by
/// <see cref="GetStructuralBlockingParameters"/> when the two lines overlap and their angular
/// difference, folded into [-90;90], is inside <paramref name="angularDifferenceBounds"/>.
/// Returns <see cref="double.PositiveInfinity"/> otherwise.</para>
/// </summary>
/// <param name="line1">The first line.</param>
/// <param name="line2">The second line.</param>
/// <param name="angularDifferenceBounds">Angle bounds within which the lines are considered parallel.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
private static double PerpendicularOverlappingDistance(PdfLine line1, PdfLine line2, AngleBounds angularDifferenceBounds, double epsilon)
{
    bool overlapped = GetStructuralBlockingParameters(line1, line2, epsilon,
        out double angle, out _, out double signedDistance);

    if (!overlapped)
    {
        // Lines that do not overlap can never belong to the same block.
        return double.PositiveInfinity;
    }

    // Fold the angle into [-90;90].
    if (angle > 90)
    {
        angle -= 180;
    }
    else if (angle < -90)
    {
        angle += 180;
    }

    // Lines that are not parallel enough are excluded as well.
    return angularDifferenceBounds.Contains(angle)
        ? Math.Abs(signedDistance)
        : double.PositiveInfinity;
}
|
||||
|
||||
/// <summary>
/// Get the structural blocking parameters of line <paramref name="j"/> relative to line <paramref name="i"/>:
/// angular difference, normalised overlap and signed perpendicular distance.
/// </summary>
/// <param name="i">The reference line.</param>
/// <param name="j">The line compared to <paramref name="i"/>.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <param name="angularDifference">The angle between the 2 lines.<para>-180 ≤ θ ≤ 180</para></param>
/// <param name="normalisedOverlap">Overlap of segment i onto j, divided by the length of j. Positive value if
/// overlapped, negative value if nonoverlapped. <see cref="double.NaN"/> when the end points of i cannot be
/// translated onto j.</param>
/// <param name="perpendicularDistance">Signed perpendicular distance. <see cref="double.NaN"/> when the end
/// points of i cannot be translated onto j.</param>
/// <returns>Return true if overlapped, false if nonoverlapped.</returns>
public static bool GetStructuralBlockingParameters(PdfLine i, PdfLine j, double epsilon,
    out double angularDifference, out double normalisedOverlap, out double perpendicularDistance)
{
    // Identical lines: fully overlapped, no angle, no distance.
    if (AlmostEquals(i, j, epsilon))
    {
        angularDifference = 0;
        normalisedOverlap = 1;
        perpendicularDistance = 0;
        return true;
    }

    double dXi = i.Point2.X - i.Point1.X;
    double dYi = i.Point2.Y - i.Point1.Y;
    double dXj = j.Point2.X - j.Point1.X;
    double dYj = j.Point2.Y - j.Point1.Y;

    angularDifference = Distances.BoundAngle180((Math.Atan2(dYj, dXj) - Math.Atan2(dYi, dXi)) * 180 / Math.PI);

    // Translate both end points of i onto j.
    PdfPoint? translatedStart = GetTranslatedPoint(i.Point1.X, i.Point1.Y, j.Point1.X, j.Point1.Y, dXi, dYi, dXj, dYj, epsilon);
    PdfPoint? translatedEnd = GetTranslatedPoint(i.Point2.X, i.Point2.Y, j.Point2.X, j.Point2.Y, dXi, dYi, dXj, dYj, epsilon);

    if (!translatedStart.HasValue || !translatedEnd.HasValue)
    {
        // Might happen because lines are perpendicular or have too small lengths.
        normalisedOverlap = double.NaN;
        perpendicularDistance = double.NaN;
        return false;
    }

    PdfPoint startOnJ = translatedStart.Value;
    PdfPoint endOnJ = translatedEnd.Value;

    // Sort the four points along j; the two middle ones delimit the shared (or separating) span.
    var aligned = new[] { j.Point1, j.Point2, startOnJ, endOnJ };
    if (dXj != 0)
    {
        aligned = aligned.OrderBy(p => p.X).ThenBy(p => p.Y).ToArray();
    }
    else if (dYj != 0)
    {
        aligned = aligned.OrderBy(p => p.Y).ToArray();
    }

    PdfPoint innerFirst = aligned[1];
    PdfPoint innerSecond = aligned[2];

    // When overlapped, both middle points belong to j and to the translated segment.
    bool overlap = PointInLine(j.Point1, j.Point2, innerFirst)
                   && PointInLine(j.Point1, j.Point2, innerSecond)
                   && PointInLine(startOnJ, endOnJ, innerFirst)
                   && PointInLine(startOnJ, endOnJ, innerSecond);

    double spanLength = Distances.Euclidean(innerFirst, innerSecond);
    normalisedOverlap = (overlap ? spanLength : -spanLength) / j.Length;

    // Middle of the span, from which the distance to i is measured.
    double xMiddle = (innerFirst.X + innerSecond.X) / 2.0;
    double yMiddle = (innerFirst.Y + innerSecond.Y) / 2.0;

    if (dXi.AlmostEqualsToZero(epsilon))
    {
        // i is vertical: the distance is the horizontal offset.
        perpendicularDistance = xMiddle - i.Point1.X;
    }
    else if (dYi.AlmostEqualsToZero(epsilon))
    {
        // i is horizontal: the distance is the vertical offset.
        perpendicularDistance = yMiddle - i.Point1.Y;
    }
    else
    {
        perpendicularDistance = ((xMiddle - i.Point1.X) - (yMiddle - i.Point1.Y) * dXi / dYi) / Math.Sqrt(dXi * dXi / (dYi * dYi) + 1);
    }

    return overlap;
}
|
||||
|
||||
/// <summary>
/// Translates a point of line i onto line j: returns the point of (the line supporting) j that is
/// reached from (<paramref name="xPi"/>, <paramref name="yPi"/>) by moving perpendicularly to i.
/// </summary>
/// <param name="xPi">X coordinate of the point of line i to translate.</param>
/// <param name="yPi">Y coordinate of the point of line i to translate.</param>
/// <param name="xPj">X coordinate of a point of line j.</param>
/// <param name="yPj">Y coordinate of a point of line j.</param>
/// <param name="dXi">X component of the direction of line i.</param>
/// <param name="dYi">Y component of the direction of line i.</param>
/// <param name="dXj">X component of the direction of line j.</param>
/// <param name="dYj">Y component of the direction of line j.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <returns>The translated point, or <c>null</c> when the two directions are (almost) perpendicular.</returns>
private static PdfPoint? GetTranslatedPoint(double xPi, double yPi, double xPj, double yPj, double dXi, double dYi, double dXj, double dYj, double epsilon)
{
    double dYidYj = dYi * dYj;
    double dXidXj = dXi * dXj;
    double denominator = dYidYj + dXidXj;
    if (denominator.AlmostEqualsToZero(epsilon))
    {
        // The denominator is 0 when translating points, meaning the lines are perpendicular.
        return null;
    }

    double xTj = (xPi * dXidXj + xPj * dYidYj + dXj * dYi * (yPi - yPj)) / denominator;
    double yTj;

    if (Math.Abs(dXj) > epsilon)
    {
        // j has a usable slope, whatever its direction (left-to-right or right-to-left).
        yTj = dYj / dXj * (xTj - xPj) + yPj;
    }
    else
    {
        // j is (almost) vertical: its slope is undefined, so locate the point with the
        // parametric form Tj = Pj + t * dj, where t solves (Tj - Pi) . di = 0.
        double t = ((xPi - xPj) * dXi + (yPi - yPj) * dYi) / denominator;
        yTj = yPj + t * dYj;
    }

    return new PdfPoint(xTj, yTj);
}
|
||||
|
||||
/// <summary>
/// Helper function to check if the point lies between the two end points of the line.
/// <para>The three points are assumed to be aligned; this is not verified.</para>
/// </summary>
/// <param name="pl1">Line's first point.</param>
/// <param name="pl2">Line's second point.</param>
/// <param name="point">The point to check.</param>
private static bool PointInLine(PdfPoint pl1, PdfPoint pl2, PdfPoint point)
{
    // Vector from the first end point to the tested point.
    double toPointX = point.X - pl1.X;
    double toPointY = point.Y - pl1.Y;

    // Vector from the first end point to the second one.
    double segmentX = pl2.X - pl1.X;
    double segmentY = pl2.Y - pl1.Y;

    // The point is inside when its projection is non-negative and
    // not larger than the squared length of the segment.
    double projection = toPointX * segmentX + toPointY * segmentY;
    double squaredLength = segmentX * segmentX + segmentY * segmentY;

    return projection >= 0 && projection <= squaredLength;
}
|
||||
|
||||
/// <summary>
/// Helper function to check if 2 lines are equal, i.e. if the coordinates of their respective
/// first and second points all match within <paramref name="epsilon"/>.
/// </summary>
/// <param name="line1">The first line.</param>
/// <param name="line2">The second line.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
private static bool AlmostEquals(PdfLine line1, PdfLine line2, double epsilon)
{
    if (!(line1.Point1.X - line2.Point1.X).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    if (!(line1.Point1.Y - line2.Point1.Y).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    if (!(line1.Point2.X - line2.Point2.X).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    return (line1.Point2.Y - line2.Point2.Y).AlmostEqualsToZero(epsilon);
}
|
||||
|
||||
/// <summary>
/// Between-line angle helper: angle from the pivot's centroid to the candidate's centroid,
/// expressed relative to the pivot's own rotation.
/// <para>0 ≤ θ ≤ 180.</para>
/// </summary>
/// <param name="pivot">The reference word.</param>
/// <param name="candidate">The word tested against the pivot.</param>
private static double AngleBL(Word pivot, Word candidate)
{
    var absolute = Distances.Angle(pivot.BoundingBox.Centroid, candidate.BoundingBox.Centroid);
    var relative = Distances.BoundAngle180(absolute - pivot.BoundingBox.Rotation);

    // Negative angles are shifted by 180 so that the bounds check works on [0, 180].
    return relative < 0 ? relative + 180 : relative;
}
|
||||
#endregion
|
||||
|
||||
/// <summary>
|
||||
/// The bounds for the angle between two words for them to have a certain type of relationship.
|
||||
@@ -314,6 +539,11 @@
|
||||
/// </summary>
|
||||
public AngleBounds(double lowerBound, double upperBound)
|
||||
{
|
||||
if (lowerBound >= upperBound)
|
||||
{
|
||||
throw new ArgumentException("The lower bound should be smaller than the upper bound.");
|
||||
}
|
||||
|
||||
Lower = lowerBound;
|
||||
Upper = upperBound;
|
||||
}
|
||||
@@ -326,5 +556,65 @@
|
||||
return angle >= Lower && angle <= Upper;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Options of the Docstrum bounding boxes page segmenter.
/// </summary>
public class DocstrumBoundingBoxesOptions : PageSegmenterOptions
{
    /// <summary>
    /// Tolerance used when two values are tested for equality.
    /// <para>Default value is 1e-3.</para>
    /// </summary>
    public double Epsilon { get; set; } = 1e-3;

    /// <summary>
    /// Range of angles within which two words are considered neighbours on the same line.
    /// <para>Default value is -30 ≤ θ ≤ 30.</para>
    /// </summary>
    public AngleBounds WithinLineBounds { get; set; } = new AngleBounds(-30, 30);

    /// <summary>
    /// Factor applied to the within-line distance found by the analysis to obtain the
    /// maximum Euclidean distance between two words when building lines.
    /// <para>Default value is 3.</para>
    /// </summary>
    public double WithinLineMultiplier { get; set; } = 3.0;

    /// <summary>
    /// Bin size of the histogram built from the within-line distances.
    /// <para>Default value is 10.</para>
    /// </summary>
    public int WithinLineBinSize { get; set; } = 10;

    /// <summary>
    /// Range of angles within which two words are considered neighbours on separate lines.
    /// <para>Default value is 45 ≤ θ ≤ 135.</para>
    /// </summary>
    public AngleBounds BetweenLineBounds { get; set; } = new AngleBounds(45, 135);

    /// <summary>
    /// Factor applied to the between-line distance found by the analysis to obtain the
    /// maximum perpendicular distance between two text lines when building blocks.
    /// <para>Default value is 1.3.</para>
    /// </summary>
    public double BetweenLineMultiplier { get; set; } = 1.3;

    /// <summary>
    /// Bin size of the histogram built from the between-line distances.
    /// <para>Default value is 10.</para>
    /// </summary>
    public int BetweenLineBinSize { get; set; } = 10;

    /// <summary>
    /// Range of angular differences within which two lines are parallel enough
    /// to be placed in the same block.
    /// <para>Default value is -30 ≤ θ ≤ 30.</para>
    /// </summary>
    public AngleBounds AngularDifferenceBounds { get; set; } = new AngleBounds(-30, 30);
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user