completely rework DocstrumBoundingBoxes, now handle rotated text

This commit is contained in:
BobLd
2020-05-23 20:11:29 +01:00
committed by Eliot Jones
parent a16f377d5a
commit f883b56e72

View File

@@ -2,17 +2,17 @@
{
using Content;
using Core;
using Geometry;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
/// <inheritdoc />
/// <summary>
/// The Document Spectrum (Docstrum) algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
/// clustering of connected components extracted from the document.
/// The Document Spectrum (Docstrum) algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
/// clustering of connected components extracted from the document.
/// This implementation leverages bounding boxes and does not exactly replicate the original algorithm.
/// <para>See 'The document spectrum for page layout analysis.' by L. O'Gorman.</para>
/// </summary>
@@ -25,274 +25,499 @@
/// <inheritdoc />
/// <summary>
/// Get the blocks.
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
/// Get the blocks using default options values.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{
return GetBlocks(words, -1);
return GetBlocks(words, new DocstrumBoundingBoxesOptions());
}
/// <inheritdoc />
/// <summary>
/// Get the blocks.
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
/// Get the blocks using options values.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="options">The <see cref="DocstrumBoundingBoxesOptions"/> to use.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, int maxDegreeOfParallelism)
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
{
return GetBlocks(words, new AngleBounds(-30, 30), new AngleBounds(-135, -45), 1.3, maxDegreeOfParallelism);
}
/// <summary>
/// Get the blocks. See original paper for more information.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
/// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
/// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
/// text lines for blocking. Maximum distance will be this number times the between-line
/// distance found by the analysis.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
AngleBounds betweenLine, double betweenLineMultiplier)
{
return GetBlocks(words, withinLine, betweenLine, betweenLineMultiplier, -1);
}
/// <summary>
/// Get the blocks. See original paper for more information.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
/// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
/// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
/// text lines for blocking. Maximum distance will be this number times the between-line
/// distance found by the analysis.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
AngleBounds betweenLine, double betweenLineMultiplier, int maxDegreeOfParallelism)
{
if (words == null)
if (options is DocstrumBoundingBoxesOptions dbbOptions)
{
return EmptyArray<TextBlock>.Instance;
}
var wordsList = new List<Word>();
foreach (var word in words)
{
if (string.IsNullOrWhiteSpace(word.Text))
if (words?.Any() != true)
{
continue;
return EmptyArray<TextBlock>.Instance;
}
wordsList.Add(word);
return GetBlocks(words.ToList(),
dbbOptions.WithinLineBounds, dbbOptions.WithinLineMultiplier, dbbOptions.WithinLineBinSize,
dbbOptions.BetweenLineBounds, dbbOptions.BetweenLineMultiplier, dbbOptions.BetweenLineBinSize,
dbbOptions.AngularDifferenceBounds,
dbbOptions.Epsilon,
dbbOptions.WordSeparator, dbbOptions.LineSeparator,
dbbOptions.MaxDegreeOfParallelism);
}
else
{
throw new ArgumentException("Options provided must be of type " + nameof(DocstrumBoundingBoxesOptions) + ".", nameof(options));
}
}
if (wordsList.Count == 0)
/// <summary>
/// Segments the words into blocks in three stages: spacing estimation, text line
/// determination and structural block determination. See the original paper for more information.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s. Words whose text is null or white space are discarded first.</param>
/// <param name="wlBounds">Angle bounds for words to be considered as neighbours on the same line.</param>
/// <param name="wlMultiplier">Multiplier that gives the maximum Euclidean distance between words for building lines.
/// The maximum distance is this number times the within-line distance found by the analysis.</param>
/// <param name="wlBinSize">The bin size used when building the within-line distances distribution.</param>
/// <param name="blBounds">Angle bounds for words to be considered as neighbours on separate lines.</param>
/// <param name="blMultiplier">Multiplier that gives the maximum perpendicular distance between
/// text lines for blocking. The maximum distance is this number times the between-line
/// distance found by the analysis.</param>
/// <param name="blBinSize">The bin size used when building the between-line distances distribution.</param>
/// <param name="angularDifferenceBounds">The angular difference bounds between two lines to be considered in the same block.
/// This defines if two lines are parallel enough.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <param name="wordSeparator">Separator handed to every <see cref="TextLine"/> built from the words.</param>
/// <param name="lineSeparator">Separator handed to every <see cref="TextBlock"/> built from the lines.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
private IReadOnlyList<TextBlock> GetBlocks(IReadOnlyList<Word> words,
    AngleBounds wlBounds, double wlMultiplier, int wlBinSize,
    AngleBounds blBounds, double blMultiplier, int blBinSize,
    AngleBounds angularDifferenceBounds,
    double epsilon,
    string wordSeparator, string lineSeparator,
    int maxDegreeOfParallelism)
{
    // Blank words are dropped before any analysis is run.
    IReadOnlyList<Word> candidates = words.Where(word => !string.IsNullOrWhiteSpace(word.Text)).ToList();
    if (candidates.Count == 0)
    {
        return EmptyArray<TextBlock>.Instance;
    }

    // Stage 1 - estimate the within-line and between-line spacing.
    bool spacingFound = GetSpacingEstimation(candidates, wlBounds, wlBinSize, blBounds, blBinSize,
        maxDegreeOfParallelism,
        out double withinLineDistance, out double betweenLineDistance);

    if (!spacingFound)
    {
        // A missing estimate is reported as NaN; fall back to a distance of 0 for it.
        withinLineDistance = double.IsNaN(withinLineDistance) ? 0 : withinLineDistance;
        betweenLineDistance = double.IsNaN(betweenLineDistance) ? 0 : betweenLineDistance;
    }

    // Stage 2 - build the text lines.
    double lineBuildingThreshold = wlMultiplier * withinLineDistance;
    var textLines = GetLines(candidates, lineBuildingThreshold, wlBounds, wordSeparator, maxDegreeOfParallelism).ToArray();

    // Stage 3 - group the text lines into structural blocks.
    double blockBuildingThreshold = blMultiplier * betweenLineDistance;
    var blocks = GetStructuralBlocks(textLines, blockBuildingThreshold, angularDifferenceBounds, epsilon, lineSeparator, maxDegreeOfParallelism);
    return blocks.ToList();
}
#region Spacing Estimation
/// <summary>
/// Estimation of within-line and between-line spacing.
/// </summary>
/// <returns>False if either 'withinLineDistance' or 'betweenLineDistance' is NaN.</returns>
private static bool GetSpacingEstimation(IReadOnlyList<Word> words,
AngleBounds wlBounds, int wlBinSize,
AngleBounds blBounds, int blBinSize,
int maxDegreeOfParallelism,
out double withinLineDistance, out double betweenLineDistance)
{
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
var withinLineDistList = new ConcurrentBag<double>();
var betweenLineDistList = new ConcurrentBag<double>();
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
// 1. Estimate within line and between line spacing
KdTree<Word> kdTreeWL = new KdTree<Word>(wordsList, w => w.BoundingBox.BottomLeft);
KdTree<Word> kdTreeBL = new KdTree<Word>(wordsList, w => w.BoundingBox.TopLeft);
KdTree<Word> kdTreeBottomLeft = new KdTree<Word>(words, w => w.BoundingBox.BottomLeft);
Parallel.For(0, wordsList.Count, parallelOptions, i =>
Parallel.For(0, words.Count, parallelOptions, i =>
{
var word = wordsList[i];
var word = words[i];
// Within-line distance
var neighbourWL = kdTreeWL.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomRight, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 0.5));
foreach (var n in neighbourWL)
// 1.1.1 Find the 2 closest neighbours words to the candidate, using euclidean distance.
foreach (var n in kdTreeBottomLeft.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomRight, Distances.Euclidean))
{
if (withinLine.Contains(Distances.Angle(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft)))
// 1.1.2 Check if the neighbour word is within the angle of the candidate
if (wlBounds.Contains(AngleWL(word, n.Item1)))
{
withinLineDistList.Add(Distances.Horizontal(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft));
withinLineDistList.Add(Distances.Euclidean(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft));
}
}
// Between-line distance
var neighbourBL = kdTreeBL.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomLeft, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 50));
foreach (var n in neighbourBL)
// 1.2.1 Find the 2 closest neighbours words to the candidate, using euclidean distance.
foreach (var n in kdTreeBottomLeft.FindNearestNeighbours(word, 2, w => w.BoundingBox.TopLeft, Distances.Euclidean))
{
if (betweenLine.Contains(Distances.Angle(word.BoundingBox.Centroid, n.Item1.BoundingBox.Centroid)))
// 1.2.2 Check if the candidate words is within the angle
var angle = AngleBL(word, n.Item1);
if (blBounds.Contains(angle))
{
betweenLineDistList.Add(Distances.Vertical(word.BoundingBox.BottomLeft, n.Item1.BoundingBox.TopLeft));
// 1.2.3 Compute the vertical (between-line) distance between the candidate
// and the neighbour and add it to the between-line distances list
double hypotenuse = Distances.Euclidean(word.BoundingBox.Centroid, n.Item1.BoundingBox.Centroid);
// Angle is kept within [-90, 90]
if (angle > 90)
{
angle -= 180;
}
var dist = Math.Abs(hypotenuse * Math.Cos((90 - angle) * Math.PI / 180))
- word.BoundingBox.Height / 2.0 - n.Item1.BoundingBox.Height / 2.0;
// The perpendicular distance can be negative because of the subtractions.
// Could occur when words are overlapping, we ignore that.
if (dist >= 0) betweenLineDistList.Add(dist);
}
}
});
double? withinLineDistance = GetPeakAverageDistance(withinLineDistList);
double? betweenLineDistance = GetPeakAverageDistance(betweenLineDistList);
// Compute average peak value of distribution
double? withinLinePeak = GetPeakAverageDistance(withinLineDistList, wlBinSize);
double? betweenLinePeak = GetPeakAverageDistance(betweenLineDistList, blBinSize);
if (!withinLineDistance.HasValue || !betweenLineDistance.HasValue)
{
return new[] { new TextBlock(new[] { new TextLine(wordsList) }) };
}
withinLineDistance = withinLinePeak ?? double.NaN;
betweenLineDistance = betweenLinePeak ?? double.NaN;
// 2. Find lines of text
double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine, maxDegreeOfParallelism).ToArray();
// 3. Find blocks of text
double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
var blocks = GetLinesGroups(lines, maxDistanceBetweenLine, maxDegreeOfParallelism).ToList();
// 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
for (var b = 0; b < blocks.Count; b++)
{
if (blocks[b] == null)
{
continue;
}
// Merge all lines (words)
blocks[b] = new TextBlock(GetLines(blocks[b].TextLines.SelectMany(l => l.Words).ToList(),
double.MaxValue, withinLine, maxDegreeOfParallelism).ToList());
for (var c = 0; c < blocks.Count; c++)
{
if (b == c || blocks[c] == null)
{
continue;
}
if (blocks[b].BoundingBox.IntersectsWith(blocks[c].BoundingBox))
{
// Merge
// 1. Merge all words
var mergedWords = new List<Word>(blocks[b].TextLines.SelectMany(l => l.Words));
mergedWords.AddRange(blocks[c].TextLines.SelectMany(l => l.Words));
// 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
// same block. Filtering will still be done based on angle.
// Merge all lines (words) sharing same bottom (baseline)
var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList();
blocks[b] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList());
// Remove
blocks[c] = null;
}
}
}
return blocks.Where(b => b != null).ToList();
}
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism)
{
TextOrientation TextOrientation = words[0].TextOrientation;
var groupedIndexes = Clustering.NearestNeighbours(words, 2, Distances.Euclidean,
(pivot, candidate) => maxDist,
pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
pivot => true,
(pivot, candidate) => withinLine.Contains(Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft)),
maxDegreeOfParallelism).ToList();
Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
if (TextOrientation == TextOrientation.Rotate180)
{
orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
}
else if (TextOrientation == TextOrientation.Rotate90)
{
orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
}
else if (TextOrientation == TextOrientation.Rotate270)
{
orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
}
for (var a = 0; a < groupedIndexes.Count; a++)
{
yield return new TextLine(orderFunc(groupedIndexes[a].Select(i => words[i])));
}
}
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism)
{
/**************************************************************************************************
* We want to measure the distance between two lines using the following method:
* We check if two lines are overlapping horizontally.
* If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area.
* We finally compute the Euclidean distance between these two middle points.
* If the two lines are not overlapping, the distance is set to the max distance.
**************************************************************************************************/
double euclidianOverlappingMiddleDistance(PdfLine l1, PdfLine l2)
{
var left = Math.Max(l1.Point1.X, l2.Point1.X);
var d = (Math.Min(l1.Point2.X, l2.Point2.X) - left);
if (d < 0) return double.MaxValue; // not overlapping -> max distance
return Distances.Euclidean(
new PdfPoint(left + d / 2, l1.Point1.Y),
new PdfPoint(left + d / 2, l2.Point1.Y));
}
var groupedIndexes = Clustering.NearestNeighbours(lines,
euclidianOverlappingMiddleDistance,
(pivot, candidate) => maxDist,
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
pivot => true, (pivot, candidate) => true,
maxDegreeOfParallelism).ToList();
for (int a = 0; a < groupedIndexes.Count; a++)
{
yield return new TextBlock(groupedIndexes[a].Select(i => lines[i]).ToList());
}
return withinLinePeak.HasValue && betweenLinePeak.HasValue;
}
/// <summary>
/// Get the average distance value of the peak bucket of the histogram.
/// </summary>
/// <param name="distances">The set of distances to average.</param>
private static double? GetPeakAverageDistance(IEnumerable<double> distances)
/// <param name="binLength"></param>
private static double? GetPeakAverageDistance(IEnumerable<double> distances, int binLength = 1)
{
var buckets = new Dictionary<int, List<double>>();
if (!distances.Any())
{
return null;
}
if (binLength <= 0)
{
throw new ArgumentException("DocstrumBoundingBoxes: the bin length must be positive when commputing peak average distance.", nameof(binLength));
}
var max = (int)Math.Ceiling(distances.Max());
if (max == 0)
{
max = binLength;
}
else
{
binLength = binLength > max ? max : binLength;
}
var bins = Enumerable.Range(0, (int)Math.Ceiling(max / (double)binLength) + 1)
.Select(x => x * binLength)
.ToDictionary(x => x, _ => new List<double>());
foreach (var distance in distances)
{
var floor = (int)distance;
if (buckets.ContainsKey(floor))
int bin = (int)Math.Floor(distance / binLength);
if (bin < 0)
{
buckets[floor].Add(distance);
}
else
{
buckets[floor] = new List<double> { distance };
throw new ArgumentOutOfRangeException(nameof(bin), "DocstrumBoundingBoxes: Negative distance found while commputing peak average distance.");
}
bins[bins.Keys.ElementAt(bin)].Add(distance);
}
var best = default(List<double>);
foreach (var bucket in buckets)
foreach (var bin in bins)
{
if (best == null || bucket.Value.Count > best.Count)
if (best == null || bin.Value.Count > best.Count)
{
best = bucket.Value;
best = bin.Value;
}
}
return best?.Average();
}
#endregion
#region Text Lines
/// <summary>
/// Clusters the words into text lines. A pivot word's bottom right point is compared to a
/// candidate word's bottom left point using the Euclidean distance, limited to
/// <paramref name="maxWLDistance"/>, and candidates are only accepted when the within-line
/// angle between the two words is inside <paramref name="withinLine"/>.
/// </summary>
/// <param name="words">The words to group.</param>
/// <param name="maxWLDistance">The maximum distance passed to the clustering for every pivot/candidate pair.</param>
/// <param name="withinLine">Angle bounds for words to be considered as neighbours on the same line.</param>
/// <param name="wordSeparator">Separator handed to every <see cref="TextLine"/> created.</param>
/// <param name="maxDegreeOfParallelism">The maximum degree of parallelism forwarded to the clustering.</param>
/// <returns>One <see cref="TextLine"/> per cluster, lazily produced.</returns>
private static IEnumerable<TextLine> GetLines(IReadOnlyList<Word> words, double maxWLDistance, AngleBounds withinLine,
    string wordSeparator, int maxDegreeOfParallelism)
{
    var clusters = Clustering.NearestNeighbours(
        words,
        2,
        Distances.Euclidean,
        (pivot, candidate) => maxWLDistance,
        pivot => pivot.BoundingBox.BottomRight,
        candidate => candidate.BoundingBox.BottomLeft,
        pivot => true,
        (pivot, candidate) => withinLine.Contains(AngleWL(pivot, candidate)),
        maxDegreeOfParallelism).ToList();

    for (int index = 0; index < clusters.Count; index++)
    {
        // Words of a cluster are put in reading order before the line is created.
        yield return new TextLine(clusters[index].OrderByReadingOrder(), wordSeparator);
    }
}
/// <summary>
/// Computes the within-line angle between the pivot's bottom right point and the
/// candidate's bottom left point, relative to the pivot's rotation.
/// <para>-90 ≤ θ ≤ 90.</para>
/// </summary>
/// <param name="pivot">The word the angle is measured from; its rotation is subtracted from the raw angle.</param>
/// <param name="candidate">The word the angle is measured to.</param>
private static double AngleWL(Word pivot, Word candidate)
{
    double rawAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
    double relativeAngle = Distances.BoundAngle180(rawAngle - pivot.BoundingBox.Rotation);

    // Fold the angle into [-90;90] degree to handle overlapping words.
    if (relativeAngle > 90)
    {
        return relativeAngle - 180;
    }

    if (relativeAngle < -90)
    {
        return relativeAngle + 180;
    }

    return relativeAngle;
}
#endregion
#region Blocking
/// <summary>
/// Clusters text lines into structural blocks. The bottom edge of a pivot line is compared
/// to the top edge of a candidate line using the perpendicular overlapping distance,
/// limited to <paramref name="maxBLDistance"/>.
/// </summary>
/// <param name="lines">The text lines to group.</param>
/// <param name="maxBLDistance">The maximum distance passed to the clustering for every pivot/candidate pair.</param>
/// <param name="angularDifference">The angular difference bounds between two lines to be considered parallel enough.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <param name="lineSeparator">Separator handed to every <see cref="TextBlock"/> created.</param>
/// <param name="maxDegreeOfParallelism">The maximum degree of parallelism forwarded to the clustering.</param>
/// <returns>One <see cref="TextBlock"/> per cluster, lazily produced.</returns>
private static IEnumerable<TextBlock> GetStructuralBlocks(IReadOnlyList<TextLine> lines,
    double maxBLDistance, AngleBounds angularDifference, double epsilon, string lineSeparator, int maxDegreeOfParallelism)
{
    /*
     * Distance between two lines:
     *  - the lines must overlap, and the perpendicular distance between them is measured;
     *  - the angle between the lines must be within 'angularDifference';
     *  - when the lines do not overlap, or the angle is too wide, the distance is infinite.
     *
     * Two text lines that are approximately parallel, close in perpendicular distance, and
     * that either overlap to some specified degree or are separated by only a small parallel
     * distance meet the criteria to belong to the same structural block.
     */
    var clusters = Clustering.NearestNeighbours(
        lines,
        (first, second) => PerpendicularOverlappingDistance(first, second, angularDifference, epsilon),
        (pivot, candidate) => maxBLDistance,
        pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
        candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
        pivot => true,
        (pivot, candidate) => true,
        maxDegreeOfParallelism).ToList();

    for (int index = 0; index < clusters.Count; index++)
    {
        // Lines of a cluster are put in reading order before the block is created.
        yield return new TextBlock(clusters[index].OrderByReadingOrder(), lineSeparator);
    }
}
/// <summary>
/// Perpendicular overlapping distance between two lines.
/// <para>The distance is infinite when the lines are reported as nonoverlapped, or when the
/// angle between them (folded into [-90;90]) is outside <paramref name="angularDifferenceBounds"/>.
/// Otherwise it is the absolute value of the signed perpendicular distance.</para>
/// </summary>
/// <param name="line1">The first line.</param>
/// <param name="line2">The second line.</param>
/// <param name="angularDifferenceBounds">The bounds the angle between the two lines must be within.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <returns>The absolute perpendicular distance, or <see cref="double.PositiveInfinity"/> when the lines are rejected.</returns>
private static double PerpendicularOverlappingDistance(PdfLine line1, PdfLine line2, AngleBounds angularDifferenceBounds, double epsilon)
{
    bool overlapped = GetStructuralBlockingParameters(line1, line2, epsilon,
        out double angle, out _, out double signedDistance);

    if (!overlapped)
    {
        // nonoverlapped
        return double.PositiveInfinity;
    }

    // Fold the angle into [-90;90].
    double foldedAngle = angle;
    if (angle > 90)
    {
        foldedAngle = angle - 180;
    }
    else if (angle < -90)
    {
        foldedAngle = angle + 180;
    }

    // Lines that are not parallel enough are excluded.
    return angularDifferenceBounds.Contains(foldedAngle)
        ? Math.Abs(signedDistance)
        : double.PositiveInfinity;
}
/// <summary>
/// Get the structural blocking parameters of two lines: the angle between them, the
/// normalised overlap of segment <paramref name="i"/> onto <paramref name="j"/> and the
/// signed perpendicular distance.
/// <para>Lines that are almost equal (all four coordinates within <paramref name="epsilon"/>)
/// are reported as fully overlapped, with an angle of 0 and a distance of 0.</para>
/// </summary>
/// <param name="i">The line whose end points are translated onto <paramref name="j"/>.</param>
/// <param name="j">The line receiving the translated end points of <paramref name="i"/>.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <param name="angularDifference">The angle between the 2 lines.<para>-180 ≤ θ ≤ 180</para></param>
/// <param name="normalisedOverlap">Overlap of segment i onto j. Positive value if overlapped, negative value if nonoverlapped.
/// NaN when the end points could not be translated.<para>[-1, 1]?</para></param>
/// <param name="perpendicularDistance">Signed perpendicular distance. NaN when the end points could not be translated.</param>
/// <returns>Return true if overlapped, false if nonoverlapped.</returns>
public static bool GetStructuralBlockingParameters(PdfLine i, PdfLine j, double epsilon,
out double angularDifference, out double normalisedOverlap, out double perpendicularDistance)
{
// Shortcut: the two lines are considered to be the same line.
if (AlmostEquals(i, j, epsilon))
{
angularDifference = 0;
normalisedOverlap = 1;
perpendicularDistance = 0;
return true;
}
// Direction vectors of both lines.
double dXi = i.Point2.X - i.Point1.X;
double dYi = i.Point2.Y - i.Point1.Y;
double dXj = j.Point2.X - j.Point1.X;
double dYj = j.Point2.Y - j.Point1.Y;
// Angle of j minus angle of i, converted from radians to degrees.
angularDifference = Distances.BoundAngle180((Math.Atan2(dYj, dXj) - Math.Atan2(dYi, dXi)) * 180 / Math.PI);
// Translate both end points of i onto j.
PdfPoint? Aj = GetTranslatedPoint(i.Point1.X, i.Point1.Y, j.Point1.X, j.Point1.Y, dXi, dYi, dXj, dYj, epsilon);
PdfPoint? Bj = GetTranslatedPoint(i.Point2.X, i.Point2.Y, j.Point2.X, j.Point2.Y, dXi, dYi, dXj, dYj, epsilon);
if (!Aj.HasValue || !Bj.HasValue)
{
// Might happen because lines are perpendicular
// or have too small lengths
normalisedOverlap = double.NaN;
perpendicularDistance = double.NaN;
return false;
}
// Get middle points
// The four points are sorted along j; the two inner ones (Cj, Dj) delimit the candidate overlap.
var ps = new[] { j.Point1, j.Point2, Aj.Value, Bj.Value };
if (dXj != 0)
{
ps = ps.OrderBy(p => p.X).ThenBy(p => p.Y).ToArray();
}
else if (dYj != 0)
{
// j has no extent along X: sort along Y instead.
ps = ps.OrderBy(p => p.Y).ToArray();
}
PdfPoint Cj = ps[1];
PdfPoint Dj = ps[2];
bool overlap = true;
// Cj and Dj should be contained within both j and [Aj,Bj] if overlapped
if (!PointInLine(j.Point1, j.Point2, Cj) || !PointInLine(j.Point1, j.Point2, Dj) ||
!PointInLine(Aj.Value, Bj.Value, Cj) || !PointInLine(Aj.Value, Bj.Value, Dj))
{
// nonoverlapped
overlap = false;
}
//double pj = Math.Sqrt((Dj.Y - Cj.Y) * (Dj.Y - Cj.Y) + (Dj.X - Cj.X) * (Dj.X - Cj.X));
// Length of [Cj, Dj], signed by the overlap flag and expressed relative to the length of j.
double pj = Distances.Euclidean(Cj, Dj);
normalisedOverlap = (overlap ? pj : -pj) / j.Length;
// Middle point of [Cj, Dj]; the perpendicular distance is measured from it.
double xMj = (Cj.X + Dj.X) / 2.0;
double yMj = (Cj.Y + Dj.Y) / 2.0;
if (!dXi.AlmostEqualsToZero(epsilon) && !dYi.AlmostEqualsToZero(epsilon))
{
// i is neither horizontal nor vertical: general formula.
perpendicularDistance = ((xMj - i.Point1.X) - (yMj - i.Point1.Y) * dXi / dYi) / Math.Sqrt(dXi * dXi / (dYi * dYi) + 1);
}
else if (dXi.AlmostEqualsToZero(epsilon))
{
// i has (almost) no extent along X: the distance is the difference in X.
perpendicularDistance = xMj - i.Point1.X;
}
else
{
// i has (almost) no extent along Y: the distance is the difference in Y.
perpendicularDistance = yMj - i.Point1.Y;
}
return overlap;
}
/// <summary>
/// Translates the point Pi, belonging to line i, onto line j: the returned point is the
/// point of line j (through Pj, with direction (dXj, dYj)) for which the vector from Pi
/// is perpendicular to line i's direction (dXi, dYi).
/// </summary>
/// <param name="xPi">X coordinate of the point of line i to translate.</param>
/// <param name="yPi">Y coordinate of the point of line i to translate.</param>
/// <param name="xPj">X coordinate of a point of line j.</param>
/// <param name="yPj">Y coordinate of a point of line j.</param>
/// <param name="dXi">X component of line i's direction.</param>
/// <param name="dYi">Y component of line i's direction.</param>
/// <param name="dXj">X component of line j's direction.</param>
/// <param name="dYj">Y component of line j's direction.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <returns>The translated point, or null when the two directions are perpendicular (no such point can be computed).</returns>
private static PdfPoint? GetTranslatedPoint(double xPi, double yPi, double xPj, double yPj, double dXi, double dYi, double dXj, double dYj, double epsilon)
{
    double dYidYj = dYi * dYj;
    double dXidXj = dXi * dXj;
    double denominator = dYidYj + dXidXj;

    if (denominator.AlmostEqualsToZero(epsilon))
    {
        // The denominator is 0 when translating points, meaning the lines are perpendicular.
        return null;
    }

    double xTj = (xPi * dXidXj + xPj * dYidYj + dXj * dYi * (yPi - yPj)) / denominator;

    double yTj;
    if (Math.Abs(dXj) > epsilon)
    {
        // Line j is not vertical: follow its slope from Pj. The absolute value matters,
        // as dXj is negative whenever line j runs from right to left (e.g. rotated text).
        yTj = dYj / dXj * (xTj - xPj) + yPj;
    }
    else
    {
        // Line j is vertical, its slope is undefined: move along j's direction instead,
        // using the parameter t such that T = Pj + t * (dXj, dYj) and (T - Pi) is perpendicular to i.
        double t = ((xPi - xPj) * dXi + (yPi - yPj) * dYi) / denominator;
        yTj = yPj + t * dYj;
    }

    return new PdfPoint(xTj, yTj);
}
/// <summary>
/// Helper function to check if the point belongs to the line.
/// <para>The point is assumed to be aligned with the line's two points; only its position
/// along the segment is checked.</para>
/// </summary>
/// <param name="pl1">Line's first point.</param>
/// <param name="pl2">Line's second point.</param>
/// <param name="point">The point to check.</param>
/// <returns>True if the point lies between <paramref name="pl1"/> and <paramref name="pl2"/> (inclusive).</returns>
private static bool PointInLine(PdfPoint pl1, PdfPoint pl2, PdfPoint point)
{
    // /!\ Assuming the points are aligned (be careful)
    double toPointX = point.X - pl1.X;
    double toPointY = point.Y - pl1.Y;
    double segmentX = pl2.X - pl1.X;
    double segmentY = pl2.Y - pl1.Y;

    // The dot product of (pl1 -> point) with (pl1 -> pl2) must be between 0 and the squared
    // length of the segment.
    double projection = toPointX * segmentX + toPointY * segmentY;
    double squaredLength = segmentX * segmentX + segmentY * segmentY;

    return projection >= 0 && projection <= squaredLength;
}
/// <summary>
/// Helper function to check if 2 lines are equal, i.e. if the coordinates of both their
/// first points and their second points differ by almost zero.
/// </summary>
/// <param name="line1">The first line.</param>
/// <param name="line2">The second line.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
private static bool AlmostEquals(PdfLine line1, PdfLine line2, double epsilon)
{
    // First points must match...
    if (!(line1.Point1.X - line2.Point1.X).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    if (!(line1.Point1.Y - line2.Point1.Y).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    // ...and so must the second points.
    if (!(line1.Point2.X - line2.Point2.X).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    return (line1.Point2.Y - line2.Point2.Y).AlmostEqualsToZero(epsilon);
}
/// <summary>
/// Computes the between-line angle between the pivot's and the candidate's centroid
/// points, relative to the pivot's rotation.
/// <para>0 ≤ θ ≤ 180.</para>
/// </summary>
/// <param name="pivot">The word the angle is measured from; its rotation is subtracted from the raw angle.</param>
/// <param name="candidate">The word the angle is measured to.</param>
private static double AngleBL(Word pivot, Word candidate)
{
    double rawAngle = Distances.Angle(pivot.BoundingBox.Centroid, candidate.BoundingBox.Centroid);
    double relativeAngle = Distances.BoundAngle180(rawAngle - pivot.BoundingBox.Rotation);

    // Negative angles are shifted by 180 so the result is kept within [0, 180] for the check.
    return relativeAngle < 0 ? relativeAngle + 180 : relativeAngle;
}
#endregion
/// <summary>
/// The bounds for the angle between two words for them to have a certain type of relationship.
@@ -314,6 +539,11 @@
/// </summary>
/// <param name="lowerBound">The lower bound of the range; it should be smaller than <paramref name="upperBound"/>.</param>
/// <param name="upperBound">The upper bound of the range.</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="lowerBound"/> is greater than or equal to <paramref name="upperBound"/>.</exception>
public AngleBounds(double lowerBound, double upperBound)
{
    // Empty or inverted ranges are rejected before anything is stored.
    bool isEmptyOrInverted = lowerBound >= upperBound;
    if (isEmptyOrInverted)
    {
        throw new ArgumentException("The lower bound should be smaller than the upper bound.");
    }

    Lower = lowerBound;
    Upper = upperBound;
}
@@ -326,5 +556,65 @@
return angle >= Lower && angle <= Upper;
}
}
/// <summary>
/// Options of the Docstrum bounding boxes page segmenter.
/// </summary>
public class DocstrumBoundingBoxesOptions : PageSegmenterOptions
{
    /// <summary>
    /// Angle bounds within which two words are considered as neighbours on the same line.
    /// <para>Default value is -30 ≤ θ ≤ 30.</para>
    /// </summary>
    public AngleBounds WithinLineBounds { get; set; } = new AngleBounds(-30, 30);

    /// <summary>
    /// Factor applied to the within-line distance found by the analysis to obtain the
    /// maximum euclidian distance between words when building lines.
    /// <para>Default value is 3.</para>
    /// </summary>
    public double WithinLineMultiplier { get; set; } = 3.0;

    /// <summary>
    /// Bin size of the within-line distances distribution.
    /// <para>Default value is 10.</para>
    /// </summary>
    public int WithinLineBinSize { get; set; } = 10;

    /// <summary>
    /// Angle bounds within which two words are considered as neighbours on separate lines.
    /// <para>Default value is 45 ≤ θ ≤ 135.</para>
    /// </summary>
    public AngleBounds BetweenLineBounds { get; set; } = new AngleBounds(45, 135);

    /// <summary>
    /// Factor applied to the between-line distance found by the analysis to obtain the
    /// maximum perpendicular distance between text lines when blocking.
    /// <para>Default value is 1.3.</para>
    /// </summary>
    public double BetweenLineMultiplier { get; set; } = 1.3;

    /// <summary>
    /// Bin size of the between-line distances distribution.
    /// <para>Default value is 10.</para>
    /// </summary>
    public int BetweenLineBinSize { get; set; } = 10;

    /// <summary>
    /// Bounds of the angular difference between two lines for them to be put in the same
    /// block, i.e. how parallel two lines need to be.
    /// <para>Default value is -30 ≤ θ ≤ 30.</para>
    /// </summary>
    public AngleBounds AngularDifferenceBounds { get; set; } = new AngleBounds(-30, 30);

    /// <summary>
    /// Precision used when testing equalities.
    /// <para>Default value is 1e-3.</para>
    /// </summary>
    public double Epsilon { get; set; } = 1e-3;
}
}
}
}