diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
index 290fa8d8..6ca7ebf1 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
@@ -76,8 +76,8 @@
/// The bin size used when building the between-line distances distribution.
/// The angular difference bounds between two lines to be considered in the same block. This defines if two lines are parallel enough.
/// Precision when testing equalities.
- ///
- ///
+ /// Separator used between words when building lines.
+ /// Separator used between lines when building paragraphs.
/// Sets the maximum number of concurrent tasks enabled.
/// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
@@ -102,12 +102,19 @@
maxDegreeOfParallelism,
out double withinLineDistance, out double betweenLineDistance))
{
- if (double.IsNaN(withinLineDistance)) withinLineDistance = 0;
- if (double.IsNaN(betweenLineDistance)) betweenLineDistance = 0;
+ if (double.IsNaN(withinLineDistance))
+ {
+ withinLineDistance = 0;
+ }
+
+ if (double.IsNaN(betweenLineDistance))
+ {
+ betweenLineDistance = 0;
+ }
}
// 2. Determination of Text Lines
- double maxWithinLineDistance = wlMultiplier * withinLineDistance; //Math.Min(3 * withinLineDistance.Value, 1.4142 * betweenLineDistance.Value);
+ double maxWithinLineDistance = wlMultiplier * withinLineDistance;
var lines = GetLines(words, maxWithinLineDistance, wlBounds, wordSeparator, maxDegreeOfParallelism).ToArray();
// 3. Structural Block Determination
@@ -118,9 +125,20 @@
#region Spacing Estimation
///
/// Estimation of within-line and between-line spacing.
+ /// This is the Docstrum algorithm's 1st step.
///
- /// False if either 'withinLineDistance' or 'betweenLineDistance' is NaN.
- private static bool GetSpacingEstimation(IReadOnlyList words,
+ /// The list of words.
+ /// Angle bounds for words to be considered as neighbours on the same line.
+ /// The bin size used when building the within-line distances distribution.
+ /// Angle bounds for words to be considered as neighbours on separate lines.
+ /// The bin size used when building the between-line distances distribution.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.
+ /// The estimated within-line distance. Computed as the average peak value of distribution.
+ /// The estimated between-line distance. Computed as the average peak value of distribution.
/// False if either 'withinLineDistance' or 'betweenLineDistance' is NaN.
+ public static bool GetSpacingEstimation(IReadOnlyList words,
AngleBounds wlBounds, int wlBinSize,
AngleBounds blBounds, int blBinSize,
int maxDegreeOfParallelism,
@@ -172,7 +190,10 @@
// The perpendicular distance can be negative because of the subtractions.
// Could occur when words are overlapping, we ignore that.
- if (dist >= 0) betweenLineDistList.Add(dist);
+ if (dist >= 0)
+ {
+ betweenLineDistList.Add(dist);
+ }
}
}
});
@@ -242,7 +263,19 @@
#endregion
#region Text Lines
- private static IEnumerable GetLines(IReadOnlyList words, double maxWLDistance, AngleBounds withinLine,
+ ///
/// Get the text lines by grouping words using nearest neighbours.
+ /// This is the Docstrum algorithm's 2nd step.
+ ///
/// The words to segment into text lines.
+ /// The maximum within-line distance. Computed as the estimated within-line spacing times the within-line multiplier in the default implementation.
+ /// Angle bounds for words to be considered as neighbours on the same line.
+ /// Separator used between words when building lines.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.
/// The text lines built.
+ public static IEnumerable GetLines(IReadOnlyList words, double maxWLDistance, AngleBounds wlBounds,
string wordSeparator, int maxDegreeOfParallelism)
{
var groupedWords = Clustering.NearestNeighbours(words,
@@ -252,7 +285,7 @@
pivot => pivot.BoundingBox.BottomRight,
candidate => candidate.BoundingBox.BottomLeft,
_ => true,
- (pivot, candidate) => withinLine.Contains(AngleWL(pivot, candidate)),
+ (pivot, candidate) => wlBounds.Contains(AngleWL(pivot, candidate)),
maxDegreeOfParallelism).ToList();
foreach (var g in groupedWords)
@@ -285,8 +318,28 @@
#endregion
#region Blocking
- private static IEnumerable GetStructuralBlocks(IReadOnlyList lines,
- double maxBLDistance, AngleBounds angularDifference, double epsilon, string lineSeparator, int maxDegreeOfParallelism)
+ ///
/// Get the structural blocks.
+ /// This is the Docstrum algorithm's 3rd and final step.
+ ///
+ /// Method: We want to measure the distance between two lines using the following method:
+ /// - We check if two lines are overlapping horizontally and compute the perpendicular distance.
/// - We check if the angle between the two lines is within 'angularDifference'.
/// - If the two lines are not overlapping or the angle is too wide, the distance is set to infinity.
+ /// If two text lines are approximately parallel, close in perpendicular distance, and they either overlap to some specified degree or are separated by only a small distance in parallel distance, then they are said to meet the criteria to belong to the same structural block.
+ ///
+ ///
/// The lines to segment into structural blocks.
+ /// The maximum between-line distance. Computed as the estimated between-line spacing times the between-line multiplier in the default implementation.
+ /// The angular difference bounds between two lines to be considered in the same block. This defines if two lines are parallel enough.
+ /// Precision when testing equalities.
+ /// Separator used between lines when building paragraphs.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.
/// The structural blocks built.
+ public static IEnumerable GetStructuralBlocks(IReadOnlyList lines,
+ double maxBLDistance, AngleBounds angularDifferenceBounds, double epsilon, string lineSeparator, int maxDegreeOfParallelism)
{
/******************************************************************************************************
* We want to measure the distance between two lines using the following method:
@@ -301,7 +354,7 @@
var groupedLines = Clustering.NearestNeighbours(
lines,
- (l1, l2) => PerpendicularOverlappingDistance(l1, l2, angularDifference, epsilon),
+ (l1, l2) => PerpendicularOverlappingDistance(l1, l2, angularDifferenceBounds, epsilon),
(_, __) => maxBLDistance,
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
@@ -460,7 +513,6 @@
}
else // If dXj = 0, then yAj is calculated first, and xAj is calculated from that.
{
- // TODO: check that
yAj = (yPi * dYidYj + yPj * dXidXj + dYj * dXi * (xPi - xPj)) / denominator;
xAj = xPj;
}
@@ -483,9 +535,7 @@
double by = pl2.Y - pl1.Y;
double dotProd1 = ax * bx + ay * by;
- if (dotProd1 < 0) return false;
-
- return dotProd1 <= (bx * bx + by * by);
+ return dotProd1 >= 0 && dotProd1 <= (bx * bx + by * by);
}
///
diff --git a/src/UglyToad.PdfPig.Tests/Dla/DocstrumBoundingBoxesTests.cs b/src/UglyToad.PdfPig.Tests/Dla/DocstrumBoundingBoxesTests.cs
index 831c1215..b1c48000 100644
--- a/src/UglyToad.PdfPig.Tests/Dla/DocstrumBoundingBoxesTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Dla/DocstrumBoundingBoxesTests.cs
@@ -1,104 +1,173 @@
-namespace UglyToad.PdfPig.Tests.Dla
-{
- using System.Collections.Generic;
- using System.Linq;
- using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
+namespace UglyToad.PdfPig.Tests.Dla
+{
+ using System.Collections.Generic;
+ using System.Linq;
+ using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
using UglyToad.PdfPig.Fonts.SystemFonts;
- using Xunit;
-
- public class DocstrumBoundingBoxesTests
- {
- public static IEnumerable