mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-11-28 17:47:12 +08:00
Clean DocstrumBoundingBoxes and make relevant static methods public. Add tests. #376
This commit is contained in:
@@ -76,8 +76,8 @@
|
|||||||
/// <param name="blBinSize">The bin size used when building the between-line distances distribution.</param>
|
/// <param name="blBinSize">The bin size used when building the between-line distances distribution.</param>
|
||||||
/// <param name="angularDifferenceBounds">The angular difference bounds between two lines to be considered in the same block. This defines if two lines are parallel enough.</param>
|
/// <param name="angularDifferenceBounds">The angular difference bounds between two lines to be considered in the same block. This defines if two lines are parallel enough.</param>
|
||||||
/// <param name="epsilon">Precision when testing equalities.</param>
|
/// <param name="epsilon">Precision when testing equalities.</param>
|
||||||
/// <param name="wordSeparator"></param>
|
/// <param name="wordSeparator">Separator used between words when building lines.</param>
|
||||||
/// <param name="lineSeparator"></param>
|
/// <param name="lineSeparator">Separator used between lines when building paragraphs.</param>
|
||||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||||
@@ -102,12 +102,19 @@
|
|||||||
maxDegreeOfParallelism,
|
maxDegreeOfParallelism,
|
||||||
out double withinLineDistance, out double betweenLineDistance))
|
out double withinLineDistance, out double betweenLineDistance))
|
||||||
{
|
{
|
||||||
if (double.IsNaN(withinLineDistance)) withinLineDistance = 0;
|
if (double.IsNaN(withinLineDistance))
|
||||||
if (double.IsNaN(betweenLineDistance)) betweenLineDistance = 0;
|
{
|
||||||
|
withinLineDistance = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (double.IsNaN(betweenLineDistance))
|
||||||
|
{
|
||||||
|
betweenLineDistance = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. Determination of Text Lines
|
// 2. Determination of Text Lines
|
||||||
double maxWithinLineDistance = wlMultiplier * withinLineDistance; //Math.Min(3 * withinLineDistance.Value, 1.4142 * betweenLineDistance.Value);
|
double maxWithinLineDistance = wlMultiplier * withinLineDistance;
|
||||||
var lines = GetLines(words, maxWithinLineDistance, wlBounds, wordSeparator, maxDegreeOfParallelism).ToArray();
|
var lines = GetLines(words, maxWithinLineDistance, wlBounds, wordSeparator, maxDegreeOfParallelism).ToArray();
|
||||||
|
|
||||||
// 3. Structural Block Determination
|
// 3. Structural Block Determination
|
||||||
@@ -118,9 +125,20 @@
|
|||||||
#region Spacing Estimation
|
#region Spacing Estimation
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Estimation of within-line and between-line spacing.
|
/// Estimation of within-line and between-line spacing.
|
||||||
|
/// <para>This is the Docstrum algorithm's 1st step.</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <returns>False if either 'withinLineDistance' or 'betweenLineDistance' is NaN.</returns>
|
/// <param name="words">The list of words.</param>
|
||||||
private static bool GetSpacingEstimation(IReadOnlyList<Word> words,
|
/// <param name="wlBounds">Angle bounds for words to be considered as neighbours on the same line.</param>
|
||||||
|
/// <param name="wlBinSize">The bin size used when building the within-line distances distribution.</param>
|
||||||
|
/// <param name="blBounds">Angle bounds for words to be considered as neighbours on separate lines.</param>
|
||||||
|
/// <param name="blBinSize">The bin size used when building the between-line distances distribution.</param>
|
||||||
|
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||||
|
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||||
|
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||||
|
/// <param name="withinLineDistance">The estimated within-line distance. Computed as the average peak value of the within-line distances distribution.</param>
|
||||||
|
/// <param name="betweenLineDistance">The estimated between-line distance. Computed as the average peak value of the between-line distances distribution.</param>
|
||||||
|
/// <returns>False if either 'withinLineDistance' or 'betweenLineDistance' is <see cref="double.NaN"/>.</returns>
|
||||||
|
public static bool GetSpacingEstimation(IReadOnlyList<Word> words,
|
||||||
AngleBounds wlBounds, int wlBinSize,
|
AngleBounds wlBounds, int wlBinSize,
|
||||||
AngleBounds blBounds, int blBinSize,
|
AngleBounds blBounds, int blBinSize,
|
||||||
int maxDegreeOfParallelism,
|
int maxDegreeOfParallelism,
|
||||||
@@ -172,7 +190,10 @@
|
|||||||
|
|
||||||
// The perpendicular distance can be negative because of the subtractions.
|
// The perpendicular distance can be negative because of the subtractions.
|
||||||
// Could occur when words are overlapping, we ignore that.
|
// Could occur when words are overlapping, we ignore that.
|
||||||
if (dist >= 0) betweenLineDistList.Add(dist);
|
if (dist >= 0)
|
||||||
|
{
|
||||||
|
betweenLineDistList.Add(dist);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -242,7 +263,19 @@
|
|||||||
#endregion
|
#endregion
|
||||||
|
|
||||||
#region Text Lines
|
#region Text Lines
|
||||||
private static IEnumerable<TextLine> GetLines(IReadOnlyList<Word> words, double maxWLDistance, AngleBounds withinLine,
|
/// <summary>
|
||||||
|
/// Get the <see cref="TextLine"/>s by grouping words using nearest neighbours.
|
||||||
|
/// <para>This is the Docstrum algorithm's 2nd step.</para>
|
||||||
|
/// </summary>
|
||||||
|
/// <param name="words">The words to segment into <see cref="TextLine"/>s.</param>
|
||||||
|
/// <param name="maxWLDistance">The maximum within-line distance. Computed as the estimated within-line spacing times the within-line multiplier in the default implementation.</param>
|
||||||
|
/// <param name="wlBounds">Angle bounds for words to be considered as neighbours on the same line.</param>
|
||||||
|
/// <param name="wordSeparator">Separator used between words when building lines.</param>
|
||||||
|
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||||
|
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||||
|
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||||
|
/// <returns>The <see cref="TextLine"/>s built.</returns>
|
||||||
|
public static IEnumerable<TextLine> GetLines(IReadOnlyList<Word> words, double maxWLDistance, AngleBounds wlBounds,
|
||||||
string wordSeparator, int maxDegreeOfParallelism)
|
string wordSeparator, int maxDegreeOfParallelism)
|
||||||
{
|
{
|
||||||
var groupedWords = Clustering.NearestNeighbours(words,
|
var groupedWords = Clustering.NearestNeighbours(words,
|
||||||
@@ -252,7 +285,7 @@
|
|||||||
pivot => pivot.BoundingBox.BottomRight,
|
pivot => pivot.BoundingBox.BottomRight,
|
||||||
candidate => candidate.BoundingBox.BottomLeft,
|
candidate => candidate.BoundingBox.BottomLeft,
|
||||||
_ => true,
|
_ => true,
|
||||||
(pivot, candidate) => withinLine.Contains(AngleWL(pivot, candidate)),
|
(pivot, candidate) => wlBounds.Contains(AngleWL(pivot, candidate)),
|
||||||
maxDegreeOfParallelism).ToList();
|
maxDegreeOfParallelism).ToList();
|
||||||
|
|
||||||
foreach (var g in groupedWords)
|
foreach (var g in groupedWords)
|
||||||
@@ -285,8 +318,28 @@
|
|||||||
#endregion
|
#endregion
|
||||||
|
|
||||||
#region Blocking
|
#region Blocking
|
||||||
private static IEnumerable<TextBlock> GetStructuralBlocks(IReadOnlyList<TextLine> lines,
|
/// <summary>
|
||||||
double maxBLDistance, AngleBounds angularDifference, double epsilon, string lineSeparator, int maxDegreeOfParallelism)
|
/// Get the <see cref="TextBlock"/>s.
|
||||||
|
/// <para>This is the Docstrum algorithm's 3rd and final step.</para>
|
||||||
|
/// <para>
|
||||||
|
/// Method: We want to measure the distance between two lines using the following method:
|
||||||
|
/// <br>- We check if two lines are overlapping horizontally and compute the perpendicular distance.</br>
|
||||||
|
/// <br>- We check if the angle between the two lines is within 'angularDifferenceBounds'.</br>
|
||||||
|
/// <br>- If the two lines are not overlapping or the angle is too wide, the distance is set to infinity.</br>
|
||||||
|
/// <para>If two text lines are approximately parallel, close in perpendicular distance, and they either overlap to some specified degree or are separated by only a small distance in parallel distance, then they are said to meet the criteria to belong to the same structural block.</para>
|
||||||
|
/// </para>
|
||||||
|
/// </summary>
|
||||||
|
/// <param name="lines">The lines to segment into <see cref="TextBlock"/>s.</param>
|
||||||
|
/// <param name="maxBLDistance">The maximum between-line distance. Computed as the estimated between-line spacing times the between-line multiplier in the default implementation.</param>
|
||||||
|
/// <param name="angularDifferenceBounds">The angular difference bounds between two lines to be considered in the same block. This defines if two lines are parallel enough.</param>
|
||||||
|
/// <param name="epsilon">Precision when testing equalities.</param>
|
||||||
|
/// <param name="lineSeparator">Separator used between lines when building paragraphs.</param>
|
||||||
|
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||||
|
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||||
|
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||||
|
/// <returns>The <see cref="TextBlock"/>s built.</returns>
|
||||||
|
public static IEnumerable<TextBlock> GetStructuralBlocks(IReadOnlyList<TextLine> lines,
|
||||||
|
double maxBLDistance, AngleBounds angularDifferenceBounds, double epsilon, string lineSeparator, int maxDegreeOfParallelism)
|
||||||
{
|
{
|
||||||
/******************************************************************************************************
|
/******************************************************************************************************
|
||||||
* We want to measure the distance between two lines using the following method:
|
* We want to measure the distance between two lines using the following method:
|
||||||
@@ -301,7 +354,7 @@
|
|||||||
|
|
||||||
var groupedLines = Clustering.NearestNeighbours(
|
var groupedLines = Clustering.NearestNeighbours(
|
||||||
lines,
|
lines,
|
||||||
(l1, l2) => PerpendicularOverlappingDistance(l1, l2, angularDifference, epsilon),
|
(l1, l2) => PerpendicularOverlappingDistance(l1, l2, angularDifferenceBounds, epsilon),
|
||||||
(_, __) => maxBLDistance,
|
(_, __) => maxBLDistance,
|
||||||
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
|
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
|
||||||
candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
|
candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
|
||||||
@@ -460,7 +513,6 @@
|
|||||||
}
|
}
|
||||||
else // If dXj = 0, then yAj is calculated first, and xAj is calculated from that.
|
else // If dXj = 0, then yAj is calculated first, and xAj is calculated from that.
|
||||||
{
|
{
|
||||||
// TODO: check that
|
|
||||||
yAj = (yPi * dYidYj + yPj * dXidXj + dYj * dXi * (xPi - xPj)) / denominator;
|
yAj = (yPi * dYidYj + yPj * dXidXj + dYj * dXi * (xPi - xPj)) / denominator;
|
||||||
xAj = xPj;
|
xAj = xPj;
|
||||||
}
|
}
|
||||||
@@ -483,9 +535,7 @@
|
|||||||
double by = pl2.Y - pl1.Y;
|
double by = pl2.Y - pl1.Y;
|
||||||
|
|
||||||
double dotProd1 = ax * bx + ay * by;
|
double dotProd1 = ax * bx + ay * by;
|
||||||
if (dotProd1 < 0) return false;
|
return dotProd1 >= 0 && dotProd1 <= (bx * bx + by * by);
|
||||||
|
|
||||||
return dotProd1 <= (bx * bx + by * by);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
|||||||
@@ -100,5 +100,74 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Runs the three Docstrum steps through the public static methods of
/// <c>DocstrumBoundingBoxes</c> (spacing estimation, text lines, structural blocks)
/// and checks the text of the blocks obtained for the first page of the document.
/// </summary>
/// <param name="name">File name of the test document, resolved with <c>DlaHelper.GetDocumentPath</c>.</param>
/// <param name="expected">Expected block texts, listed in the order obtained by sorting the blocks
/// by bottom-left X, then by descending bottom-left Y.</param>
[SkippableTheory]
[MemberData(nameof(DataExtract))]
public void GetBlocksStatic(string name, string[] expected)
{
    if (name == "90 180 270 rotated.pdf")
    {
        // This document uses the 'TimesNewRomanPSMT' font, so its results cannot be trusted on
        // platforms where that font is not generally available (OSX, Linux, ...): skip it there.
        Skip.If(SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT") == null,
            "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment.");
    }

    var options = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions() { LineSeparator = " " };

    using (var document = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
    {
        // Words of the first page, with the white-space-only ones filtered out.
        var words = NearestNeighbourWordExtractor.Instance.GetWords(document.GetPage(1).Letters)
            .Where(word => !string.IsNullOrWhiteSpace(word.Text))
            .ToList();

        // Options that are needed by more than one step are read once.
        var withinLineBounds = options.WithinLineBounds;
        var parallelism = options.MaxDegreeOfParallelism;

        // 1. Estimate within line and between line spacing
        bool hasEstimation = DocstrumBoundingBoxes.GetSpacingEstimation(
            words,
            withinLineBounds, options.WithinLineBinSize,
            options.BetweenLineBounds, options.BetweenLineBinSize,
            parallelism,
            out double withinLineDistance, out double betweenLineDistance);

        if (!hasEstimation)
        {
            // A distance that could not be estimated (NaN) falls back to 0.
            withinLineDistance = double.IsNaN(withinLineDistance) ? 0 : withinLineDistance;
            betweenLineDistance = double.IsNaN(betweenLineDistance) ? 0 : betweenLineDistance;
        }

        // 2. Determination of Text Lines
        var lines = DocstrumBoundingBoxes.GetLines(
            words,
            options.WithinLineMultiplier * withinLineDistance,
            withinLineBounds,
            options.WordSeparator,
            parallelism).ToArray();

        // 3. Structural Block Determination
        var blocks = DocstrumBoundingBoxes.GetStructuralBlocks(
            lines,
            options.BetweenLineMultiplier * betweenLineDistance,
            options.AngularDifferenceBounds,
            options.Epsilon,
            options.LineSeparator,
            parallelism).ToList();

        Assert.Equal(expected.Length, blocks.Count);

        // Same ordering as the expected data: left to right, then top to bottom.
        var orderedBlocks = blocks
            .OrderBy(block => block.BoundingBox.BottomLeft.X)
            .ThenByDescending(block => block.BoundingBox.BottomLeft.Y)
            .ToList();

        int index = 0;
        foreach (var block in orderedBlocks)
        {
            Assert.Equal(expected[index], block.Text);
            index++;
        }
    }
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user