Clean DocstrumBoundingBoxes and make relevant static methods public. Add tests. #376

This commit is contained in:
BobLD
2021-10-24 11:44:24 +01:00
parent 615e8964e9
commit 7ba28d2b56
2 changed files with 232 additions and 113 deletions

View File

@@ -76,8 +76,8 @@
/// <param name="blBinSize">The bin size used when building the between-line distances distribution.</param> /// <param name="blBinSize">The bin size used when building the between-line distances distribution.</param>
/// <param name="angularDifferenceBounds">The angular difference bounds between two lines to be considered in the same block. This defines if two lines are parallel enough.</param> /// <param name="angularDifferenceBounds">The angular difference bounds between two lines to be considered in the same block. This defines if two lines are parallel enough.</param>
/// <param name="epsilon">Precision when testing equalities.</param> /// <param name="epsilon">Precision when testing equalities.</param>
/// <param name="wordSeparator"></param> /// <param name="wordSeparator">Separator used between words when building lines.</param>
/// <param name="lineSeparator"></param> /// <param name="lineSeparator">Separator used between lines when building paragraphs.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled. /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value. /// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param> /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
@@ -102,12 +102,19 @@
maxDegreeOfParallelism, maxDegreeOfParallelism,
out double withinLineDistance, out double betweenLineDistance)) out double withinLineDistance, out double betweenLineDistance))
{ {
if (double.IsNaN(withinLineDistance)) withinLineDistance = 0; if (double.IsNaN(withinLineDistance))
if (double.IsNaN(betweenLineDistance)) betweenLineDistance = 0; {
withinLineDistance = 0;
}
if (double.IsNaN(betweenLineDistance))
{
betweenLineDistance = 0;
}
} }
// 2. Determination of Text Lines // 2. Determination of Text Lines
double maxWithinLineDistance = wlMultiplier * withinLineDistance; //Math.Min(3 * withinLineDistance.Value, 1.4142 * betweenLineDistance.Value); double maxWithinLineDistance = wlMultiplier * withinLineDistance;
var lines = GetLines(words, maxWithinLineDistance, wlBounds, wordSeparator, maxDegreeOfParallelism).ToArray(); var lines = GetLines(words, maxWithinLineDistance, wlBounds, wordSeparator, maxDegreeOfParallelism).ToArray();
// 3. Structural Block Determination // 3. Structural Block Determination
@@ -118,9 +125,20 @@
#region Spacing Estimation #region Spacing Estimation
/// <summary> /// <summary>
/// Estimation of within-line and between-line spacing. /// Estimation of within-line and between-line spacing.
/// <para>This is the Docstrum algorithm's 1st step.</para>
/// </summary> /// </summary>
/// <returns>False if either 'withinLineDistance' or 'betweenLineDistance' is NaN.</returns> /// <param name="words">The list of words.</param>
private static bool GetSpacingEstimation(IReadOnlyList<Word> words, /// <param name="wlBounds">Angle bounds for words to be considered as neighbours on the same line.</param>
/// <param name="wlBinSize">The bin size used when building the within-line distances distribution.</param>
/// <param name="blBounds">Angle bounds for words to be considered as neighbours on separate lines.</param>
/// <param name="blBinSize">The bin size used when building the between-line distances distribution.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <param name="withinLineDistance">The estimated within-line distance. Computed as the average peak value of the within-line distances distribution.</param>
/// <param name="betweenLineDistance">The estimated between-line distance. Computed as the average peak value of the between-line distances distribution.</param>
/// <returns>False if either 'withinLineDistance' or 'betweenLineDistance' is <see cref="double.NaN"/>.</returns>
public static bool GetSpacingEstimation(IReadOnlyList<Word> words,
AngleBounds wlBounds, int wlBinSize, AngleBounds wlBounds, int wlBinSize,
AngleBounds blBounds, int blBinSize, AngleBounds blBounds, int blBinSize,
int maxDegreeOfParallelism, int maxDegreeOfParallelism,
@@ -172,7 +190,10 @@
// The perpendicular distance can be negative because of the subtractions. // The perpendicular distance can be negative because of the subtractions.
// Could occur when words are overlapping, we ignore that. // Could occur when words are overlapping, we ignore that.
if (dist >= 0) betweenLineDistList.Add(dist); if (dist >= 0)
{
betweenLineDistList.Add(dist);
}
} }
} }
}); });
@@ -242,7 +263,19 @@
#endregion #endregion
#region Text Lines #region Text Lines
private static IEnumerable<TextLine> GetLines(IReadOnlyList<Word> words, double maxWLDistance, AngleBounds withinLine, /// <summary>
/// Get the <see cref="TextLine"/>s by grouping words using nearest neighbours.
/// <para>This is the Docstrum algorithm's 2nd step.</para>
/// </summary>
/// <param name="words">The words to segment into <see cref="TextLine"/>s.</param>
/// <param name="maxWLDistance">The maximum within-line distance. Computed as the estimated within-line spacing times the within-line multiplier in the default implementation.</param>
/// <param name="wlBounds">Angle bounds for words to be considered as neighbours on the same line.</param>
/// <param name="wordSeparator">Separator used between words when building lines.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The <see cref="TextLine"/>s built.</returns>
public static IEnumerable<TextLine> GetLines(IReadOnlyList<Word> words, double maxWLDistance, AngleBounds wlBounds,
string wordSeparator, int maxDegreeOfParallelism) string wordSeparator, int maxDegreeOfParallelism)
{ {
var groupedWords = Clustering.NearestNeighbours(words, var groupedWords = Clustering.NearestNeighbours(words,
@@ -252,7 +285,7 @@
pivot => pivot.BoundingBox.BottomRight, pivot => pivot.BoundingBox.BottomRight,
candidate => candidate.BoundingBox.BottomLeft, candidate => candidate.BoundingBox.BottomLeft,
_ => true, _ => true,
(pivot, candidate) => withinLine.Contains(AngleWL(pivot, candidate)), (pivot, candidate) => wlBounds.Contains(AngleWL(pivot, candidate)),
maxDegreeOfParallelism).ToList(); maxDegreeOfParallelism).ToList();
foreach (var g in groupedWords) foreach (var g in groupedWords)
@@ -285,8 +318,28 @@
#endregion #endregion
#region Blocking #region Blocking
private static IEnumerable<TextBlock> GetStructuralBlocks(IReadOnlyList<TextLine> lines, /// <summary>
double maxBLDistance, AngleBounds angularDifference, double epsilon, string lineSeparator, int maxDegreeOfParallelism) /// Get the <see cref="TextBlock"/>s.
/// <para>This is the Docstrum algorithm's 3rd and final step.</para>
/// <para>
/// Method: We want to measure the distance between two lines using the following method:
/// <br>- We check if two lines are overlapping horizontally and compute the perpendicular distance.</br>
/// <br>- We check if the angle between the two lines is within 'angularDifferenceBounds'.</br>
/// <br>- If the two lines are not overlapping or the angle is too wide, the distance is set to infinity.</br>
/// <para>If two text lines are approximately parallel, close in perpendicular distance, and they either overlap to some specified degree or are separated by only a small distance in parallel distance, then they are said to meet the criteria to belong to the same structural block.</para>
/// </para>
/// </summary>
/// <param name="lines">The lines to segment into <see cref="TextBlock"/>s.</param>
/// <param name="maxBLDistance">The maximum between-line distance. Computed as the estimated between-line spacing times the between-line multiplier in the default implementation.</param>
/// <param name="angularDifferenceBounds">The angular difference bounds between two lines to be considered in the same block. This defines if two lines are parallel enough.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <param name="lineSeparator">Separator used between lines when building paragraphs.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The <see cref="TextBlock"/>s built.</returns>
public static IEnumerable<TextBlock> GetStructuralBlocks(IReadOnlyList<TextLine> lines,
double maxBLDistance, AngleBounds angularDifferenceBounds, double epsilon, string lineSeparator, int maxDegreeOfParallelism)
{ {
/****************************************************************************************************** /******************************************************************************************************
* We want to measure the distance between two lines using the following method: * We want to measure the distance between two lines using the following method:
@@ -301,7 +354,7 @@
var groupedLines = Clustering.NearestNeighbours( var groupedLines = Clustering.NearestNeighbours(
lines, lines,
(l1, l2) => PerpendicularOverlappingDistance(l1, l2, angularDifference, epsilon), (l1, l2) => PerpendicularOverlappingDistance(l1, l2, angularDifferenceBounds, epsilon),
(_, __) => maxBLDistance, (_, __) => maxBLDistance,
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight), pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight), candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
@@ -460,7 +513,6 @@
} }
else // If dXj = 0, then yAj is calculated first, and xAj is calculated from that. else // If dXj = 0, then yAj is calculated first, and xAj is calculated from that.
{ {
// TODO: check that
yAj = (yPi * dYidYj + yPj * dXidXj + dYj * dXi * (xPi - xPj)) / denominator; yAj = (yPi * dYidYj + yPj * dXidXj + dYj * dXi * (xPi - xPj)) / denominator;
xAj = xPj; xAj = xPj;
} }
@@ -483,9 +535,7 @@
double by = pl2.Y - pl1.Y; double by = pl2.Y - pl1.Y;
double dotProd1 = ax * bx + ay * by; double dotProd1 = ax * bx + ay * by;
if (dotProd1 < 0) return false; return dotProd1 >= 0 && dotProd1 <= (bx * bx + by * by);
return dotProd1 <= (bx * bx + by * by);
} }
/// <summary> /// <summary>

View File

@@ -1,104 +1,173 @@
namespace UglyToad.PdfPig.Tests.Dla namespace UglyToad.PdfPig.Tests.Dla
{ {
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq; using System.Linq;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor; using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
using UglyToad.PdfPig.Fonts.SystemFonts; using UglyToad.PdfPig.Fonts.SystemFonts;
using Xunit; using Xunit;
public class DocstrumBoundingBoxesTests public class DocstrumBoundingBoxesTests
{ {
public static IEnumerable<object[]> DataExtract => new[] public static IEnumerable<object[]> DataExtract => new[]
{ {
new object[] new object[]
{ {
"complex rotated.pdf", "complex rotated.pdf",
new string[] new string[]
{ {
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Suspendisse at ullamcorper libero. Cras sit amet dui laoreet tellus tristique commodo. Nam pretium id ligula ac malesuada. Mauris at lacinia magna. Curabitur ex lectus, lobortis lobortis turpis ac, congue aliquet quam. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Etiam consectetur sem et ex sagittis pretium. Praesent urna velit, mollis vitae ex vel, hendrerit finibus mauris. In at luctus orci. Nunc odio justo, rhoncus et euismod nec, bibendum vitae lacus. Aenean maximus sapien lacus, ut pellentesque tellus egestas eget. Nulla semper massa ut vehicula faucibus. Nam rhoncus, dolor consectetur pulvinar gravida, nisi sem luctus nibh, non venenatis nunc lorem et velit.", "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Suspendisse at ullamcorper libero. Cras sit amet dui laoreet tellus tristique commodo. Nam pretium id ligula ac malesuada. Mauris at lacinia magna. Curabitur ex lectus, lobortis lobortis turpis ac, congue aliquet quam. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Etiam consectetur sem et ex sagittis pretium. Praesent urna velit, mollis vitae ex vel, hendrerit finibus mauris. In at luctus orci. Nunc odio justo, rhoncus et euismod nec, bibendum vitae lacus. Aenean maximus sapien lacus, ut pellentesque tellus egestas eget. Nulla semper massa ut vehicula faucibus. Nam rhoncus, dolor consectetur pulvinar gravida, nisi sem luctus nibh, non venenatis nunc lorem et velit.",
"Morbi euismod mattis libero, nec porta neque aliquam et. Nunc sed felis id libero tincidunt malesuada et laoreet orci. Phasellus massa libero, cursus imperdiet rhoncus quis, consequat eu eros. Nullam imperdiet felis sed ligula faucibus bibendum. Vestibulum rhoncus metus eu congue cursus. Maecenas vulputate dignissim dolor a iaculis. Praesent vel diam congue, dapibus lorem nec, viverra dolor. Vestibulum quis odio a risus semper aliquam.", "Morbi euismod mattis libero, nec porta neque aliquam et. Nunc sed felis id libero tincidunt malesuada et laoreet orci. Phasellus massa libero, cursus imperdiet rhoncus quis, consequat eu eros. Nullam imperdiet felis sed ligula faucibus bibendum. Vestibulum rhoncus metus eu congue cursus. Maecenas vulputate dignissim dolor a iaculis. Praesent vel diam congue, dapibus lorem nec, viverra dolor. Vestibulum quis odio a risus semper aliquam.",
"Mauris tincidunt massa id lorem consectetur, in vestibulum nibh feugiat. Quisque eget commodo tortor. Duis iaculis, urna eget porttitor consectetur, metus lacus tempus urna, vehicula facilisis quam est scelerisque dui. Vestibulum imperdiet, tellus vel vulputate pretium, dolor mauris aliquet erat, sit amet fringilla nisi dui ut felis.", "Mauris tincidunt massa id lorem consectetur, in vestibulum nibh feugiat. Quisque eget commodo tortor. Duis iaculis, urna eget porttitor consectetur, metus lacus tempus urna, vehicula facilisis quam est scelerisque dui. Vestibulum imperdiet, tellus vel vulputate pretium, dolor mauris aliquet erat, sit amet fringilla nisi dui ut felis.",
"Cras gravida vel risus sit amet sagittis. Vestibulum et purus pretium, accumsan turpis ac, consectetur augue. Nam viverra purus in urna mollis eleifend. Donec non imperdiet justo. In commodo tortor in diam feugiat, eget placerat augue posuere. Donec justo arcu, rutrum in massa quis, dictum condimentum risus. Nunc euismod et dolor at elementum. Duis pretium risus rhoncus mauris pulvinar, vel semper elit tempus. Quisque imperdiet, odio et hendrerit laoreet, justo dolor blandit sapien, ut mollis risus elit sit amet lacus. Vivamus id tortor eleifend, gravida tortor vitae, dignissim mauris. Integer efficitur ac neque id venenatis. Suspendisse pharetra neque sit amet ornare convallis. Sed eget eros dignissim risus eleifend elementum. Duis non bibendum ipsum.", "Cras gravida vel risus sit amet sagittis. Vestibulum et purus pretium, accumsan turpis ac, consectetur augue. Nam viverra purus in urna mollis eleifend. Donec non imperdiet justo. In commodo tortor in diam feugiat, eget placerat augue posuere. Donec justo arcu, rutrum in massa quis, dictum condimentum risus. Nunc euismod et dolor at elementum. Duis pretium risus rhoncus mauris pulvinar, vel semper elit tempus. Quisque imperdiet, odio et hendrerit laoreet, justo dolor blandit sapien, ut mollis risus elit sit amet lacus. Vivamus id tortor eleifend, gravida tortor vitae, dignissim mauris. Integer efficitur ac neque id venenatis. Suspendisse pharetra neque sit amet ornare convallis. Sed eget eros dignissim risus eleifend elementum. Duis non bibendum ipsum.",
"Morbi euismod mattis libero, nec porta neque aliquam et. Nunc sed felis id libero tincidunt malesuada et laoreet orci. Phasellus massa libero, cursus imperdiet rhoncus quis, consequat eu eros. Nullam imperdiet felis sed ligula faucibus bibendum. Vestibulum rhoncus metus eu congue cursus. Maecenas vulputate dignissim dolor a iaculis. Praesent vel diam congue, dapibus lorem nec, viverra dolor. Vestibulum quis odio a risus semper aliquam." "Morbi euismod mattis libero, nec porta neque aliquam et. Nunc sed felis id libero tincidunt malesuada et laoreet orci. Phasellus massa libero, cursus imperdiet rhoncus quis, consequat eu eros. Nullam imperdiet felis sed ligula faucibus bibendum. Vestibulum rhoncus metus eu congue cursus. Maecenas vulputate dignissim dolor a iaculis. Praesent vel diam congue, dapibus lorem nec, viverra dolor. Vestibulum quis odio a risus semper aliquam."
} }
}, },
new object[] new object[]
{ {
"90 180 270 rotated.pdf", "90 180 270 rotated.pdf",
new string[] new string[]
{ {
"Morbi euismod mattis libero, nec porta neque aliquam et. Nunc sed felis id libero tincidunt malesuada et laoreet orci. Phasellus massa libero, cursus imperdiet rhoncus quis, consequat eu eros. Nullam imperdiet felis sed ligula faucibus bibendum. Vestibulum rhoncus metus eu congue cursus. Maecenas vulputate dignissim dolor a iaculis. Praesent vel diam congue, dapibus lorem nec, viverra dolor. Vestibulum quis odio a risus semper aliquam.", "Morbi euismod mattis libero, nec porta neque aliquam et. Nunc sed felis id libero tincidunt malesuada et laoreet orci. Phasellus massa libero, cursus imperdiet rhoncus quis, consequat eu eros. Nullam imperdiet felis sed ligula faucibus bibendum. Vestibulum rhoncus metus eu congue cursus. Maecenas vulputate dignissim dolor a iaculis. Praesent vel diam congue, dapibus lorem nec, viverra dolor. Vestibulum quis odio a risus semper aliquam.",
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Suspendisse at ullamcorper libero. Cras sit amet dui laoreet tellus tristique commodo. Nam pretium id ligula ac malesuada. Mauris at lacinia magna. Curabitur ex lectus, lobortis lobortis turpis ac, congue aliquet quam. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Etiam consectetur sem et ex sagittis pretium. Praesent urna velit, mollis vitae ex vel, hendrerit finibus mauris. In at luctus orci. Nunc odio justo, rhoncus et euismod nec, bibendum vitae lacus. Aenean maximus sapien lacus, ut pellentesque tellus egestas eget. Nulla semper massa ut vehicula faucibus. Nam rhoncus, dolor consectetur pulvinar gravida, nisi sem luctus nibh, non venenatis nunc lorem et velit.", "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Suspendisse at ullamcorper libero. Cras sit amet dui laoreet tellus tristique commodo. Nam pretium id ligula ac malesuada. Mauris at lacinia magna. Curabitur ex lectus, lobortis lobortis turpis ac, congue aliquet quam. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Etiam consectetur sem et ex sagittis pretium. Praesent urna velit, mollis vitae ex vel, hendrerit finibus mauris. In at luctus orci. Nunc odio justo, rhoncus et euismod nec, bibendum vitae lacus. Aenean maximus sapien lacus, ut pellentesque tellus egestas eget. Nulla semper massa ut vehicula faucibus. Nam rhoncus, dolor consectetur pulvinar gravida, nisi sem luctus nibh, non venenatis nunc lorem et velit.",
"Cras gravida vel risus sit amet sagittis. Vestibulum et purus pretium, accumsan turpis ac, consectetur augue. Nam viverra purus in urna mollis eleifend. Donec non imperdiet justo. In commodo tortor in diam feugiat, eget placerat augue posuere. Donec justo arcu, rutrum in massa quis, dictum condimentum risus. Nunc euismod et dolor at elementum. Duis pretium risus rhoncus mauris pulvinar, vel semper elit tempus. Quisque imperdiet, odio et hendrerit laoreet, justo dolor blandit sapien, ut mollis risus elit sit amet lacus. Vivamus id tortor eleifend, gravida tortor vitae, dignissim mauris. Integer efficitur ac neque id venenatis. Suspendisse pharetra neque sit amet ornare convallis. Sed eget eros dignissim risus eleifend elementum. Duis non bibendum ipsum.", "Cras gravida vel risus sit amet sagittis. Vestibulum et purus pretium, accumsan turpis ac, consectetur augue. Nam viverra purus in urna mollis eleifend. Donec non imperdiet justo. In commodo tortor in diam feugiat, eget placerat augue posuere. Donec justo arcu, rutrum in massa quis, dictum condimentum risus. Nunc euismod et dolor at elementum. Duis pretium risus rhoncus mauris pulvinar, vel semper elit tempus. Quisque imperdiet, odio et hendrerit laoreet, justo dolor blandit sapien, ut mollis risus elit sit amet lacus. Vivamus id tortor eleifend, gravida tortor vitae, dignissim mauris. Integer efficitur ac neque id venenatis. Suspendisse pharetra neque sit amet ornare convallis. Sed eget eros dignissim risus eleifend elementum. Duis non bibendum ipsum.",
} }
}, },
new object[] new object[]
{ {
"Random 2 Columns Lists Hyph - Justified.pdf", "Random 2 Columns Lists Hyph - Justified.pdf",
new string[] new string[]
{ {
"Random Big Title", "Random Big Title",
"Lorem Ipsum text with lists Lorem ipsum dolor sit amet, consectetur adipiscing elit. In sodales gravida felis, in rhoncus velit rutrum at. Curabitur hendrerit dapibus nulla, ut hendrerit diam imperdiet quis. Pellentesque id neque ali- quam, pulvinar neque in, vulputate elit. Pel- lentesque ut erat sit amet massa suscipit ullamcor- per. Sed porttitor viverra convallis. Duis vitae sem- per metus. Pellentesque eros purus, egestas eget velit eget, elementum aliquet velit. Suspendisse potenti. Nulla vitae massa rutrum, blandit erat vi- tae, aliquet arcu.", "Lorem Ipsum text with lists Lorem ipsum dolor sit amet, consectetur adipiscing elit. In sodales gravida felis, in rhoncus velit rutrum at. Curabitur hendrerit dapibus nulla, ut hendrerit diam imperdiet quis. Pellentesque id neque ali- quam, pulvinar neque in, vulputate elit. Pel- lentesque ut erat sit amet massa suscipit ullamcor- per. Sed porttitor viverra convallis. Duis vitae sem- per metus. Pellentesque eros purus, egestas eget velit eget, elementum aliquet velit. Suspendisse potenti. Nulla vitae massa rutrum, blandit erat vi- tae, aliquet arcu.",
"Aenean feugiat leo sed enim sodales vehicula. Sus- pendisse tempus hendrerit magna sagittis dictum. Duis ultrices dapibus egestas. Cras eu felis eu lectus suscipit pharetra at at lacus. Nulla facilisi. Proin in- terdum faucibus elit nec rhoncus. Proin sodaless metus sed tincidunt hendrerit.", "Aenean feugiat leo sed enim sodales vehicula. Sus- pendisse tempus hendrerit magna sagittis dictum. Duis ultrices dapibus egestas. Cras eu felis eu lectus suscipit pharetra at at lacus. Nulla facilisi. Proin in- terdum faucibus elit nec rhoncus. Proin sodaless metus sed tincidunt hendrerit.",
"Donec ultricies cursus odio sed rutrum. Nam ven- enatis metus vitae elementum scelerisque. Ali- quam tempor sapien at turpis posuere eleifend. Sed placerat posuere nunc vel efficitur. Quisque auctor felis vel lectus dictum fringilla. Quisque vo- lutpat pulvinar© elit. Aliquam ultrices feugiat ali- quam. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Sus- pendisse imperdiet ex lorem, porta bibendum pu- rus ultricies id.", "Donec ultricies cursus odio sed rutrum. Nam ven- enatis metus vitae elementum scelerisque. Ali- quam tempor sapien at turpis posuere eleifend. Sed placerat posuere nunc vel efficitur. Quisque auctor felis vel lectus dictum fringilla. Quisque vo- lutpat pulvinar© elit. Aliquam ultrices feugiat ali- quam. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Sus- pendisse imperdiet ex lorem, porta bibendum pu- rus ultricies id.",
"Integer vel lacus sapien. Nam sodales ante eu risus facilisis placerat. Aliquam suscipit pulvinar ultricies. Aenean pulvinar, ex ac fermentum egestas, erat nisi feugiat velit, vitae suscipit tellus odio vitae quam. Morbi elementum sem in elit posuere, non", "Integer vel lacus sapien. Nam sodales ante eu risus facilisis placerat. Aliquam suscipit pulvinar ultricies. Aenean pulvinar, ex ac fermentum egestas, erat nisi feugiat velit, vitae suscipit tellus odio vitae quam. Morbi elementum sem in elit posuere, non",
"•","•","•","•", "•","•","•","•",
"Duis leo enim, convallis sit amet orci eget, condimentum mattis mi ; Etiam dolor erat, maximus nec mi sed, con- vallis convallis orci ; Morbi viverra diam in diam cursus, vitae aliquet velit tempus ; Donec at nisi fermentum, ultricies odio eget, egestas massa at nisi fermentum, ul- tricies odio eget, egestas massa.", "Duis leo enim, convallis sit amet orci eget, condimentum mattis mi ; Etiam dolor erat, maximus nec mi sed, con- vallis convallis orci ; Morbi viverra diam in diam cursus, vitae aliquet velit tempus ; Donec at nisi fermentum, ultricies odio eget, egestas massa at nisi fermentum, ul- tricies odio eget, egestas massa.",
"rhoncus magna fringilla. Phasellus cursus in dolor laoreet rutrum. Curabitur tincidunt risus ullamcor- per, vehicula velit at, pulvinar metus.", "rhoncus magna fringilla. Phasellus cursus in dolor laoreet rutrum. Curabitur tincidunt risus ullamcor- per, vehicula velit at, pulvinar metus.",
"Donec quis ante leo. Vivamus pharetra, nisl ac vehi- cula tempor, tellus lacus aliquam sapien, eu congue nibh quam sit amet odio. Quisque metus arcu, sem- per nec consequat eu, pellentesque vel sem. Sed purus risus, tincidunt¹ sit amet dictum vitae, euis- mod id nibh. Praesent ultrices libero quis enim porta, sit amet pellentesque augue pretium. Viva- mus nec molestie nunc. Donec finibus enim nec tel- lus laoreet elementum. Curabitur efficitur placerat dolor et semper.", "Donec quis ante leo. Vivamus pharetra, nisl ac vehi- cula tempor, tellus lacus aliquam sapien, eu congue nibh quam sit amet odio. Quisque metus arcu, sem- per nec consequat eu, pellentesque vel sem. Sed purus risus, tincidunt¹ sit amet dictum vitae, euis- mod id nibh. Praesent ultrices libero quis enim porta, sit amet pellentesque augue pretium. Viva- mus nec molestie nunc. Donec finibus enim nec tel- lus laoreet elementum. Curabitur efficitur placerat dolor et semper.",
"Morbi laoreet dui eu tortor luctus, nec ultrices do- lor ullamcorper. Ut gravida sed nisl a efficitur. In tincidunt orci a condimentum semper. Suspendisse scelerisque fermentum lacinia. Vestibulum sit amet ornare tellus, aliquet euismod mauris. Cras suscipit venenatis ultrices. Sed diam erat, aliquet a tellus ut, viverra 12º ongue magna. Cras id justo tortor. Mauris in tortor vulputate, pellentesque nisl ac, facilisis ligula. Class aptent taciti² sociosqu ad li- tora torquent per conubia nostra³, per inceptos himenaeos. Aliquam eget dolor turpis. Mauris id molestie tellus. Sed elementum molestie nisi, at ali- quet sem vehicula nec. Morbi tempus nulla enim, a vulputate magna €51 luctus £66 eu. Fusce sodales, libero quis suscipit ultrices, metus erat auctor urna, sit amet dictum arcu tortor eu metus.", "Morbi laoreet dui eu tortor luctus, nec ultrices do- lor ullamcorper. Ut gravida sed nisl a efficitur. In tincidunt orci a condimentum semper. Suspendisse scelerisque fermentum lacinia. Vestibulum sit amet ornare tellus, aliquet euismod mauris. Cras suscipit venenatis ultrices. Sed diam erat, aliquet a tellus ut, viverra 12º ongue magna. Cras id justo tortor. Mauris in tortor vulputate, pellentesque nisl ac, facilisis ligula. Class aptent taciti² sociosqu ad li- tora torquent per conubia nostra³, per inceptos himenaeos. Aliquam eget dolor turpis. Mauris id molestie tellus. Sed elementum molestie nisi, at ali- quet sem vehicula nec. Morbi tempus nulla enim, a vulputate magna €51 luctus £66 eu. Fusce sodales, libero quis suscipit ultrices, metus erat auctor urna, sit amet dictum arcu tortor eu metus.",
"Morbi vestibulum varius ipsum nec molestie. Proin auctor efficitur diam ut luctus. Phasellus cursus maximus ultricies. Mauris eu neque ut sem semper tempus. Curabitur non lorem eu nunc lobortis vi- verra at in diam. Pellentesque euismod purus a leo lobortis tempor. Maecenas mollis ligula at sem sus- cipit fringilla. Mauris sollicitudin tincidunt lectus id tempor. Etiam ut nisi est.", "Morbi vestibulum varius ipsum nec molestie. Proin auctor efficitur diam ut luctus. Phasellus cursus maximus ultricies. Mauris eu neque ut sem semper tempus. Curabitur non lorem eu nunc lobortis vi- verra at in diam. Pellentesque euismod purus a leo lobortis tempor. Maecenas mollis ligula at sem sus- cipit fringilla. Mauris sollicitudin tincidunt lectus id tempor. Etiam ut nisi est.",
"1. Ut volutpat, velit at interdum consectetur, nisl lorem consequat mauris, feugiat dignissim tellus massa ut nisl. 2. Praesent at est nisi. Pellentesque rutrum lorem sed dui accumsan gravida. 3. Pellentesque dictum nisl vitae urna luctus, congue pulvinar mi congue.", "1. Ut volutpat, velit at interdum consectetur, nisl lorem consequat mauris, feugiat dignissim tellus massa ut nisl. 2. Praesent at est nisi. Pellentesque rutrum lorem sed dui accumsan gravida. 3. Pellentesque dictum nisl vitae urna luctus, congue pulvinar mi congue.",
} }
}, },
new object[] new object[]
{ {
"no vertical distance.pdf", "no vertical distance.pdf",
new string[] new string[]
{ {
"Documents second line left aligned.", "Documents second line left aligned.",
"Documents first line right aligned." "Documents first line right aligned."
} }
}, },
new object[] new object[]
{ {
"no horizontal distance.pdf", "no horizontal distance.pdf",
new string[] new string[]
{ {
"First. Second." "First. Second."
} }
} }
}; };
[SkippableTheory] [SkippableTheory]
[MemberData(nameof(DataExtract))] [MemberData(nameof(DataExtract))]
public void GetBlocks(string name, string[] expected) public void GetBlocks(string name, string[] expected)
{ {
if (name == "90 180 270 rotated.pdf") if (name == "90 180 270 rotated.pdf")
{ {
// The 'TimesNewRomanPSMT' font is used by this particular document. Thus, results cannot be trusted on // The 'TimesNewRomanPSMT' font is used by this particular document. Thus, results cannot be trusted on
// platforms where this font isn't generally available (e.g. OSX, Linux, etc.), so we skip it! // platforms where this font isn't generally available (e.g. OSX, Linux, etc.), so we skip it!
var font = SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT"); var font = SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT");
Skip.If(font == null, "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment."); Skip.If(font == null, "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment.");
} }
var options = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions() { LineSeparator = " " }; var options = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions() { LineSeparator = " " };
using (var document = PdfDocument.Open(DlaHelper.GetDocumentPath(name))) using (var document = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
{ {
var page = document.GetPage(1); var page = document.GetPage(1);
var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters); var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);
var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(words, options); var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(words, options);
Assert.Equal(expected.Length, blocks.Count); Assert.Equal(expected.Length, blocks.Count);
var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X) var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X)
.ThenByDescending(b => b.BoundingBox.BottomLeft.Y).ToList(); .ThenByDescending(b => b.BoundingBox.BottomLeft.Y).ToList();
for (int i = 0; i < orderedBlocks.Count; i++) for (int i = 0; i < orderedBlocks.Count; i++)
{ {
Assert.Equal(expected[i], orderedBlocks[i].Text); Assert.Equal(expected[i], orderedBlocks[i].Text);
} }
} }
} }
}
/// <summary>
/// Runs the Docstrum steps one by one through the public static methods of
/// <see cref="DocstrumBoundingBoxes"/> (spacing estimation, text lines, structural blocks)
/// and checks the number of blocks and their text against the expected values.
/// </summary>
/// <param name="name">The name of the test document to open.</param>
/// <param name="expected">The expected block texts, in the order produced by sorting the blocks by
/// bottom-left X (ascending) and then bottom-left Y (descending).</param>
[SkippableTheory]
[MemberData(nameof(DataExtract))]
public void GetBlocksStatic(string name, string[] expected)
{
    if (name == "90 180 270 rotated.pdf")
    {
        // This document relies on the 'TimesNewRomanPSMT' font. On platforms where that font is
        // usually missing (e.g. OSX, Linux, etc.) the results cannot be trusted, so the case is skipped.
        Skip.If(SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT") == null,
            "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment.");
    }

    var options = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions() { LineSeparator = " " };

    using (var document = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
    {
        var firstPage = document.GetPage(1);
        var extractedWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters).ToList();

        // Docstrum using static methods.
        // White space words are filtered out before running the steps.
        var words = extractedWords.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();

        // Step 1: estimate the within-line and between-line spacing.
        bool spacingIsValid = DocstrumBoundingBoxes.GetSpacingEstimation(
            words,
            options.WithinLineBounds, options.WithinLineBinSize,
            options.BetweenLineBounds, options.BetweenLineBinSize,
            options.MaxDegreeOfParallelism,
            out double withinLineDistance, out double betweenLineDistance);

        if (!spacingIsValid)
        {
            // A NaN estimate is replaced by 0 so the multipliers below yield 0 rather than NaN.
            withinLineDistance = double.IsNaN(withinLineDistance) ? 0 : withinLineDistance;
            betweenLineDistance = double.IsNaN(betweenLineDistance) ? 0 : betweenLineDistance;
        }

        // Step 2: determination of text lines.
        double maxWithinLineDistance = options.WithinLineMultiplier * withinLineDistance;
        var lines = DocstrumBoundingBoxes.GetLines(
            words,
            maxWithinLineDistance,
            options.WithinLineBounds,
            options.WordSeparator,
            options.MaxDegreeOfParallelism).ToArray();

        // Step 3: structural block determination.
        double maxBetweenLineDistance = options.BetweenLineMultiplier * betweenLineDistance;
        var blocks = DocstrumBoundingBoxes.GetStructuralBlocks(
            lines,
            maxBetweenLineDistance,
            options.AngularDifferenceBounds,
            options.Epsilon,
            options.LineSeparator,
            options.MaxDegreeOfParallelism).ToList();

        Assert.Equal(expected.Length, blocks.Count);

        // Blocks are compared in a fixed order: left to right, then top to bottom.
        var orderedBlocks = blocks
            .OrderBy(b => b.BoundingBox.BottomLeft.X)
            .ThenByDescending(b => b.BoundingBox.BottomLeft.Y)
            .ToList();

        int index = 0;
        foreach (var block in orderedBlocks)
        {
            Assert.Equal(expected[index], block.Text);
            index++;
        }
    }
}
}
} }