diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 4a8f7575..4e4d643a 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -91,6 +91,7 @@
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
"UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.WhitespaceCoverExtractor",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
"UglyToad.PdfPig.Export.AltoXmlTextExporter",
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
index 97231d9f..5b1740d3 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
@@ -22,11 +22,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.
/// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.
/// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.
+ /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
+ /// A positive value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.</param>
internal static IEnumerable> ClusterNearestNeighbours(List elements,
Func distMeasure,
Func maxDistanceFunction,
Func pivotPoint, Func candidatesPoint,
- Func filterPivot, Func filterFinal)
+ Func filterPivot, Func filterFinal,
+ int maxDegreeOfParallelism)
{
/*************************************************************************************
* Algorithm steps
@@ -48,8 +52,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
int[] indexes = Enumerable.Repeat((int)-1, elements.Count).ToArray();
var candidatesPoints = elements.Select(candidatesPoint).ToList();
+ ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
+
// 1. Find nearest neighbours indexes
- Parallel.For(0, elements.Count, e =>
+ Parallel.For(0, elements.Count, parallelOptions, e =>
{
var pivot = elements[e];
@@ -85,11 +91,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.
/// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.
/// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.
+ /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
+ /// A positive value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.</param>
internal static IEnumerable> ClusterNearestNeighbours(T[] elements,
Func distMeasure,
Func maxDistanceFunction,
Func pivotPoint, Func candidatesPoint,
- Func filterPivot, Func filterFinal)
+ Func filterPivot, Func filterFinal,
+ int maxDegreeOfParallelism)
{
/*************************************************************************************
* Algorithm steps
@@ -111,8 +121,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
var candidatesPoints = elements.Select(candidatesPoint).ToList();
+ ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
+
// 1. Find nearest neighbours indexes
- Parallel.For(0, elements.Length, e =>
+ Parallel.For(0, elements.Length, parallelOptions, e =>
{
var pivot = elements[e];
@@ -148,11 +160,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// The candidates' line to use for pairing.
/// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.
/// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.
+ /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
+ /// A positive value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.</param>
internal static IEnumerable> ClusterNearestNeighbours(T[] elements,
Func distMeasure,
Func maxDistanceFunction,
Func pivotLine, Func candidatesLine,
- Func filterPivot, Func filterFinal)
+ Func filterPivot, Func filterFinal,
+ int maxDegreeOfParallelism)
{
/*************************************************************************************
* Algorithm steps
@@ -174,8 +190,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
var candidatesLines = elements.Select(x => candidatesLine(x)).ToList();
+ ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
+
// 1. Find nearest neighbours indexes
- Parallel.For(0, elements.Length, e =>
+ Parallel.For(0, elements.Length, parallelOptions, e =>
{
var pivot = elements[e];
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs
index 4dc02c1e..136265f6 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs
@@ -28,11 +28,25 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// Get the blocks.
/// Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.
///
- ///
- ///
- public IReadOnlyList GetBlocks(IEnumerable pageWords)
+ /// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
+ /// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
+ public IReadOnlyList GetBlocks(IEnumerable words)
{
- return GetBlocks(pageWords, new AngleBounds(-30, 30), new AngleBounds(-135, -45), 1.3);
+ return GetBlocks(words, -1);
+ }
+
+ /// <summary>
+ /// Get the blocks.
+ /// Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.
+ /// </summary>
+ /// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
+ /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
+ /// A positive value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.</param>
+ /// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
+ public IReadOnlyList GetBlocks(IEnumerable words, int maxDegreeOfParallelism)
+ {
+ return GetBlocks(words, new AngleBounds(-30, 30), new AngleBounds(-135, -45), 1.3, maxDegreeOfParallelism);
}
///
@@ -46,8 +60,26 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// distance found by the analysis.
/// The s generated by the document spectrum method.
public IReadOnlyList GetBlocks(IEnumerable words, AngleBounds withinLine,
- AngleBounds betweenLine,
- double betweenLineMultiplier)
+ AngleBounds betweenLine, double betweenLineMultiplier)
+ {
+ return GetBlocks(words, withinLine, betweenLine, betweenLineMultiplier, -1);
+ }
+
+ /// <summary>
+ /// Get the blocks. See original paper for more information.
+ /// </summary>
+ /// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
+ /// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
+ /// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
+ /// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
+ /// text lines for blocking. Maximum distance will be this number times the between-line
+ /// distance found by the analysis.</param>
+ /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
+ /// A positive value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.</param>
+ /// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
+ public IReadOnlyList GetBlocks(IEnumerable words, AngleBounds withinLine,
+ AngleBounds betweenLine, double betweenLineMultiplier, int maxDegreeOfParallelism)
{
if (words == null)
{
@@ -74,8 +106,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
var withinLineDistList = new ConcurrentBag();
var betweenLineDistList = new ConcurrentBag();
+ ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
+
// 1. Estimate in line and between line spacing
- Parallel.For(0, wordsList.Count, i =>
+ Parallel.For(0, wordsList.Count, parallelOptions, i =>
{
var word = wordsList[i];
@@ -107,16 +141,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
if (withinLineDistance == null || betweenLineDistance == null)
{
- return new[] {new TextBlock(new[] {new TextLine(wordsList)})};
+ return new[] { new TextBlock(new[] { new TextLine(wordsList) }) };
}
// 2. Find lines of text
double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
- var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine).ToArray();
+ var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine, maxDegreeOfParallelism).ToArray();
// 3. Find blocks of text
double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
- var blocks = GetLinesGroups(lines, maxDistanceBetweenLine).ToList();
+ var blocks = GetLinesGroups(lines, maxDistanceBetweenLine, maxDegreeOfParallelism).ToList();
// 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
for (var b = 0; b < blocks.Count; b++)
@@ -128,7 +162,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
// Merge all lines (words)
blocks[b] = new TextBlock(GetLines(blocks[b].TextLines.SelectMany(l => l.Words).ToList(),
- double.MaxValue, withinLine).ToList());
+ double.MaxValue, withinLine, maxDegreeOfParallelism).ToList());
for (var c = 0; c < blocks.Count; c++)
{
@@ -147,7 +181,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
// 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
// same block. Filtering will still be done based on angle.
// Merge all lines (words) sharing same bottom (baseline)
- var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine).ToList();
+ var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList();
blocks[b] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList());
// Remove
@@ -207,7 +241,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
return finalDistanceMeasure(pointR, wordsWithinAngleBoundDistancePoints[closestWordIndex]);
}
- private static IEnumerable GetLines(List words, double maxDist, AngleBounds withinLine)
+ private static IEnumerable GetLines(List words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism)
{
TextDirection textDirection = words[0].TextDirection;
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, Distances.Euclidean,
@@ -220,7 +254,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
return (withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper);
- }).ToList();
+ },
+ maxDegreeOfParallelism).ToList();
Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
if (textDirection == TextDirection.Rotate180)
@@ -242,7 +277,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
}
}
- private static IEnumerable GetLinesGroups(TextLine[] lines, double maxDist)
+ private static IEnumerable GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism)
{
/**************************************************************************************************
* We want to measure the distance between two lines using the following method:
@@ -269,7 +304,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
(pivot, candidate) => maxDist,
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
- pivot => true, (pivot, candidate) => true).ToList();
+ pivot => true, (pivot, candidate) => true,
+ maxDegreeOfParallelism).ToList();
for (int a = 0; a < groupedIndexes.Count(); a++)
{
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
index 62e212da..559e4228 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
@@ -16,7 +16,14 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
///
/// Create an instance of Nearest Neighbour Word Extractor, .
///
- public static IWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
+ public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
+
+ /// <summary>
+ /// Gets or sets the maximum number of concurrent tasks enabled. Default value is -1.
+ /// A positive property value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.
+ /// </summary>
+ public int MaxDegreeOfParallelism { get; set; } = -1;
///
/// Gets the words.
@@ -27,14 +34,14 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
List wordsH = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Horizontal),
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2,
- Distances.Manhattan)
+ Distances.Manhattan, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Bottom)
.ThenBy(x => x.BoundingBox.Left).ToList();
List words180 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate180),
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2,
- Distances.Manhattan)
+ Distances.Manhattan, MaxDegreeOfParallelism)
.OrderBy(x => x.BoundingBox.Top)
.ThenByDescending(x => x.BoundingBox.Right).ToList();
wordsH.AddRange(words180);
@@ -42,7 +49,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
List words90 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate90),
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Height), (double)(l2.GlyphRectangle.Height)) * 0.2,
- Distances.Manhattan)
+ Distances.Manhattan, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Left)
.ThenBy(x => x.BoundingBox.Top).ToList();
wordsH.AddRange(words90);
@@ -50,7 +57,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
List words270 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate270),
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Height), (double)(l2.GlyphRectangle.Height)) * 0.2,
- Distances.Manhattan)
+ Distances.Manhattan, MaxDegreeOfParallelism)
.OrderBy(x => x.BoundingBox.Right)
.ThenByDescending(x => x.BoundingBox.Bottom).ToList();
wordsH.AddRange(words270);
@@ -58,7 +65,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
List wordsU = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Unknown),
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2,
- Distances.Manhattan)
+ Distances.Manhattan, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Bottom)
.ThenBy(x => x.BoundingBox.Left).ToList();
wordsH.AddRange(wordsU);
@@ -75,15 +82,19 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// e.g. Max(GlyphRectangle.Width) x 20%.
/// The distance measure between two start and end base line points,
/// e.g. the Manhattan distance.
+ /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
+ /// A positive value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.</param>
private List GetWords(IEnumerable pageLetters,
- Func maxDistanceFunction, Func distMeasure)
+ Func maxDistanceFunction, Func distMeasure,
+ int maxDegreeOfParallelism)
{
if (pageLetters == null || pageLetters.Count() == 0) return new List();
TextDirection textDirection = pageLetters.ElementAt(0).TextDirection;
if (pageLetters.Any(x => textDirection != x.TextDirection))
{
- throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction.");
+ throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction.");
}
Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();
@@ -106,7 +117,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
distMeasure, maxDistanceFunction,
l => l.EndBaseLine, l => l.StartBaseLine,
l => !string.IsNullOrWhiteSpace(l.Value),
- (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value)).ToList();
+ (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value),
+ maxDegreeOfParallelism).ToList();
List words = new List();
for (int a = 0; a < groupedIndexes.Count(); a++)
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
index 83fc7661..e8a3806c 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
@@ -29,7 +29,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
///
/// The words in the page.
/// The minimum number of elements to define a text edge.
- public static IReadOnlyDictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4)
+ /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
+ /// A positive value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.</param>
+ public static IReadOnlyDictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4,
+ int maxDegreeOfParallelism = -1)
{
if (minimumElements < 0)
{
@@ -40,7 +44,9 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
ConcurrentDictionary> dictionary = new ConcurrentDictionary>();
- Parallel.ForEach(edgesFuncs, f =>
+ ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
+
+ Parallel.ForEach(edgesFuncs, parallelOptions, f =>
{
dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
});
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs
new file mode 100644
index 00000000..dd9cf6e8
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs
@@ -0,0 +1,400 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+using UglyToad.PdfPig.Util;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+ /// <summary>
+ /// A top-down algorithm that finds a cover of the background whitespace of a document in terms of maximal empty rectangles.
+ /// See Section 3.2 of 'High precision text extraction from PDF documents' by Øyvind Raddum Berg and Section 2 of 'Two geometric algorithms for layout analysis' by Thomas M. Breuel.
+ /// </summary>
+ public static class WhitespaceCoverExtractor
+ {
+        /// <summary>
+        /// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
+        /// </summary>
+        /// <param name="words">The words in the page.</param>
+        /// <param name="images">The images in the page.</param>
+        /// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
+        /// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
+        /// <returns>The identified whitespace rectangles.</returns>
+        public static IReadOnlyList GetWhitespaces(IEnumerable words, IEnumerable images = null, int maxRectangleCount = 40, int maxBoundQueueSize = 0)
+        {
+            if (words == null) throw new ArgumentNullException(nameof(words));
+            // Materialise once: lower bounds are 1.25x the Mode() of the glyph widths/heights, and 'words' used to be re-enumerated for each.
+            var wordList = words.ToList();
+            var glyphs = wordList.SelectMany(w => w.Letters).Select(l => l.GlyphRectangle).ToList();
+            return GetWhitespaces(wordList, images, glyphs.Select(g => g.Width).Mode() * 1.25m, glyphs.Select(g => g.Height).Mode() * 1.25m,
+                maxRectangleCount: maxRectangleCount, maxBoundQueueSize: maxBoundQueueSize);
+        }
+
+        /// <summary>
+        /// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
+        /// </summary>
+        /// <param name="words">The words in the page.</param>
+        /// <param name="images">The images in the page.</param>
+        /// <param name="minWidth">Lower bounds for the width of rectangles.</param>
+        /// <param name="minHeight">Lower bounds for the height of rectangles.</param>
+        /// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
+        /// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
+        /// surrounding obstacles by some percent. Default value is 15%.</param>
+        /// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
+        /// <returns>The identified whitespace rectangles.</returns>
+        public static IReadOnlyList GetWhitespaces(IEnumerable words, IEnumerable images,
+            decimal minWidth, decimal minHeight, int maxRectangleCount = 40, decimal whitespaceFuzziness = 0.15m, int maxBoundQueueSize = 0)
+        {
+            var bboxes = words.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0)
+                              .Select(w => w.BoundingBox).ToList();
+            // A null check is enough: AddRange accepts an empty sequence, whereas Count() > 0 enumerated the images an extra time.
+            if (images != null)
+            {
+                bboxes.AddRange(images.Where(i => i.Bounds.Width > 0 && i.Bounds.Height > 0).Select(i => i.Bounds));
+            }
+
+            return GetWhitespaces(bboxes,
+                minWidth: minWidth,
+                minHeight: minHeight,
+                maxRectangleCount: maxRectangleCount,
+                whitespaceFuzziness: whitespaceFuzziness,
+                maxBoundQueueSize: maxBoundQueueSize);
+        }
+
+        /// <summary>
+        /// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
+        /// </summary>
+        /// <param name="boundingboxes">The list of obstacles' bounding boxes in the page.</param>
+        /// <param name="minWidth">Lower bounds for the width of rectangles.</param>
+        /// <param name="minHeight">Lower bounds for the height of rectangles.</param>
+        /// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
+        /// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
+        /// surrounding obstacles by some percent. Default value is 15%.</param>
+        /// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
+        /// <returns>The identified whitespace rectangles.</returns>
+        public static IReadOnlyList GetWhitespaces(IEnumerable boundingboxes,
+            decimal minWidth, decimal minHeight, int maxRectangleCount = 40, decimal whitespaceFuzziness = 0.15m, int maxBoundQueueSize = 0)
+        {
+            if (boundingboxes == null) throw new ArgumentNullException(nameof(boundingboxes));
+
+            // Copy first, then test the copy: calling Count() before copying enumerated the input twice.
+            var obstacles = new HashSet(boundingboxes);
+            if (obstacles.Count == 0) return EmptyArray.Instance;
+
+            return GetMaximalRectangles(GetBound(obstacles),
+                obstacles,
+                minWidth: minWidth, minHeight: minHeight,
+                maxRectangleCount: maxRectangleCount, whitespaceFuzziness: whitespaceFuzziness,
+                maxBoundQueueSize: maxBoundQueueSize);
+        }
+
+        /// <summary>
+        /// Best-first search: the best ranked candidate is accepted when it is empty enough and touches the page border or an
+        /// accepted rectangle, otherwise it is split around a pivot obstacle. Accepted rectangles are added to <paramref name="obstacles"/>.
+        /// </summary>
+        private static IReadOnlyList GetMaximalRectangles(PdfRectangle bound,
+            HashSet obstacles, decimal minWidth, decimal minHeight, int maxRectangleCount,
+            decimal whitespaceFuzziness, int maxBoundQueueSize)
+        {
+            QueueEntries queueEntries = new QueueEntries(maxBoundQueueSize);
+            queueEntries.Enqueue(new QueueEntry(bound, obstacles, whitespaceFuzziness));
+
+            HashSet selected = new HashSet();
+            HashSet holdList = new HashSet();
+
+            while (queueEntries.Any())
+            {
+                var current = queueEntries.Dequeue();
+                if (current.IsEmptyEnough(obstacles))
+                {
+                    if (selected.Any(c => Inside(c, current.Bound))) continue;
+
+                    // A check was added which impeded the algorithm from accepting
+                    // rectangles which were not adjacent to an already accepted
+                    // rectangle, or to the border of the page.
+                    if (!IsAdjacentToPageBounds(bound, current.Bound) && // NOT in contact to border page AND
+                        !selected.Any(q => IsAdjacentTo(q, current.Bound))) // NOT in contact to any already accepted rectangle
+                    {
+                        // In order to maintain the correctness of the algorithm,
+                        // rejected rectangles are put in a hold list.
+                        holdList.Add(current);
+                        continue;
+                    }
+
+                    selected.Add(current.Bound);
+                    if (selected.Count >= maxRectangleCount) return selected.ToList();
+                    obstacles.Add(current.Bound);
+                    // An accepted entry must not be retried: re-queuing it yields nothing new and may evict a live entry from a bounded queue.
+                    holdList.Remove(current);
+
+                    // Each time a new rectangle is identified and accepted, this hold list
+                    // will be added back to the queue in case any of them will have become valid.
+                    foreach (var hold in holdList)
+                    {
+                        queueEntries.Enqueue(hold);
+                    }
+
+                    // After a maximal rectangle has been found, it is added back to the list
+                    // of obstacles. Whenever a QueueEntry is dequeued, its list of obstacles
+                    // can be recomputed to include newly identified whitespace rectangles.
+                    foreach (var overlapping in queueEntries)
+                    {
+                        if (OverlapsHard(current.Bound, overlapping.Bound))
+                            overlapping.AddWhitespace(current.Bound);
+                    }
+
+                    continue;
+                }
+
+                var pivot = current.GetPivot();
+                var b = current.Bound;
+                var rightOfPivot = new PdfRectangle(pivot.Right, b.Bottom, b.Right, b.Top);
+                if (b.Right > pivot.Right && rightOfPivot.Height > minHeight && rightOfPivot.Width > minWidth)
+                {
+                    queueEntries.Enqueue(new QueueEntry(rightOfPivot,
+                        new HashSet(current.Obstacles.Where(o => OverlapsHard(rightOfPivot, o))),
+                        whitespaceFuzziness));
+                }
+
+                var leftOfPivot = new PdfRectangle(b.Left, b.Bottom, pivot.Left, b.Top);
+                if (b.Left < pivot.Left && leftOfPivot.Height > minHeight && leftOfPivot.Width > minWidth)
+                {
+                    queueEntries.Enqueue(new QueueEntry(leftOfPivot,
+                        new HashSet(current.Obstacles.Where(o => OverlapsHard(leftOfPivot, o))),
+                        whitespaceFuzziness));
+                }
+
+                var belowPivot = new PdfRectangle(b.Left, b.Bottom, b.Right, pivot.Bottom);
+                if (b.Bottom < pivot.Bottom && belowPivot.Height > minHeight && belowPivot.Width > minWidth)
+                {
+                    queueEntries.Enqueue(new QueueEntry(belowPivot,
+                        new HashSet(current.Obstacles.Where(o => OverlapsHard(belowPivot, o))),
+                        whitespaceFuzziness));
+                }
+
+                var abovePivot = new PdfRectangle(b.Left, pivot.Top, b.Right, b.Top);
+                if (b.Top > pivot.Top && abovePivot.Height > minHeight && abovePivot.Width > minWidth)
+                {
+                    queueEntries.Enqueue(new QueueEntry(abovePivot,
+                        new HashSet(current.Obstacles.Where(o => OverlapsHard(abovePivot, o))),
+                        whitespaceFuzziness));
+                }
+            }
+
+            return selected.ToList();
+        }
+
+        /// <summary>
+        /// Whether the two rectangles touch: they must not be separated on either axis and at least
+        /// one pair of opposite edges must have exactly the same coordinate.
+        /// </summary>
+        private static bool IsAdjacentTo(PdfRectangle rectangle1, PdfRectangle rectangle2)
+        {
+            // Apart horizontally or vertically: the rectangles cannot be adjacent.
+            bool separated = rectangle1.Left > rectangle2.Right ||
+                             rectangle2.Left > rectangle1.Right ||
+                             rectangle1.Top < rectangle2.Bottom ||
+                             rectangle2.Top < rectangle1.Bottom;
+            if (separated) return false;
+
+            // Opposite edges sharing a coordinate.
+            return rectangle1.Left == rectangle2.Right ||
+                   rectangle1.Right == rectangle2.Left ||
+                   rectangle1.Bottom == rectangle2.Top ||
+                   rectangle1.Top == rectangle2.Bottom;
+        }
+
+        /// <summary>
+        /// Whether at least one edge of the rectangle has exactly the same coordinate
+        /// as the corresponding edge of the page bound.
+        /// </summary>
+        private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle)
+        {
+            // Each edge is compared only with the same-named edge of the page bound.
+            return rectangle.Bottom == pageBound.Bottom ||
+                   rectangle.Top == pageBound.Top ||
+                   rectangle.Left == pageBound.Left ||
+                   rectangle.Right == pageBound.Right;
+        }
+
+        /// <summary>
+        /// Whether the interiors of the two rectangles overlap; rectangles that merely share an edge do not.
+        /// </summary>
+        private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2)
+        {
+            bool apartOrTouching = rectangle1.Left >= rectangle2.Right ||
+                                   rectangle2.Left >= rectangle1.Right ||
+                                   rectangle1.Top <= rectangle2.Bottom ||
+                                   rectangle2.Top <= rectangle1.Bottom;
+
+            return !apartOrTouching;
+        }
+
+        /// <summary>
+        /// Whether <paramref name="rectangle2"/> lies within <paramref name="rectangle1"/>; coinciding edges count as inside.
+        /// </summary>
+        private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2)
+        {
+            return rectangle2.Right <= rectangle1.Right &&
+                   rectangle2.Left >= rectangle1.Left &&
+                   rectangle2.Top <= rectangle1.Top &&
+                   rectangle2.Bottom >= rectangle1.Bottom;
+        }
+
+        /// <summary>Gets the rectangle spanning the smallest left/bottom and the largest right/top of the obstacles.</summary>
+        private static PdfRectangle GetBound(IEnumerable obstacles)
+        {
+            return new PdfRectangle(obstacles.Select(o => o.Left).Min(),
+                                    obstacles.Select(o => o.Bottom).Min(),
+                                    obstacles.Select(o => o.Right).Max(),
+                                    obstacles.Select(o => o.Top).Max());
+        }
+
+ #region Sorted Queue
+        /// <summary>Priority queue of candidate entries kept in sorted order; a positive bound caps its size.</summary>
+        private class QueueEntries : SortedSet
+        {
+            private readonly int bound;
+
+            public QueueEntries(int maximumBound)
+            {
+                this.bound = maximumBound;
+            }
+
+            /// <summary>Removes and returns the greatest entry of the set.</summary>
+            public QueueEntry Dequeue()
+            {
+                var best = this.Max;
+                this.Remove(best);
+                return best;
+            }
+
+            /// <summary>Adds an entry; when bounded and already full, the smallest entry is evicted first (was 'Count > bound', which allowed bound + 1).</summary>
+            public void Enqueue(QueueEntry queueEntry)
+            {
+                if (this.bound > 0 && this.Count >= this.bound) this.Remove(this.Min);
+                this.Add(queueEntry);
+            }
+        }
+
+        /// <summary>
+        /// A candidate rectangle together with the obstacles considered for it, ranked by <see cref="Quality"/>.
+        /// </summary>
+        private class QueueEntry : IComparable
+        {
+            private readonly decimal whitespaceFuzziness;
+
+            public QueueEntry(PdfRectangle bound, HashSet obstacles, decimal whitespaceFuzziness)
+            {
+                this.Bound = bound;
+                this.Quality = ScoringFunction(bound);
+                this.Obstacles = obstacles;
+                this.whitespaceFuzziness = whitespaceFuzziness;
+            }
+
+            /// <summary>The candidate whitespace rectangle.</summary>
+            public PdfRectangle Bound { get; }
+
+            /// <summary>The score of <see cref="Bound"/>, computed once at construction.</summary>
+            public decimal Quality { get; }
+
+            /// <summary>The obstacles considered for this candidate; the set is held by reference, not copied.</summary>
+            public HashSet Obstacles { get; }
+
+            /// <summary>
+            /// Gets the obstacle selected by Distances.FindIndexNearest (Euclidean distance between centroids), or the first one if it returns -1.
+            /// </summary>
+            public PdfRectangle GetPivot()
+            {
+                // Snapshot the set so that the returned index is resolved against the same ordering it was computed on.
+                var candidates = Obstacles.ToList();
+                int nearest = Distances.FindIndexNearest(Bound.Centroid,
+                    candidates.Select(o => o.Centroid).ToList(),
+                    p => p, p => p, Distances.Euclidean, out double _);
+
+                return nearest == -1 ? candidates.First() : candidates[nearest];
+            }
+
+            /// <summary>Whether no obstacle is associated with this candidate.</summary>
+            public bool IsEmptyEnough() => !Obstacles.Any();
+
+            /// <summary>
+            /// Whether this candidate has no obstacles, or every page obstacle intersects it by no more than the fuzziness
+            /// share of the smaller of the two areas and the summed overlap is below that share of the candidate's area.
+            /// </summary>
+            /// <param name="pageObstacles">The obstacles to test the candidate against.</param>
+            public bool IsEmptyEnough(IEnumerable pageObstacles)
+            {
+                if (IsEmptyEnough()) return true;
+
+                decimal sum = 0;
+                foreach (var obstacle in pageObstacles)
+                {
+                    var intersect = Bound.Intersect(obstacle);
+                    // NOTE(review): an obstacle that does not intersect the candidate rejects it outright; skipping it
+                    // ('continue') may have been the intent. Behaviour deliberately left unchanged - please confirm.
+                    if (!intersect.HasValue) return false;
+
+                    decimal minimumArea = MinimumOverlappingArea(obstacle, Bound, whitespaceFuzziness);
+
+                    if (intersect.Value.Area > minimumArea)
+                    {
+                        return false;
+                    }
+                    sum += intersect.Value.Area;
+                }
+                return sum < Bound.Area * whitespaceFuzziness;
+            }
+
+            public override string ToString()
+            {
+                return "Q=" + Quality.ToString("#0.0") + ", O=" + Obstacles.Count + ", " + Bound.ToString();
+            }
+
+            /// <summary>Adds a rectangle (an accepted whitespace) to <see cref="Obstacles"/>.</summary>
+            public void AddWhitespace(PdfRectangle rectangle) => Obstacles.Add(rectangle);
+
+            /// <summary>
+            /// Orders entries by <see cref="Quality"/>, then by the edges of <see cref="Bound"/>. The tie-break is required
+            /// because a sorted set treats entries that compare equal as duplicates: ordering by score alone made it
+            /// silently drop distinct rectangles that happened to have the same score.
+            /// </summary>
+            public int CompareTo(QueueEntry entry)
+            {
+                if (entry is null) return 1;
+
+                int order = Quality.CompareTo(entry.Quality);
+                if (order == 0) order = Bound.Left.CompareTo(entry.Bound.Left);
+                if (order == 0) order = Bound.Bottom.CompareTo(entry.Bound.Bottom);
+                if (order == 0) order = Bound.Right.CompareTo(entry.Bound.Right);
+                if (order == 0) order = Bound.Top.CompareTo(entry.Bound.Top);
+                return order;
+            }
+
+            /// <summary>Entries are equal when their bounds match edge for edge and they share the same obstacle set instance.</summary>
+            public override bool Equals(object obj)
+            {
+                return obj is QueueEntry entry &&
+                       Bound.Left == entry.Bound.Left && Bound.Right == entry.Bound.Right &&
+                       Bound.Top == entry.Bound.Top && Bound.Bottom == entry.Bound.Bottom &&
+                       Obstacles == entry.Obstacles;
+            }
+
+            public override int GetHashCode() => (Bound.Left, Bound.Right, Bound.Top, Bound.Bottom, Obstacles).GetHashCode();
+
+            private static decimal MinimumOverlappingArea(PdfRectangle r1, PdfRectangle r2, decimal whitespaceFuzziness) => Math.Min(r1.Area, r2.Area) * whitespaceFuzziness;
+
+            /// <summary>The scoring function Q(r) which is subsequently used to sort a priority queue.</summary>
+            private static decimal ScoringFunction(PdfRectangle rectangle)
+            {
+                // As can be seen, tall rectangles are preferred. The trick while choosing this Q(r) was
+                // to keep that preference while still allowing wide rectangles to be chosen. After having
+                // experimented with quite a few variations, this simple function was considered a good
+                // solution.
+                return rectangle.Area * (rectangle.Height / 4m);
+            }
+        }
+ #endregion
+ }
+}