merge pull request #105 from BobLd/master

whitespace covering algorithm and #104
This commit is contained in:
Eliot Jones
2019-12-20 11:57:31 +00:00
committed by GitHub
6 changed files with 506 additions and 33 deletions

View File

@@ -91,6 +91,7 @@
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
"UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
"UglyToad.PdfPig.DocumentLayoutAnalysis.WhitespaceCoverExtractor",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
"UglyToad.PdfPig.Export.AltoXmlTextExporter",

View File

@@ -22,11 +22,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(List<T> elements,
Func<PdfPoint, PdfPoint, double> distMeasure,
Func<T, T, double> maxDistanceFunction,
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
int maxDegreeOfParallelism)
{
/*************************************************************************************
* Algorithm steps
@@ -48,8 +52,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
int[] indexes = Enumerable.Repeat((int)-1, elements.Count).ToArray();
var candidatesPoints = elements.Select(candidatesPoint).ToList();
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
// 1. Find nearest neighbours indexes
Parallel.For(0, elements.Count, e =>
Parallel.For(0, elements.Count, parallelOptions, e =>
{
var pivot = elements[e];
@@ -85,11 +91,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(T[] elements,
Func<PdfPoint, PdfPoint, double> distMeasure,
Func<T, T, double> maxDistanceFunction,
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
int maxDegreeOfParallelism)
{
/*************************************************************************************
* Algorithm steps
@@ -111,8 +121,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
var candidatesPoints = elements.Select(candidatesPoint).ToList();
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
// 1. Find nearest neighbours indexes
Parallel.For(0, elements.Length, e =>
Parallel.For(0, elements.Length, parallelOptions, e =>
{
var pivot = elements[e];
@@ -148,11 +160,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(T[] elements,
Func<PdfLine, PdfLine, double> distMeasure,
Func<T, T, double> maxDistanceFunction,
Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
int maxDegreeOfParallelism)
{
/*************************************************************************************
* Algorithm steps
@@ -174,8 +190,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
var candidatesLines = elements.Select(x => candidatesLine(x)).ToList();
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
// 1. Find nearest neighbours indexes
Parallel.For(0, elements.Length, e =>
Parallel.For(0, elements.Length, parallelOptions, e =>
{
var pivot = elements[e];

View File

@@ -28,11 +28,25 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// Get the blocks.
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
/// </summary>
/// <param name="pageWords"></param>
/// <returns></returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{
return GetBlocks(pageWords, new AngleBounds(-30, 30), new AngleBounds(-135, -45), 1.3);
return GetBlocks(words, -1);
}
/// <summary>
/// Segments the words into blocks using the default document spectrum settings.
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, int maxDegreeOfParallelism)
{
    // Default settings, named for readability and forwarded to the fully parameterised overload.
    var defaultWithinLine = new AngleBounds(-30, 30);
    var defaultBetweenLine = new AngleBounds(-135, -45);
    const double defaultBetweenLineMultiplier = 1.3;

    return GetBlocks(words, defaultWithinLine, defaultBetweenLine, defaultBetweenLineMultiplier, maxDegreeOfParallelism);
}
/// <summary>
@@ -46,8 +60,26 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// distance found by the analysis.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
AngleBounds betweenLine,
double betweenLineMultiplier)
AngleBounds betweenLine, double betweenLineMultiplier)
{
return GetBlocks(words, withinLine, betweenLine, betweenLineMultiplier, -1);
}
/// <summary>
/// Get the blocks. See original paper for more information.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
/// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
/// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
/// text lines for blocking. Maximum distance will be this number times the between-line
/// distance found by the analysis.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
AngleBounds betweenLine, double betweenLineMultiplier, int maxDegreeOfParallelism)
{
if (words == null)
{
@@ -74,8 +106,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
var withinLineDistList = new ConcurrentBag<double>();
var betweenLineDistList = new ConcurrentBag<double>();
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
// 1. Estimate in line and between line spacing
Parallel.For(0, wordsList.Count, i =>
Parallel.For(0, wordsList.Count, parallelOptions, i =>
{
var word = wordsList[i];
@@ -107,16 +141,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
if (withinLineDistance == null || betweenLineDistance == null)
{
return new[] {new TextBlock(new[] {new TextLine(wordsList)})};
return new[] { new TextBlock(new[] { new TextLine(wordsList) }) };
}
// 2. Find lines of text
double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine).ToArray();
var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine, maxDegreeOfParallelism).ToArray();
// 3. Find blocks of text
double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
var blocks = GetLinesGroups(lines, maxDistanceBetweenLine).ToList();
var blocks = GetLinesGroups(lines, maxDistanceBetweenLine, maxDegreeOfParallelism).ToList();
// 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
for (var b = 0; b < blocks.Count; b++)
@@ -128,7 +162,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
// Merge all lines (words)
blocks[b] = new TextBlock(GetLines(blocks[b].TextLines.SelectMany(l => l.Words).ToList(),
double.MaxValue, withinLine).ToList());
double.MaxValue, withinLine, maxDegreeOfParallelism).ToList());
for (var c = 0; c < blocks.Count; c++)
{
@@ -147,7 +181,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
// 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
// same block. Filtering will still be done based on angle.
// Merge all lines (words) sharing same bottom (baseline)
var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine).ToList();
var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList();
blocks[b] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList());
// Remove
@@ -207,7 +241,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
return finalDistanceMeasure(pointR, wordsWithinAngleBoundDistancePoints[closestWordIndex]);
}
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism)
{
TextDirection textDirection = words[0].TextDirection;
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, Distances.Euclidean,
@@ -220,7 +254,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
return (withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper);
}).ToList();
},
maxDegreeOfParallelism).ToList();
Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
if (textDirection == TextDirection.Rotate180)
@@ -242,7 +277,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
}
}
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism)
{
/**************************************************************************************************
* We want to measure the distance between two lines using the following method:
@@ -269,7 +304,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
(pivot, candidate) => maxDist,
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
pivot => true, (pivot, candidate) => true).ToList();
pivot => true, (pivot, candidate) => true,
maxDegreeOfParallelism).ToList();
for (int a = 0; a < groupedIndexes.Count(); a++)
{

View File

@@ -16,7 +16,14 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// <summary>
/// Create an instance of Nearest Neighbour Word Extractor, <see cref="NearestNeighbourWordExtractor"/>.
/// </summary>
public static IWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
/// <summary>
/// Gets or sets the maximum number of concurrent tasks enabled. Default value is -1.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para>
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
/// <summary>
/// Gets the words.
@@ -27,14 +34,14 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
List<Word> wordsH = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Horizontal),
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2,
Distances.Manhattan)
Distances.Manhattan, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Bottom)
.ThenBy(x => x.BoundingBox.Left).ToList();
List<Word> words180 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate180),
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2,
Distances.Manhattan)
Distances.Manhattan, MaxDegreeOfParallelism)
.OrderBy(x => x.BoundingBox.Top)
.ThenByDescending(x => x.BoundingBox.Right).ToList();
wordsH.AddRange(words180);
@@ -42,7 +49,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
List<Word> words90 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate90),
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Height), (double)(l2.GlyphRectangle.Height)) * 0.2,
Distances.Manhattan)
Distances.Manhattan, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Left)
.ThenBy(x => x.BoundingBox.Top).ToList();
wordsH.AddRange(words90);
@@ -50,7 +57,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
List<Word> words270 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate270),
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Height), (double)(l2.GlyphRectangle.Height)) * 0.2,
Distances.Manhattan)
Distances.Manhattan, MaxDegreeOfParallelism)
.OrderBy(x => x.BoundingBox.Right)
.ThenByDescending(x => x.BoundingBox.Bottom).ToList();
wordsH.AddRange(words270);
@@ -58,7 +65,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
List<Word> wordsU = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Unknown),
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2,
Distances.Manhattan)
Distances.Manhattan, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Bottom)
.ThenBy(x => x.BoundingBox.Left).ToList();
wordsH.AddRange(wordsU);
@@ -75,15 +82,19 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// e.g. Max(GlyphRectangle.Width) x 20%.</param>
/// <param name="distMeasure">The distance measure between two start and end base line points,
/// e.g. the Manhattan distance.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
private List<Word> GetWords(IEnumerable<Letter> pageLetters,
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure)
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
int maxDegreeOfParallelism)
{
if (pageLetters == null || pageLetters.Count() == 0) return new List<Word>();
TextDirection textDirection = pageLetters.ElementAt(0).TextDirection;
if (pageLetters.Any(x => textDirection != x.TextDirection))
{
throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction.");
throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction.");
}
Func<IEnumerable<Letter>, IReadOnlyList<Letter>> orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();
@@ -106,7 +117,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
distMeasure, maxDistanceFunction,
l => l.EndBaseLine, l => l.StartBaseLine,
l => !string.IsNullOrWhiteSpace(l.Value),
(l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value)).ToList();
(l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value),
maxDegreeOfParallelism).ToList();
List<Word> words = new List<Word>();
for (int a = 0; a < groupedIndexes.Count(); a++)

View File

@@ -29,7 +29,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="minimumElements">The minimum number of elements to define a text edge.</param>
public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4)
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4,
int maxDegreeOfParallelism = -1)
{
if (minimumElements < 0)
{
@@ -40,7 +44,9 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
ConcurrentDictionary<EdgeType, List<PdfLine>> dictionary = new ConcurrentDictionary<EdgeType, List<PdfLine>>();
Parallel.ForEach(edgesFuncs, f =>
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
Parallel.ForEach(edgesFuncs, parallelOptions, f =>
{
dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
});

View File

@@ -0,0 +1,400 @@
using System;
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Geometry;
using UglyToad.PdfPig.Util;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// A top-down algorithm that finds a cover of the background whitespace of a document in terms of maximal empty rectangles.
/// <para>See Section 3.2 of 'High precision text extraction from PDF documents' by Øyvind Raddum Berg and Section 2 of 'Two geometric algorithms for layout analysis' by Thomas M. Breuel.</para>
/// </summary>
public static class WhitespaceCoverExtractor
{
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// <para>The lower bounds for the width and height of the rectangles are set to 1.25 times the mode
/// of the letters' glyph rectangle widths and heights respectively.</para>
/// </summary>
/// <param name="words">The words in the page.</param>
/// <param name="images">The images in the page.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="words"/> is null.</exception>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<Word> words, IEnumerable<IPdfImage> images = null, int maxRectangleCount = 40, int maxBoundQueueSize = 0)
{
    if (words == null)
    {
        throw new ArgumentNullException(nameof(words));
    }

    // Materialise once: the words are needed for both modes below and again by the overload,
    // and the caller may have passed a deferred (or one-shot) sequence.
    var wordsList = words as IReadOnlyList<Word> ?? words.ToList();

    // Flatten the letters a single time and reuse them for the width and height modes.
    var glyphRectangles = wordsList.SelectMany(w => w.Letters).Select(l => l.GlyphRectangle).ToList();

    return GetWhitespaces(wordsList,
        images,
        glyphRectangles.Select(r => r.Width).Mode() * 1.25m,
        glyphRectangles.Select(r => r.Height).Mode() * 1.25m,
        maxRectangleCount: maxRectangleCount,
        maxBoundQueueSize: maxBoundQueueSize);
}
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// <para>The obstacles are the bounding boxes of the words and, if provided, the bounds of the images.
/// Boxes with a width or height that is not strictly positive are ignored.</para>
/// </summary>
/// <param name="words">The words in the page.</param>
/// <param name="images">The images in the page. Can be null.</param>
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="words"/> is null.</exception>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<Word> words, IEnumerable<IPdfImage> images,
    decimal minWidth, decimal minHeight, int maxRectangleCount = 40, decimal whitespaceFuzziness = 0.15m, int maxBoundQueueSize = 0)
{
    if (words == null)
    {
        throw new ArgumentNullException(nameof(words));
    }

    var bboxes = words.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0)
                      .Select(w => w.BoundingBox)
                      .ToList();

    // No emptiness pre-check: adding an empty range is a no-op, and counting first
    // would enumerate the images an extra time.
    if (images != null)
    {
        bboxes.AddRange(images.Where(i => i.Bounds.Width > 0 && i.Bounds.Height > 0).Select(i => i.Bounds));
    }

    return GetWhitespaces(bboxes,
        minWidth: minWidth,
        minHeight: minHeight,
        maxRectangleCount: maxRectangleCount,
        whitespaceFuzziness: whitespaceFuzziness,
        maxBoundQueueSize: maxBoundQueueSize);
}
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// <para>The search area is the smallest rectangle containing all the obstacles.</para>
/// </summary>
/// <param name="boundingboxes">The list of obstacles' bounding boxes in the page.</param>
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles, or an empty list if there are no obstacles.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="boundingboxes"/> is null.</exception>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<PdfRectangle> boundingboxes,
    decimal minWidth, decimal minHeight, int maxRectangleCount = 40, decimal whitespaceFuzziness = 0.15m, int maxBoundQueueSize = 0)
{
    if (boundingboxes == null)
    {
        throw new ArgumentNullException(nameof(boundingboxes));
    }

    // Build the set first and test its Count: this enumerates the input exactly once,
    // instead of once for Count() and a second time for the set.
    var obstacles = new HashSet<PdfRectangle>(boundingboxes);
    if (obstacles.Count == 0)
    {
        return EmptyArray<PdfRectangle>.Instance;
    }

    var pageBound = GetBound(obstacles);

    return GetMaximalRectangles(pageBound,
        obstacles,
        minWidth: minWidth,
        minHeight: minHeight,
        maxRectangleCount: maxRectangleCount,
        whitespaceFuzziness: whitespaceFuzziness,
        maxBoundQueueSize: maxBoundQueueSize);
}
/// <summary>
/// Finds the whitespace rectangles inside <paramref name="bound"/>.
/// <para>Candidates are taken from a queue, best first. A candidate that is empty enough is accepted if it
/// touches the border of <paramref name="bound"/> or an already accepted rectangle, otherwise it is put on hold.
/// A candidate that is not empty enough is split around a pivot obstacle into the regions to the right of,
/// to the left of, below and above the pivot, and the regions that are large enough are queued.</para>
/// </summary>
/// <param name="bound">The area to search.</param>
/// <param name="obstacles">The obstacles in the area. This set is modified: every accepted
/// whitespace rectangle is added to it.</param>
/// <param name="minWidth">Sub-rectangles must be strictly wider than this value to be queued.</param>
/// <param name="minHeight">Sub-rectangles must be strictly taller than this value to be queued.</param>
/// <param name="maxRectangleCount">The search stops as soon as this many rectangles are accepted.</param>
/// <param name="whitespaceFuzziness">Passed to every <see cref="QueueEntry"/> created.</param>
/// <param name="maxBoundQueueSize">Passed to the <see cref="QueueEntries"/> queue.</param>
/// <returns>The accepted whitespace rectangles.</returns>
private static IReadOnlyList<PdfRectangle> GetMaximalRectangles(PdfRectangle bound,
    HashSet<PdfRectangle> obstacles, decimal minWidth, decimal minHeight, int maxRectangleCount,
    decimal whitespaceFuzziness, int maxBoundQueueSize)
{
    QueueEntries queueEntries = new QueueEntries(maxBoundQueueSize);
    queueEntries.Enqueue(new QueueEntry(bound, obstacles, whitespaceFuzziness));

    HashSet<PdfRectangle> selected = new HashSet<PdfRectangle>();
    HashSet<QueueEntry> holdList = new HashSet<QueueEntry>();

    // Queues a sub-rectangle of 'parent' if it is strictly larger than the minimum size,
    // keeping only the parent's obstacles that strictly overlap it.
    void EnqueueIfLargeEnough(PdfRectangle candidate, QueueEntry parent)
    {
        if (candidate.Height > minHeight && candidate.Width > minWidth)
        {
            queueEntries.Enqueue(new QueueEntry(candidate,
                new HashSet<PdfRectangle>(parent.Obstacles.Where(o => OverlapsHard(candidate, o))),
                whitespaceFuzziness));
        }
    }

    while (queueEntries.Count > 0)
    {
        var current = queueEntries.Dequeue();

        if (current.IsEmptyEnough(obstacles))
        {
            // Already covered by an accepted rectangle.
            if (selected.Any(c => Inside(c, current.Bound))) continue;

            // A check was added which impeded the algorithm from accepting
            // rectangles which were not adjacent to an already accepted
            // rectangle, or to the border of the page.
            if (!IsAdjacentToPageBounds(bound, current.Bound) &&        // NOT in contact to border page AND
                !selected.Any(q => IsAdjacentTo(q, current.Bound)))     // NOT in contact to any already accepted rectangle
            {
                // In order to maintain the correctness of the algorithm,
                // rejected rectangles are put in a hold list.
                holdList.Add(current);
                continue;
            }

            selected.Add(current.Bound);

            // An accepted entry must not stay on hold, otherwise it would be queued and
            // processed again after every subsequent acceptance.
            holdList.Remove(current);

            if (selected.Count >= maxRectangleCount) return selected.ToList();

            obstacles.Add(current.Bound);

            // Each time a new rectangle is identified and accepted, this hold list
            // will be added back to the queue in case any of them will have become valid.
            foreach (var hold in holdList)
            {
                queueEntries.Enqueue(hold);
            }

            // After a maximal rectangle has been found, it is added back to the list
            // of obstacles. Whenever a QueueEntry is dequeued, its list of obstacles
            // can be recomputed to include newly identified whitespace rectangles.
            foreach (var overlapping in queueEntries)
            {
                if (OverlapsHard(current.Bound, overlapping.Bound))
                {
                    overlapping.AddWhitespace(current.Bound);
                }
            }

            continue;
        }

        var pivot = current.GetPivot();
        var b = current.Bound;

        // Region of the bound to the right of the pivot.
        var rRight = new PdfRectangle(pivot.Right, b.Bottom, b.Right, b.Top);
        if (b.Right > pivot.Right)
        {
            EnqueueIfLargeEnough(rRight, current);
        }

        // Region of the bound to the left of the pivot.
        var rLeft = new PdfRectangle(b.Left, b.Bottom, pivot.Left, b.Top);
        if (b.Left < pivot.Left)
        {
            EnqueueIfLargeEnough(rLeft, current);
        }

        // Region of the bound between its bottom and the pivot's bottom, i.e. below the pivot.
        var rBelow = new PdfRectangle(b.Left, b.Bottom, b.Right, pivot.Bottom);
        if (b.Bottom < pivot.Bottom)
        {
            EnqueueIfLargeEnough(rBelow, current);
        }

        // Region of the bound between the pivot's top and its top, i.e. above the pivot.
        var rAbove = new PdfRectangle(b.Left, pivot.Top, b.Right, b.Top);
        if (b.Top > pivot.Top)
        {
            EnqueueIfLargeEnough(rAbove, current);
        }
    }

    return selected.ToList();
}
/// <summary>
/// Checks whether two rectangles that are not separated from each other share an edge coordinate:
/// the left of one equals the right of the other, or the bottom of one equals the top of the other.
/// </summary>
private static bool IsAdjacentTo(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    // A gap on either axis means the rectangles cannot be in contact.
    bool separated = rectangle1.Left > rectangle2.Right
                  || rectangle2.Left > rectangle1.Right
                  || rectangle1.Top < rectangle2.Bottom
                  || rectangle2.Top < rectangle1.Bottom;

    if (separated)
    {
        return false;
    }

    return rectangle1.Left == rectangle2.Right
        || rectangle1.Right == rectangle2.Left
        || rectangle1.Bottom == rectangle2.Top
        || rectangle1.Top == rectangle2.Bottom;
}
/// <summary>
/// Checks whether at least one side of the rectangle has the same coordinate as the
/// corresponding side of the page bound.
/// </summary>
private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle)
{
    return rectangle.Bottom == pageBound.Bottom
        || rectangle.Top == pageBound.Top
        || rectangle.Left == pageBound.Left
        || rectangle.Right == pageBound.Right;
}
/// <summary>
/// Checks whether the two rectangles overlap. Rectangles that only touch along an
/// edge or at a corner are not considered overlapping.
/// </summary>
private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    bool separatedOrTouching = rectangle1.Left >= rectangle2.Right
                            || rectangle2.Left >= rectangle1.Right
                            || rectangle1.Top <= rectangle2.Bottom
                            || rectangle2.Top <= rectangle1.Bottom;

    return !separatedOrTouching;
}
/// <summary>
/// Checks whether <paramref name="rectangle2"/> lies within <paramref name="rectangle1"/>.
/// Shared edges are allowed, so a rectangle is inside itself.
/// </summary>
private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    bool withinHorizontally = rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left;

    return withinHorizontally
        && rectangle2.Top <= rectangle1.Top
        && rectangle2.Bottom >= rectangle1.Bottom;
}
/// <summary>
/// Gets the smallest rectangle that contains all the obstacles.
/// </summary>
private static PdfRectangle GetBound(IEnumerable<PdfRectangle> obstacles)
{
    var left = obstacles.Min(o => o.Left);
    var bottom = obstacles.Min(o => o.Bottom);
    var right = obstacles.Max(o => o.Right);
    var top = obstacles.Max(o => o.Top);

    return new PdfRectangle(left, bottom, right, top);
}
#region Sorted Queue
/// <summary>
/// Queue of <see cref="QueueEntry"/> kept sorted by the entries' own comparison, with an optional maximum size.
/// <para>As this is a <see cref="SortedSet{T}"/>, an entry that compares equal to one already
/// in the queue is not added.</para>
/// </summary>
private class QueueEntries : SortedSet<QueueEntry>
{
    // Maximum number of entries kept in the queue; the queue is unbounded if this is 0 or less.
    private readonly int bound;

    /// <summary>
    /// Creates an empty queue.
    /// </summary>
    /// <param name="maximumBound">The maximum number of entries to keep. No limit is applied if it is 0 or less.</param>
    public QueueEntries(int maximumBound)
    {
        this.bound = maximumBound;
    }

    /// <summary>
    /// Removes and returns the greatest entry of the queue.
    /// </summary>
    public QueueEntry Dequeue()
    {
        var current = this.Max;
        this.Remove(current);
        return current;
    }

    /// <summary>
    /// Adds the entry to the queue. If the queue then exceeds its maximum size, the smallest
    /// entry (which can be the entry just added) is removed.
    /// </summary>
    public void Enqueue(QueueEntry queueEntry)
    {
        // Add first, trim second: the queue never exceeds the bound, the entry evicted is
        // always the worst one, and nothing is evicted when the add is a no-op (duplicate).
        this.Add(queueEntry);

        if (this.bound > 0 && this.Count > this.bound)
        {
            this.Remove(this.Min);
        }
    }
}
/// <summary>
/// A candidate rectangle of the search, with its associated obstacles and its quality score.
/// Entries are ordered by quality, then by the coordinates of their bound.
/// </summary>
private class QueueEntry : IComparable<QueueEntry>
{
    private readonly decimal whitespaceFuzziness;

    /// <summary>The candidate rectangle.</summary>
    public PdfRectangle Bound { get; private set; }

    /// <summary>The score of <see cref="Bound"/>, computed once at construction.</summary>
    public decimal Quality { get; private set; }

    /// <summary>The obstacles associated with this entry. The set is stored by reference, not copied.</summary>
    public HashSet<PdfRectangle> Obstacles { get; private set; }

    public QueueEntry(PdfRectangle bound, HashSet<PdfRectangle> obstacles, decimal whitespaceFuzziness)
    {
        this.Bound = bound;
        this.Quality = ScoringFunction(Bound);
        this.Obstacles = obstacles;
        this.whitespaceFuzziness = whitespaceFuzziness;
    }

    /// <summary>
    /// Gets the obstacle whose centroid is the nearest to the centroid of the bound (Euclidean distance),
    /// or the first obstacle if no nearest index is found.
    /// </summary>
    public PdfRectangle GetPivot()
    {
        // The distance to the nearest centroid is not needed, only its index.
        int indexMiddle = Distances.FindIndexNearest(Bound.Centroid,
            Obstacles.Select(o => o.Centroid).ToList(),
            p => p, p => p, Distances.Euclidean, out double _);

        return indexMiddle == -1 ? Obstacles.First() : Obstacles.ElementAt(indexMiddle);
    }

    /// <summary>
    /// Checks whether this entry has no obstacle at all.
    /// </summary>
    public bool IsEmptyEnough()
    {
        return !Obstacles.Any();
    }

    /// <summary>
    /// Checks whether this entry can be considered as whitespace: either it has no obstacle, or
    /// each page obstacle intersects the bound by no more than the allowed area and the total
    /// intersection area is less than the bound's area times the fuzziness.
    /// </summary>
    /// <param name="pageObstacles">The obstacles of the whole page.</param>
    public bool IsEmptyEnough(IEnumerable<PdfRectangle> pageObstacles)
    {
        if (IsEmptyEnough()) return true;

        decimal sum = 0;
        foreach (var obstacle in pageObstacles)
        {
            var intersect = Bound.Intersect(obstacle);

            // NOTE(review): an obstacle with no intersection makes the whole check fail, so the
            // fuzzy test can only pass when every page obstacle intersects the bound. Presumably
            // such obstacles were meant to be skipped - confirm against Intersect's contract
            // before changing this, as it alters which rectangles get accepted.
            if (!intersect.HasValue) return false;

            decimal minimumArea = MinimumOverlappingArea(obstacle, Bound, whitespaceFuzziness);

            if (intersect.Value.Area > minimumArea)
            {
                return false;
            }
            sum += intersect.Value.Area;
        }
        return sum < Bound.Area * whitespaceFuzziness;
    }

    public override string ToString()
    {
        return "Q=" + Quality.ToString("#0.0") + ", O=" + Obstacles.Count + ", " + Bound.ToString();
    }

    /// <summary>
    /// Adds an identified whitespace rectangle to the obstacles of this entry.
    /// </summary>
    public void AddWhitespace(PdfRectangle rectangle)
    {
        Obstacles.Add(rectangle);
    }

    /// <summary>
    /// Compares by <see cref="Quality"/>, then by the left, bottom, right and top of <see cref="Bound"/>.
    /// <para>The tie-break matters because the queue is a sorted set: entries comparing equal are
    /// treated as duplicates and dropped, so comparing on quality alone would discard distinct
    /// rectangles that happen to have the same score. Entries with the same bound still compare
    /// equal, whatever their obstacles.</para>
    /// </summary>
    public int CompareTo(QueueEntry entry)
    {
        // By convention, any instance is greater than null.
        if (entry == null) return 1;

        int result = this.Quality.CompareTo(entry.Quality);
        if (result != 0) return result;

        result = this.Bound.Left.CompareTo(entry.Bound.Left);
        if (result != 0) return result;

        result = this.Bound.Bottom.CompareTo(entry.Bound.Bottom);
        if (result != 0) return result;

        result = this.Bound.Right.CompareTo(entry.Bound.Right);
        if (result != 0) return result;

        return this.Bound.Top.CompareTo(entry.Bound.Top);
    }

    /// <summary>
    /// Two entries are equal if their bounds have the same coordinates and they hold the
    /// same obstacles set instance (reference comparison).
    /// </summary>
    public override bool Equals(object obj)
    {
        if (obj is QueueEntry entry)
        {
            return this.Bound.Left == entry.Bound.Left &&
                   this.Bound.Right == entry.Bound.Right &&
                   this.Bound.Top == entry.Bound.Top &&
                   this.Bound.Bottom == entry.Bound.Bottom &&
                   this.Obstacles == entry.Obstacles;
        }
        return false;
    }

    public override int GetHashCode()
    {
        return (Bound.Left, Bound.Right,
                Bound.Top, Bound.Bottom,
                Obstacles).GetHashCode();
    }

    /// <summary>
    /// The largest intersection area tolerated between two rectangles: the area of the
    /// smallest of the two times the fuzziness.
    /// </summary>
    private static decimal MinimumOverlappingArea(PdfRectangle r1, PdfRectangle r2, decimal whitespaceFuzziness)
    {
        return Math.Min(r1.Area, r2.Area) * whitespaceFuzziness;
    }

    /// <summary>
    /// The scoring function Q(r) which is subsequently used to sort a priority queue.
    /// </summary>
    /// <param name="rectangle">The rectangle to score.</param>
    private static decimal ScoringFunction(PdfRectangle rectangle)
    {
        // As can be seen, tall rectangles are preferred. The trick while choosing this Q(r) was
        // to keep that preference while still allowing wide rectangles to be chosen. After having
        // experimented with quite a few variations, this simple function was considered a good
        // solution.
        return rectangle.Area * (rectangle.Height / 4m);
    }

    /// <summary>
    /// Gets the area of the intersection of the two rectangles, or 0 if there is none.
    /// Not called from within this class.
    /// </summary>
    private static decimal OverlappingArea(PdfRectangle rectangle1, PdfRectangle rectangle2)
    {
        var intersect = rectangle1.Intersect(rectangle2);
        if (intersect.HasValue)
        {
            return intersect.Value.Area;
        }
        return 0;
    }
}
#endregion
}
}