diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 4a8f7575..4e4d643a 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -91,6 +91,7 @@ "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor", "UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType", + "UglyToad.PdfPig.DocumentLayoutAnalysis.WhitespaceCoverExtractor", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Export.AltoXmlTextExporter", diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs index 97231d9f..5b1740d3 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs @@ -22,11 +22,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft. /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. + /// If it is -1, there is no limit on the number of concurrently running operations. internal static IEnumerable> ClusterNearestNeighbours(List elements, Func distMeasure, Func maxDistanceFunction, Func pivotPoint, Func candidatesPoint, - Func filterPivot, Func filterFinal) + Func filterPivot, Func filterFinal, + int maxDegreeOfParallelism) { /************************************************************************************* * Algorithm steps @@ -48,8 +52,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis int[] indexes = Enumerable.Repeat((int)-1, elements.Count).ToArray(); var candidatesPoints = elements.Select(candidatesPoint).ToList(); + ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; + // 1. Find nearest neighbours indexes - Parallel.For(0, elements.Count, e => + Parallel.For(0, elements.Count, parallelOptions, e => { var pivot = elements[e]; @@ -85,11 +91,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft. /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. + /// If it is -1, there is no limit on the number of concurrently running operations. internal static IEnumerable> ClusterNearestNeighbours(T[] elements, Func distMeasure, Func maxDistanceFunction, Func pivotPoint, Func candidatesPoint, - Func filterPivot, Func filterFinal) + Func filterPivot, Func filterFinal, + int maxDegreeOfParallelism) { /************************************************************************************* * Algorithm steps @@ -111,8 +121,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray(); var candidatesPoints = elements.Select(candidatesPoint).ToList(); + ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; + // 1. Find nearest neighbours indexes - Parallel.For(0, elements.Length, e => + Parallel.For(0, elements.Length, parallelOptions, e => { var pivot = elements[e]; @@ -148,11 +160,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The candidates' line to use for pairing. /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. + /// If it is -1, there is no limit on the number of concurrently running operations. internal static IEnumerable> ClusterNearestNeighbours(T[] elements, Func distMeasure, Func maxDistanceFunction, Func pivotLine, Func candidatesLine, - Func filterPivot, Func filterFinal) + Func filterPivot, Func filterFinal, + int maxDegreeOfParallelism) { /************************************************************************************* * Algorithm steps @@ -174,8 +190,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray(); var candidatesLines = elements.Select(x => candidatesLine(x)).ToList(); + ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; + // 1. Find nearest neighbours indexes - Parallel.For(0, elements.Length, e => + Parallel.For(0, elements.Length, parallelOptions, e => { var pivot = elements[e]; diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs index 4dc02c1e..136265f6 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs @@ -28,11 +28,25 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// Get the blocks. /// Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3. /// - /// - /// - public IReadOnlyList GetBlocks(IEnumerable pageWords) + /// The words to segment into s. + /// The s generated by the document spectrum method. + public IReadOnlyList GetBlocks(IEnumerable words) { - return GetBlocks(pageWords, new AngleBounds(-30, 30), new AngleBounds(-135, -45), 1.3); + return GetBlocks(words, -1); + } + + /// + /// Get the blocks. + /// Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3. + /// + /// The words to segment into s. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. + /// If it is -1, there is no limit on the number of concurrently running operations. + /// The s generated by the document spectrum method. + public IReadOnlyList GetBlocks(IEnumerable words, int maxDegreeOfParallelism) + { + return GetBlocks(words, new AngleBounds(-30, 30), new AngleBounds(-135, -45), 1.3, maxDegreeOfParallelism); } /// @@ -46,8 +60,26 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// distance found by the analysis. /// The s generated by the document spectrum method. public IReadOnlyList GetBlocks(IEnumerable words, AngleBounds withinLine, - AngleBounds betweenLine, - double betweenLineMultiplier) + AngleBounds betweenLine, double betweenLineMultiplier) + { + return GetBlocks(words, withinLine, betweenLine, betweenLineMultiplier, -1); + } + + /// + /// Get the blocks. See original paper for more information. + /// + /// The words to segment into s. + /// Angle bounds for words to be considered on the same line. + /// Angle bounds for words to be considered on separate lines. + /// Multiplier that gives the maximum perpendicular distance between + /// text lines for blocking. Maximum distance will be this number times the between-line + /// distance found by the analysis. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. + /// If it is -1, there is no limit on the number of concurrently running operations. + /// The s generated by the document spectrum method. + public IReadOnlyList GetBlocks(IEnumerable words, AngleBounds withinLine, + AngleBounds betweenLine, double betweenLineMultiplier, int maxDegreeOfParallelism) { if (words == null) { @@ -74,8 +106,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis var withinLineDistList = new ConcurrentBag(); var betweenLineDistList = new ConcurrentBag(); + ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; + // 1. Estimate in line and between line spacing - Parallel.For(0, wordsList.Count, i => + Parallel.For(0, wordsList.Count, parallelOptions, i => { var word = wordsList[i]; @@ -107,16 +141,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis if (withinLineDistance == null || betweenLineDistance == null) { - return new[] {new TextBlock(new[] {new TextLine(wordsList)})}; + return new[] { new TextBlock(new[] { new TextLine(wordsList) }) }; } // 2. Find lines of text double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value); - var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine).ToArray(); + var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine, maxDegreeOfParallelism).ToArray(); // 3. Find blocks of text double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value; - var blocks = GetLinesGroups(lines, maxDistanceBetweenLine).ToList(); + var blocks = GetLinesGroups(lines, maxDistanceBetweenLine, maxDegreeOfParallelism).ToList(); // 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text. for (var b = 0; b < blocks.Count; b++) @@ -128,7 +162,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis // Merge all lines (words) blocks[b] = new TextBlock(GetLines(blocks[b].TextLines.SelectMany(l => l.Words).ToList(), - double.MaxValue, withinLine).ToList()); + double.MaxValue, withinLine, maxDegreeOfParallelism).ToList()); for (var c = 0; c < blocks.Count; c++) { @@ -147,7 +181,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis // 2. Rebuild lines, using max distance = +Inf as we know all words will be in the // same block. Filtering will still be done based on angle. // Merge all lines (words) sharing same bottom (baseline) - var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine).ToList(); + var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList(); blocks[b] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList()); // Remove @@ -207,7 +241,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return finalDistanceMeasure(pointR, wordsWithinAngleBoundDistancePoints[closestWordIndex]); } - private static IEnumerable GetLines(List words, double maxDist, AngleBounds withinLine) + private static IEnumerable GetLines(List words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism) { TextDirection textDirection = words[0].TextDirection; var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, Distances.Euclidean, @@ -220,7 +254,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft); return (withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper); - }).ToList(); + }, + maxDegreeOfParallelism).ToList(); Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList(); if (textDirection == TextDirection.Rotate180) @@ -242,7 +277,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } } - private static IEnumerable GetLinesGroups(TextLine[] lines, double maxDist) + private static IEnumerable GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism) { /************************************************************************************************** * We want to measure the distance between two lines using the following method: @@ -269,7 +304,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis (pivot, candidate) => maxDist, pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight), candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight), - pivot => true, (pivot, candidate) => true).ToList(); + pivot => true, (pivot, candidate) => true, + maxDegreeOfParallelism).ToList(); for (int a = 0; a < groupedIndexes.Count(); a++) { diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs index 62e212da..559e4228 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs @@ -16,7 +16,14 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// Create an instance of Nearest Neighbour Word Extractor, . /// - public static IWordExtractor Instance { get; } = new NearestNeighbourWordExtractor(); + public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor(); + + /// + /// Gets or sets the maximum number of concurrent tasks enabled. Default value is -1. + /// A positive property value limits the number of concurrent operations to the set value. + /// If it is -1, there is no limit on the number of concurrently running operations. + /// + public int MaxDegreeOfParallelism { get; set; } = -1; /// /// Gets the words. @@ -27,14 +34,14 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis List wordsH = GetWords( letters.Where(l => l.TextDirection == TextDirection.Horizontal), (l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2, - Distances.Manhattan) + Distances.Manhattan, MaxDegreeOfParallelism) .OrderByDescending(x => x.BoundingBox.Bottom) .ThenBy(x => x.BoundingBox.Left).ToList(); List words180 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate180), (l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2, - Distances.Manhattan) + Distances.Manhattan, MaxDegreeOfParallelism) .OrderBy(x => x.BoundingBox.Top) .ThenByDescending(x => x.BoundingBox.Right).ToList(); wordsH.AddRange(words180); @@ -42,7 +49,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis List words90 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate90), (l1, l2) => Math.Max((double)(l1.GlyphRectangle.Height), (double)(l2.GlyphRectangle.Height)) * 0.2, - Distances.Manhattan) + Distances.Manhattan, MaxDegreeOfParallelism) .OrderByDescending(x => x.BoundingBox.Left) .ThenBy(x => x.BoundingBox.Top).ToList(); wordsH.AddRange(words90); @@ -50,7 +57,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis List words270 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate270), (l1, l2) => Math.Max((double)(l1.GlyphRectangle.Height), (double)(l2.GlyphRectangle.Height)) * 0.2, - Distances.Manhattan) + Distances.Manhattan, MaxDegreeOfParallelism) .OrderBy(x => x.BoundingBox.Right) .ThenByDescending(x => x.BoundingBox.Bottom).ToList(); wordsH.AddRange(words270); @@ -58,7 +65,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis List wordsU = GetWords( letters.Where(l => l.TextDirection == TextDirection.Unknown), (l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2, - Distances.Manhattan) + Distances.Manhattan, MaxDegreeOfParallelism) .OrderByDescending(x => x.BoundingBox.Bottom) .ThenBy(x => x.BoundingBox.Left).ToList(); wordsH.AddRange(wordsU); @@ -75,15 +82,19 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// e.g. Max(GlyphRectangle.Width) x 20%. /// The distance measure between two start and end base line points, /// e.g. the Manhattan distance. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. + /// If it is -1, there is no limit on the number of concurrently running operations. private List GetWords(IEnumerable pageLetters, - Func maxDistanceFunction, Func distMeasure) + Func maxDistanceFunction, Func distMeasure, + int maxDegreeOfParallelism) { if (pageLetters == null || pageLetters.Count() == 0) return new List(); TextDirection textDirection = pageLetters.ElementAt(0).TextDirection; if (pageLetters.Any(x => textDirection != x.TextDirection)) { - throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction."); + throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction."); } Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList(); @@ -106,7 +117,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis distMeasure, maxDistanceFunction, l => l.EndBaseLine, l => l.StartBaseLine, l => !string.IsNullOrWhiteSpace(l.Value), - (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value)).ToList(); + (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value), + maxDegreeOfParallelism).ToList(); List words = new List(); for (int a = 0; a < groupedIndexes.Count(); a++) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs index 83fc7661..e8a3806c 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -29,7 +29,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// The words in the page. /// The minimum number of elements to define a text edge. - public static IReadOnlyDictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4) + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. + /// If it is -1, there is no limit on the number of concurrently running operations. + public static IReadOnlyDictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4, + int maxDegreeOfParallelism = -1) { if (minimumElements < 0) { @@ -40,7 +44,9 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis ConcurrentDictionary> dictionary = new ConcurrentDictionary>(); - Parallel.ForEach(edgesFuncs, f => + ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; + + Parallel.ForEach(edgesFuncs, parallelOptions, f => { dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements)); }); diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs new file mode 100644 index 00000000..dd9cf6e8 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs @@ -0,0 +1,400 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; +using UglyToad.PdfPig.Util; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// A top-down algorithm that finds a cover of the background whitespace of a document in terms of maximal empty rectangles. + /// See Section 3.2 of 'High precision text extraction from PDF documents' by Øyvind Raddum Berg and Section 2 of 'Two geometric algorithms for layout analysis' by Thomas M. Breuel. + /// + public static class WhitespaceCoverExtractor + { + /// + /// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles. + /// + /// The words in the page. + /// The images in the page. + /// The maximum number of rectangles to find. + /// The maximum size of the queue used in the algorithm. + /// The identified whitespace rectangles. + public static IReadOnlyList GetWhitespaces(IEnumerable words, IEnumerable images = null, int maxRectangleCount = 40, int maxBoundQueueSize = 0) + { + return GetWhitespaces(words, + images, + words.SelectMany(w => w.Letters).Select(x => x.GlyphRectangle.Width).Mode() * 1.25m, + words.SelectMany(w => w.Letters).Select(x => x.GlyphRectangle.Height).Mode() * 1.25m, + maxRectangleCount: maxRectangleCount, + maxBoundQueueSize: maxBoundQueueSize); + } + + /// + /// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles. + /// + /// The words in the page. + /// The images in the page. + /// Lower bounds for the width of rectangles. + /// Lower bounds for the height of rectangles. + /// The maximum number of rectangles to find. + /// Constant value to allow candidate whitespace rectangle to overlap the + /// surrounding obstacles by some percent. Default value is 15%. + /// The maximum size of the queue used in the algorithm. + /// The identified whitespace rectangles. + public static IReadOnlyList GetWhitespaces(IEnumerable words, IEnumerable images, + decimal minWidth, decimal minHeight, int maxRectangleCount = 40, decimal whitespaceFuzziness = 0.15m, int maxBoundQueueSize = 0) + { + var bboxes = words.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0) + .Select(o => o.BoundingBox).ToList(); + + if (images != null && images.Count() > 0) + { + bboxes.AddRange(images.Where(w => w.Bounds.Width > 0 && w.Bounds.Height > 0).Select(o => o.Bounds)); + } + + return GetWhitespaces(bboxes, + minWidth: minWidth, + minHeight: minHeight, + maxRectangleCount: maxRectangleCount, + whitespaceFuzziness: whitespaceFuzziness, + maxBoundQueueSize: maxBoundQueueSize); + } + + /// + /// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles. + /// + /// The list of obstacles' bounding boxes in the page. + /// Lower bounds for the width of rectangles. + /// Lower bounds for the height of rectangles. + /// The maximum number of rectangles to find. + /// Constant value to allow candidate whitespace rectangle to overlap the + /// surrounding obstacles by some percent. Default value is 15%. + /// The maximum size of the queue used in the algorithm. + /// The identified whitespace rectangles. + public static IReadOnlyList GetWhitespaces(IEnumerable boundingboxes, + decimal minWidth, decimal minHeight, int maxRectangleCount = 40, decimal whitespaceFuzziness = 0.15m, int maxBoundQueueSize = 0) + { + if (boundingboxes.Count() == 0) return EmptyArray.Instance; + + var obstacles = new HashSet(boundingboxes); + var pageBound = GetBound(obstacles); + return GetMaximalRectangles(pageBound, + obstacles, + minWidth: minWidth, + minHeight: minHeight, + maxRectangleCount: maxRectangleCount, + whitespaceFuzziness: whitespaceFuzziness, + maxBoundQueueSize: maxBoundQueueSize); + } + + private static IReadOnlyList GetMaximalRectangles(PdfRectangle bound, + HashSet obstacles, decimal minWidth, decimal minHeight, int maxRectangleCount, + decimal whitespaceFuzziness, int maxBoundQueueSize) + { + QueueEntries queueEntries = new QueueEntries(maxBoundQueueSize); + queueEntries.Enqueue(new QueueEntry(bound, obstacles, whitespaceFuzziness)); + + HashSet selected = new HashSet(); + HashSet holdList = new HashSet(); + + while (queueEntries.Any()) + { + var current = queueEntries.Dequeue(); + + if (current.IsEmptyEnough(obstacles)) + { + if (selected.Any(c => Inside(c, current.Bound))) continue; + + // A check was added which impeded the algorithm from accepting + // rectangles which were not adjacent to an already accepted + // rectangle, or to the border of the page. + if (!IsAdjacentToPageBounds(bound, current.Bound) && // NOT in contact to border page AND + !selected.Any(q => IsAdjacentTo(q, current.Bound))) // NOT in contact to any already accepted rectangle + { + // In order to maintain the correctness of the algorithm, + // rejected rectangles are put in a hold list. + holdList.Add(current); + continue; + } + + selected.Add(current.Bound); + + if (selected.Count >= maxRectangleCount) return selected.ToList(); + + obstacles.Add(current.Bound); + + // Each time a new rectangle is identified and accepted, this hold list + // will be added back to the queue in case any of them will have become valid. + foreach (var hold in holdList) + { + queueEntries.Enqueue(hold); + } + + // After a maximal rectangle has been found, it is added back to the list + // of obstacles. Whenever a QueueEntry is dequeued, its list of obstacles + // can be recomputed to include newly identified whitespace rectangles. + foreach (var overlapping in queueEntries) + { + if (OverlapsHard(current.Bound, overlapping.Bound)) + overlapping.AddWhitespace(current.Bound); + } + + continue; + } + + var pivot = current.GetPivot(); + var b = current.Bound; + + List subRectangles = new List(); + + var rRight = new PdfRectangle(pivot.Right, b.Bottom, b.Right, b.Top); + if (b.Right > pivot.Right && rRight.Height > minHeight && rRight.Width > minWidth) + { + queueEntries.Enqueue(new QueueEntry(rRight, + new HashSet(current.Obstacles.Where(o => OverlapsHard(rRight, o))), + whitespaceFuzziness)); + } + + var rLeft = new PdfRectangle(b.Left, b.Bottom, pivot.Left, b.Top); + if (b.Left < pivot.Left && rLeft.Height > minHeight && rLeft.Width > minWidth) + { + queueEntries.Enqueue(new QueueEntry(rLeft, + new HashSet(current.Obstacles.Where(o => OverlapsHard(rLeft, o))), + whitespaceFuzziness)); + } + + var rAbove = new PdfRectangle(b.Left, b.Bottom, b.Right, pivot.Bottom); + if (b.Bottom < pivot.Bottom && rAbove.Height > minHeight && rAbove.Width > minWidth) + { + queueEntries.Enqueue(new QueueEntry(rAbove, + new HashSet(current.Obstacles.Where(o => OverlapsHard(rAbove, o))), + whitespaceFuzziness)); + } + + var rBelow = new PdfRectangle(b.Left, pivot.Top, b.Right, b.Top); + if (b.Top > pivot.Top && rBelow.Height > minHeight && rBelow.Width > minWidth) + { + queueEntries.Enqueue(new QueueEntry(rBelow, + new HashSet(current.Obstacles.Where(o => OverlapsHard(rBelow, o))), + whitespaceFuzziness)); + } + } + + return selected.ToList(); + } + + private static bool IsAdjacentTo(PdfRectangle rectangle1, PdfRectangle rectangle2) + { + if (rectangle1.Left > rectangle2.Right || + rectangle2.Left > rectangle1.Right || + rectangle1.Top < rectangle2.Bottom || + rectangle2.Top < rectangle1.Bottom) + { + return false; + } + + if (rectangle1.Left == rectangle2.Right || + rectangle1.Right == rectangle2.Left || + rectangle1.Bottom == rectangle2.Top || + rectangle1.Top == rectangle2.Bottom) + { + return true; + } + return false; + } + + private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle) + { + if (rectangle.Bottom == pageBound.Bottom || + rectangle.Top == pageBound.Top || + rectangle.Left == pageBound.Left || + rectangle.Right == pageBound.Right) + { + return true; + } + + return false; + } + + private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2) + { + if (rectangle1.Left >= rectangle2.Right || + rectangle2.Left >= rectangle1.Right || + rectangle1.Top <= rectangle2.Bottom || + rectangle2.Top <= rectangle1.Bottom) + { + return false; + } + + return true; + } + + private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2) + { + if (rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left && + rectangle2.Top <= rectangle1.Top && rectangle2.Bottom >= rectangle1.Bottom) + { + return true; + } + + return false; + } + + private static PdfRectangle GetBound(IEnumerable obstacles) + { + return new PdfRectangle( + obstacles.Min(b => b.Left), + obstacles.Min(b => b.Bottom), + obstacles.Max(b => b.Right), + obstacles.Max(b => b.Top)); + } + + #region Sorted Queue + private class QueueEntries : SortedSet + { + int bound; + + public QueueEntries(int maximumBound) + { + this.bound = maximumBound; + } + + public QueueEntry Dequeue() + { + var current = this.Max; + this.Remove(current); + return current; + } + + public void Enqueue(QueueEntry queueEntry) + { + if (this.bound > 0 && this.Count > this.bound) + { + this.Remove(this.Min); + } + this.Add(queueEntry); + } + } + + private class QueueEntry : IComparable + { + public PdfRectangle Bound { get; private set; } + + public decimal Quality { get; private set; } + + public HashSet Obstacles { get; private set; } + + private decimal WhitespaceFuzziness; + + public QueueEntry(PdfRectangle bound, HashSet obstacles, decimal whitespaceFuzziness) + { + this.Bound = bound; + this.Quality = ScoringFunction(Bound); + this.Obstacles = obstacles; + this.WhitespaceFuzziness = whitespaceFuzziness; + } + + public PdfRectangle GetPivot() + { + int indexMiddle = Distances.FindIndexNearest(Bound.Centroid, + Obstacles.Select(o => o.Centroid).ToList(), + p => p, p => p, Distances.Euclidean, out double d); + + return indexMiddle == -1 ? Obstacles.First() : Obstacles.ElementAt(indexMiddle); + } + + public bool IsEmptyEnough() + { + return !Obstacles.Any(); + } + + public bool IsEmptyEnough(IEnumerable pageObstacles) + { + if (IsEmptyEnough()) return true; + + decimal sum = 0; + foreach (var obstacle in pageObstacles) + { + var intersect = Bound.Intersect(obstacle); + if (!intersect.HasValue) return false; + + decimal minimumArea = MinimumOverlappingArea(obstacle, Bound, WhitespaceFuzziness); + + if (intersect.Value.Area > minimumArea) + { + return false; + } + sum += intersect.Value.Area; + } + return sum < Bound.Area * WhitespaceFuzziness; + } + + public override string ToString() + { + return "Q=" + Quality.ToString("#0.0") + ", O=" + Obstacles.Count + ", " + Bound.ToString(); + } + + public void AddWhitespace(PdfRectangle rectangle) + { + Obstacles.Add(rectangle); + } + + public int CompareTo(QueueEntry entry) + { + return this.Quality.CompareTo(entry.Quality); + } + + public override bool Equals(object obj) + { + if (obj is QueueEntry entry) + { + if (this.Bound.Left != entry.Bound.Left || + this.Bound.Right != entry.Bound.Right || + this.Bound.Top != entry.Bound.Top || + this.Bound.Bottom != entry.Bound.Bottom || + this.Obstacles != entry.Obstacles) return false; + return true; + } + return false; + } + + public override int GetHashCode() + { + return (Bound.Left, Bound.Right, + Bound.Top, Bound.Bottom, + Obstacles).GetHashCode(); + } + + private static decimal MinimumOverlappingArea(PdfRectangle r1, PdfRectangle r2, decimal whitespaceFuzziness) + { + return Math.Min(r1.Area, r2.Area) * whitespaceFuzziness; + } + + /// + /// The scoring function Q(r) which is subsequently used to sort a priority queue. + /// + /// + private static decimal ScoringFunction(PdfRectangle rectangle) + { + // As can be seen, tall rectangles are preferred. The trick while choosing this Q(r) was + // to keep that preference while still allowing wide rectangles to be chosen. After having + // experimented with quite a few variations, this simple function was considered a good + // solution. + return rectangle.Area * (rectangle.Height / 4m); + } + + private static decimal OverlappingArea(PdfRectangle rectangle1, PdfRectangle rectangle2) + { + var intersect = rectangle1.Intersect(rectangle2); + if (intersect.HasValue) + { + return intersect.Value.Area; + } + return 0; + } + } + #endregion + } +}