namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
    using Content;
    using Core;
    using Geometry;
    using System;
    using System.Collections.Generic;
    using System.Linq;

    /// <summary>
    /// A top-down algorithm that finds a cover of the background whitespace of a document in terms of maximal empty rectangles.
    /// <para>See Section 3.2 of 'High precision text extraction from PDF documents' by Øyvind Raddum Berg and
    /// Section 2 of 'Two geometric algorithms for layout analysis' by Thomas M. Breuel.</para>
    /// </summary>
    public static class WhitespaceCoverExtractor
    {
        /// <summary>
        /// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
        /// <para>The minimum rectangle width and height are set to 1.25 times the mode of the
        /// letters' glyph rectangle width and height respectively.</para>
        /// </summary>
        /// <param name="words">The words in the page.</param>
        /// <param name="images">The images in the page. May be null.</param>
        /// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
        /// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm. Values of 0 or less leave the queue unbounded.</param>
        /// <returns>The identified whitespace rectangles.</returns>
        public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<Word> words, IEnumerable<IPdfImage> images = null,
            int maxRectangleCount = 40, int maxBoundQueueSize = 0)
        {
            // Materialise once: the words are needed for both letter statistics and the bounding boxes.
            var wordList = words as IReadOnlyCollection<Word> ?? words.ToList();
            var glyphRectangles = wordList.SelectMany(w => w.Letters).Select(l => l.GlyphRectangle).ToList();

            return GetWhitespaces(wordList,
                images,
                glyphRectangles.Select(r => r.Width).Mode() * 1.25,
                glyphRectangles.Select(r => r.Height).Mode() * 1.25,
                maxRectangleCount: maxRectangleCount,
                maxBoundQueueSize: maxBoundQueueSize);
        }

        /// <summary>
        /// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
        /// <para>Words and images whose bounding box has a zero width or height are ignored.</para>
        /// </summary>
        /// <param name="words">The words in the page.</param>
        /// <param name="images">The images in the page. May be null.</param>
        /// <param name="minWidth">Lower bounds for the width of rectangles.</param>
        /// <param name="minHeight">Lower bounds for the height of rectangles.</param>
        /// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
        /// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
        /// surrounding obstacles by some percent. Default value is 15%.</param>
        /// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm. Values of 0 or less leave the queue unbounded.</param>
        /// <returns>The identified whitespace rectangles.</returns>
        public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<Word> words, IEnumerable<IPdfImage> images,
            double minWidth, double minHeight, int maxRectangleCount = 40, double whitespaceFuzziness = 0.15, int maxBoundQueueSize = 0)
        {
            var bboxes = words.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0)
                .Select(w => w.BoundingBox)
                .ToList();

            if (images != null)
            {
                // Adding an empty sequence is a no-op, so no separate Any() pass is required.
                bboxes.AddRange(images.Where(i => i.Bounds.Width > 0 && i.Bounds.Height > 0).Select(i => i.Bounds));
            }

            return GetWhitespaces(bboxes,
                minWidth: minWidth,
                minHeight: minHeight,
                maxRectangleCount: maxRectangleCount,
                whitespaceFuzziness: whitespaceFuzziness,
                maxBoundQueueSize: maxBoundQueueSize);
        }

        /// <summary>
        /// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
        /// <para>The page bound used by the algorithm is the bounding rectangle of all the obstacles.</para>
        /// </summary>
        /// <param name="boundingboxes">The list of obstacles' bounding boxes in the page.</param>
        /// <param name="minWidth">Lower bounds for the width of rectangles.</param>
        /// <param name="minHeight">Lower bounds for the height of rectangles.</param>
        /// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
        /// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
        /// surrounding obstacles by some percent. Default value is 15%.</param>
        /// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm. Values of 0 or less leave the queue unbounded.</param>
        /// <returns>The identified whitespace rectangles, or an empty list if there are no obstacles.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="boundingboxes"/> is null.</exception>
        public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<PdfRectangle> boundingboxes,
            double minWidth, double minHeight, int maxRectangleCount = 40, double whitespaceFuzziness = 0.15, int maxBoundQueueSize = 0)
        {
            if (boundingboxes == null)
            {
                throw new ArgumentNullException(nameof(boundingboxes));
            }

            // Enumerate the input exactly once; the set is owned (and mutated) by the algorithm.
            var obstacles = new HashSet<PdfRectangle>(boundingboxes);
            if (obstacles.Count == 0)
            {
                return Array.Empty<PdfRectangle>();
            }

            var pageBound = GetBound(obstacles);

            return GetMaximalRectangles(pageBound,
                obstacles,
                minWidth: minWidth,
                minHeight: minHeight,
                maxRectangleCount: maxRectangleCount,
                whitespaceFuzziness: whitespaceFuzziness,
                maxBoundQueueSize: maxBoundQueueSize);
        }

        /// <summary>
        /// Runs the branch-and-bound search: the best scoring candidate is dequeued, accepted if it is
        /// empty enough, otherwise split around a pivot obstacle into up to four sub-rectangles.
        /// </summary>
        /// <param name="bound">The page bound, i.e. the first candidate rectangle.</param>
        /// <param name="obstacles">The page obstacles. Accepted whitespace rectangles are added to this set.</param>
        /// <param name="minWidth">Sub-rectangles must be strictly wider than this value.</param>
        /// <param name="minHeight">Sub-rectangles must be strictly taller than this value.</param>
        /// <param name="maxRectangleCount">The search stops once this many rectangles were accepted.</param>
        /// <param name="whitespaceFuzziness">Allowed overlap ratio, forwarded to each queue entry.</param>
        /// <param name="maxBoundQueueSize">Maximum size of the candidates queue. Values of 0 or less leave the queue unbounded.</param>
        private static IReadOnlyList<PdfRectangle> GetMaximalRectangles(PdfRectangle bound,
            HashSet<PdfRectangle> obstacles, double minWidth, double minHeight, int maxRectangleCount,
            double whitespaceFuzziness, int maxBoundQueueSize)
        {
            var queueEntries = new QueueEntries(maxBoundQueueSize);
            queueEntries.Enqueue(new QueueEntry(bound, obstacles, whitespaceFuzziness));

            var selected = new HashSet<PdfRectangle>();
            var holdList = new HashSet<QueueEntry>();

            while (queueEntries.Count > 0)
            {
                var current = queueEntries.Dequeue();

                if (current.IsEmptyEnough(obstacles))
                {
                    // Skip candidates already covered by an accepted rectangle.
                    if (selected.Any(c => Inside(c, current.Bound)))
                    {
                        continue;
                    }

                    // A check was added which impeded the algorithm from accepting
                    // rectangles which were not adjacent to an already accepted
                    // rectangle, or to the border of the page.
                    if (!IsAdjacentToPageBounds(bound, current.Bound) &&        // NOT in contact to border page AND
                        !selected.Any(q => IsAdjacentTo(q, current.Bound)))     // NOT in contact to any already accepted rectangle
                    {
                        // In order to maintain the correctness of the algorithm,
                        // rejected rectangles are put in a hold list.
                        holdList.Add(current);
                        continue;
                    }

                    selected.Add(current.Bound);

                    if (selected.Count >= maxRectangleCount)
                    {
                        return selected.ToList();
                    }

                    obstacles.Add(current.Bound);

                    // Each time a new rectangle is identified and accepted, this hold list
                    // will be added back to the queue in case any of them will have become valid.
                    // The hold list itself is not cleared, so held entries are offered again
                    // after every acceptance.
                    foreach (var hold in holdList)
                    {
                        queueEntries.Enqueue(hold);
                    }

                    // After a maximal rectangle has been found, it is added back to the list
                    // of obstacles. Whenever a QueueEntry is dequeued, its list of obstacles
                    // can be recomputed to include newly identified whitespace rectangles.
                    foreach (var overlapping in queueEntries)
                    {
                        if (OverlapsHard(current.Bound, overlapping.Bound))
                        {
                            overlapping.AddWhitespace(current.Bound);
                        }
                    }

                    continue;
                }

                // Not empty enough: split the candidate around the pivot obstacle.
                var pivot = current.GetPivot();
                var b = current.Bound;

                var rightOfPivot = new PdfRectangle(pivot.Right, b.Bottom, b.Right, b.Top);
                EnqueueSubRectangle(queueEntries, current, rightOfPivot, b.Right > pivot.Right, minWidth, minHeight, whitespaceFuzziness);

                var leftOfPivot = new PdfRectangle(b.Left, b.Bottom, pivot.Left, b.Top);
                EnqueueSubRectangle(queueEntries, current, leftOfPivot, b.Left < pivot.Left, minWidth, minHeight, whitespaceFuzziness);

                // Spans from the bottom of the candidate up to the bottom of the pivot.
                var underPivot = new PdfRectangle(b.Left, b.Bottom, b.Right, pivot.Bottom);
                EnqueueSubRectangle(queueEntries, current, underPivot, b.Bottom < pivot.Bottom, minWidth, minHeight, whitespaceFuzziness);

                // Spans from the top of the pivot up to the top of the candidate.
                var overPivot = new PdfRectangle(b.Left, pivot.Top, b.Right, b.Top);
                EnqueueSubRectangle(queueEntries, current, overPivot, b.Top > pivot.Top, minWidth, minHeight, whitespaceFuzziness);
            }

            return selected.ToList();
        }

        /// <summary>
        /// Enqueues <paramref name="candidate"/> if it exists (the pivot does not reach the parent's edge)
        /// and is strictly larger than the minimum size. The new entry only keeps the parent's obstacles
        /// that overlap the candidate.
        /// </summary>
        private static void EnqueueSubRectangle(QueueEntries queueEntries, QueueEntry parent, PdfRectangle candidate,
            bool pivotLeavesRoom, double minWidth, double minHeight, double whitespaceFuzziness)
        {
            if (!pivotLeavesRoom || !(candidate.Height > minHeight) || !(candidate.Width > minWidth))
            {
                return;
            }

            var candidateObstacles = new HashSet<PdfRectangle>(parent.Obstacles.Where(o => OverlapsHard(candidate, o)));
            queueEntries.Enqueue(new QueueEntry(candidate, candidateObstacles, whitespaceFuzziness));
        }

        /// <summary>
        /// Checks if the two rectangles touch: they are not apart, and at least one pair of
        /// opposite edges has exactly the same coordinate.
        /// </summary>
        private static bool IsAdjacentTo(PdfRectangle rectangle1, PdfRectangle rectangle2)
        {
            if (rectangle1.Left > rectangle2.Right ||
                rectangle2.Left > rectangle1.Right ||
                rectangle1.Top < rectangle2.Bottom ||
                rectangle2.Top < rectangle1.Bottom)
            {
                return false;
            }

            // Exact comparison: touching edges originate from the same coordinates.
            return rectangle1.Left == rectangle2.Right ||
                   rectangle1.Right == rectangle2.Left ||
                   rectangle1.Bottom == rectangle2.Top ||
                   rectangle1.Top == rectangle2.Bottom;
        }

        /// <summary>
        /// Checks if at least one edge of the rectangle lies exactly on the matching edge of the page bound.
        /// </summary>
        private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle)
        {
            return rectangle.Bottom == pageBound.Bottom ||
                   rectangle.Top == pageBound.Top ||
                   rectangle.Left == pageBound.Left ||
                   rectangle.Right == pageBound.Right;
        }

        /// <summary>
        /// Checks if the interiors of the two rectangles overlap. Rectangles that only share an edge do not overlap.
        /// </summary>
        private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2)
        {
            return rectangle1.Left < rectangle2.Right &&
                   rectangle2.Left < rectangle1.Right &&
                   rectangle1.Top > rectangle2.Bottom &&
                   rectangle2.Top > rectangle1.Bottom;
        }

        /// <summary>
        /// Checks if <paramref name="rectangle2"/> is inside <paramref name="rectangle1"/> (shared edges allowed).
        /// </summary>
        private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2)
        {
            return rectangle2.Right <= rectangle1.Right &&
                   rectangle2.Left >= rectangle1.Left &&
                   rectangle2.Top <= rectangle1.Top &&
                   rectangle2.Bottom >= rectangle1.Bottom;
        }

        /// <summary>
        /// Gets the rectangle spanning the extreme left, bottom, right and top values of all the obstacles.
        /// </summary>
        private static PdfRectangle GetBound(IEnumerable<PdfRectangle> obstacles)
        {
            return new PdfRectangle(
                obstacles.Min(b => b.Left),
                obstacles.Min(b => b.Bottom),
                obstacles.Max(b => b.Right),
                obstacles.Max(b => b.Top));
        }

        #region Sorted Queue
        /// <summary>
        /// Priority queue of candidates, ordered by <see cref="QueueEntry.CompareTo(QueueEntry)"/>.
        /// The best candidate is the maximum of the set.
        /// </summary>
        private class QueueEntries : SortedSet<QueueEntry>
        {
            private readonly int bound;

            /// <param name="maximumBound">Maximum number of entries kept. Values of 0 or less leave the queue unbounded.</param>
            public QueueEntries(int maximumBound)
            {
                bound = maximumBound;
            }

            /// <summary>
            /// Removes and returns the best (maximum) entry.
            /// </summary>
            public QueueEntry Dequeue()
            {
                var current = Max;
                Remove(current);
                return current;
            }

            /// <summary>
            /// Adds the entry, then drops the worst (minimum) entry if the queue exceeds its maximum size.
            /// </summary>
            public void Enqueue(QueueEntry queueEntry)
            {
                Add(queueEntry);

                // Trim after adding so that the queue never holds more than 'bound' entries
                // and the entry dropped is the worst one, new entry included.
                if (bound > 0 && Count > bound)
                {
                    Remove(Min);
                }
            }
        }

        /// <summary>
        /// A candidate rectangle together with the obstacles that overlap it.
        /// </summary>
        private class QueueEntry : IComparable<QueueEntry>
        {
            private readonly double quality;
            private readonly double whitespaceFuzziness;

            /// <summary>The candidate rectangle.</summary>
            public PdfRectangle Bound { get; }

            /// <summary>The obstacles tracked for this candidate.</summary>
            public HashSet<PdfRectangle> Obstacles { get; }

            public QueueEntry(PdfRectangle bound, HashSet<PdfRectangle> obstacles, double whitespaceFuzziness)
            {
                Bound = bound;
                quality = ScoringFunction(Bound);
                Obstacles = obstacles;
                this.whitespaceFuzziness = whitespaceFuzziness;
            }

            /// <summary>
            /// Gets the obstacle whose centroid is the nearest to the centroid of the bound,
            /// or the first obstacle if no nearest index is returned.
            /// </summary>
            public PdfRectangle GetPivot()
            {
                int indexMiddle = Distances.FindIndexNearest(Bound.Centroid,
                    Obstacles.Select(o => o.Centroid).ToList(),
                    p => p, p => p, Distances.Euclidean, out double _);

                return indexMiddle == -1 ? Obstacles.First() : Obstacles.ElementAt(indexMiddle);
            }

            /// <summary>
            /// Checks that this candidate has no obstacle at all.
            /// </summary>
            public bool IsEmptyEnough()
            {
                return Obstacles.Count == 0;
            }

            /// <summary>
            /// Checks that this candidate either has no obstacle, or only overlaps the page obstacles
            /// within the limits set by the whitespace fuzziness.
            /// </summary>
            /// <param name="pageObstacles">The obstacles of the whole page.</param>
            public bool IsEmptyEnough(IEnumerable<PdfRectangle> pageObstacles)
            {
                if (IsEmptyEnough())
                {
                    return true;
                }

                double sum = 0;
                foreach (var obstacle in pageObstacles)
                {
                    var intersect = Bound.Intersect(obstacle);
                    if (!intersect.HasValue)
                    {
                        // NOTE(review): a page obstacle that does not intersect the bound fails the
                        // whole check, so the fuzziness only applies when every page obstacle
                        // intersects the bound. Confirm whether skipping such obstacles was intended.
                        return false;
                    }

                    double minimumArea = MinimumOverlappingArea(obstacle, Bound, whitespaceFuzziness);

                    if (intersect.Value.Area > minimumArea)
                    {
                        return false;
                    }

                    sum += intersect.Value.Area;
                }

                return sum < Bound.Area * whitespaceFuzziness;
            }

            public override string ToString()
            {
                return "Q=" + quality.ToString("#0.0") + ", O=" + Obstacles.Count + ", " + Bound.ToString();
            }

            /// <summary>
            /// Registers an accepted whitespace rectangle as an obstacle of this candidate.
            /// </summary>
            public void AddWhitespace(PdfRectangle rectangle)
            {
                Obstacles.Add(rectangle);
            }

            /// <summary>
            /// Orders entries by quality, then by the coordinates of their bound.
            /// <para>The queue is a sorted set, which treats entries comparing as 0 as duplicates: without
            /// the tie-break, distinct rectangles sharing the same score would be silently discarded.</para>
            /// </summary>
            public int CompareTo(QueueEntry entry)
            {
                if (entry is null)
                {
                    return 1;
                }

                int result = quality.CompareTo(entry.quality);
                if (result != 0)
                {
                    return result;
                }

                result = Bound.Left.CompareTo(entry.Bound.Left);
                if (result != 0)
                {
                    return result;
                }

                result = Bound.Bottom.CompareTo(entry.Bound.Bottom);
                if (result != 0)
                {
                    return result;
                }

                result = Bound.Right.CompareTo(entry.Bound.Right);
                if (result != 0)
                {
                    return result;
                }

                return Bound.Top.CompareTo(entry.Bound.Top);
            }

            public override bool Equals(object obj)
            {
                if (obj is QueueEntry entry)
                {
                    // Obstacles are compared by reference.
                    return Bound.Left == entry.Bound.Left &&
                           Bound.Right == entry.Bound.Right &&
                           Bound.Top == entry.Bound.Top &&
                           Bound.Bottom == entry.Bound.Bottom &&
                           Obstacles == entry.Obstacles;
                }

                return false;
            }

            public override int GetHashCode()
            {
                return (Bound.Left, Bound.Right, Bound.Top, Bound.Bottom, Obstacles).GetHashCode();
            }

            /// <summary>
            /// Gets the largest intersection area tolerated between the two rectangles:
            /// the smallest of the two areas, scaled by the whitespace fuzziness.
            /// </summary>
            private static double MinimumOverlappingArea(PdfRectangle r1, PdfRectangle r2, double whitespaceFuzziness)
            {
                return Math.Min(r1.Area, r2.Area) * whitespaceFuzziness;
            }

            /// <summary>
            /// The scoring function Q(r) which is subsequently used to sort a priority queue.
            /// </summary>
            /// <param name="rectangle">The rectangle to score.</param>
            private static double ScoringFunction(PdfRectangle rectangle)
            {
                // As can be seen, tall rectangles are preferred. The trick while choosing this Q(r) was
                // to keep that preference while still allowing wide rectangles to be chosen. After having
                // experimented with quite a few variations, this simple function was considered a good
                // solution.
                return rectangle.Area * (rectangle.Height / 4.0);
            }
        }
        #endregion
    }
}