2020-01-05 00:38:18 +08:00
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
2019-12-18 19:41:39 +08:00
{
2020-01-05 00:38:18 +08:00
using Content ;
2020-01-05 17:19:58 +08:00
using Core ;
2020-01-05 00:38:18 +08:00
using Geometry ;
2020-01-11 02:08:33 +08:00
using System ;
using System.Collections.Generic ;
using System.Linq ;
2020-01-05 00:38:18 +08:00
2019-12-18 19:41:39 +08:00
/// <summary>
/// A top-down algorithm that finds a cover of the background whitespace of a document in terms of maximal empty rectangles.
/// <para>See Section 3.2 of 'High precision text extraction from PDF documents' by Øyvind Raddum Berg and Section 2 of 'Two geometric algorithms for layout analysis' by Thomas M. Breuel.</para>
/// </summary>
public static class WhitespaceCoverExtractor
{
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// <para>The lower bounds for the width and height of the rectangles are derived from the letters of
/// <paramref name="words"/>: 1.25 times the mode of the glyph rectangles' widths and heights respectively.</para>
/// </summary>
/// <param name="words">The words in the page. The sequence is materialised once if it is not already a list.</param>
/// <param name="images">The images in the page. May be <c>null</c>.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="words"/> is <c>null</c>.</exception>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<Word> words, IEnumerable<IPdfImage> images = null, int maxRectangleCount = 40, int maxBoundQueueSize = 0)
{
    if (words == null)
    {
        throw new ArgumentNullException(nameof(words));
    }

    // Snapshot the words so that a lazily evaluated sequence is not re-run for the width
    // statistic, the height statistic and once more when the bounding boxes are collected.
    var wordList = words as IReadOnlyList<Word> ?? words.ToList();

    // Both statistics are computed from the same glyph rectangles, so gather them only once.
    var glyphRectangles = wordList.SelectMany(w => w.Letters)
                                  .Select(l => l.GlyphRectangle)
                                  .ToList();

    // NOTE(review): the value Mode() yields for an empty or multi-modal input is defined
    // outside this file - confirm the resulting lower bounds are usable in those cases.
    var minWidth = glyphRectangles.Select(r => r.Width).Mode() * 1.25;
    var minHeight = glyphRectangles.Select(r => r.Height).Mode() * 1.25;

    return GetWhitespaces(wordList,
        images,
        minWidth,
        minHeight,
        maxRectangleCount: maxRectangleCount,
        maxBoundQueueSize: maxBoundQueueSize);
}
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// <para>Words and images whose bounding box has a width or height that is not strictly positive
/// are not used as obstacles.</para>
/// </summary>
/// <param name="words">The words in the page.</param>
/// <param name="images">The images in the page. May be <c>null</c>.</param>
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<Word> words, IEnumerable<IPdfImage> images,
    double minWidth, double minHeight, int maxRectangleCount = 40, double whitespaceFuzziness = 0.15, int maxBoundQueueSize = 0)
{
    // Every word with a non-degenerate bounding box is an obstacle.
    var obstacleBoxes = new List<PdfRectangle>(
        from word in words
        where word.BoundingBox.Width > 0 && word.BoundingBox.Height > 0
        select word.BoundingBox);

    // Images are optional; when present, their non-degenerate bounds are obstacles too.
    bool hasImages = images != null && images.Any();
    if (hasImages)
    {
        obstacleBoxes.AddRange(
            from image in images
            where image.Bounds.Width > 0 && image.Bounds.Height > 0
            select image.Bounds);
    }

    return GetWhitespaces(obstacleBoxes,
        minWidth,
        minHeight,
        maxRectangleCount,
        whitespaceFuzziness,
        maxBoundQueueSize);
}
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// <para>The search area is the smallest rectangle that contains every obstacle.</para>
/// </summary>
/// <param name="boundingboxes">The list of obstacles' bounding boxes in the page. Enumerated exactly once.</param>
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles; an empty list when there are no obstacles.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="boundingboxes"/> is <c>null</c>.</exception>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<PdfRectangle> boundingboxes,
    double minWidth, double minHeight, int maxRectangleCount = 40, double whitespaceFuzziness = 0.15, int maxBoundQueueSize = 0)
{
    if (boundingboxes == null)
    {
        throw new ArgumentNullException(nameof(boundingboxes));
    }

    // Build the set first and test its size afterwards: the input is then enumerated a single
    // time, which matters for lazily evaluated sequences.
    var obstacles = new HashSet<PdfRectangle>(boundingboxes);
    if (obstacles.Count == 0)
    {
        // Without obstacles no page bound can be computed.
        return Array.Empty<PdfRectangle>();
    }

    var pageBound = GetBound(obstacles);

    return GetMaximalRectangles(pageBound,
        obstacles,
        minWidth: minWidth,
        minHeight: minHeight,
        maxRectangleCount: maxRectangleCount,
        whitespaceFuzziness: whitespaceFuzziness,
        maxBoundQueueSize: maxBoundQueueSize);
}
/// <summary>
/// Top-down search for whitespace rectangles inside <paramref name="bound"/>.
/// <para>Candidates are taken from a priority queue, best score first. A candidate that is empty enough is
/// accepted if it touches the page bound or an already accepted rectangle, otherwise it is put on hold.
/// A candidate that is not empty enough is split around a pivot obstacle into up to four smaller candidates.</para>
/// </summary>
/// <param name="bound">The area to search, also used as the page bound for the adjacency check.</param>
/// <param name="obstacles">The page obstacles. This set is modified: every accepted rectangle is added to it.</param>
/// <param name="minWidth">Sub-rectangles must be strictly wider than this value to be considered.</param>
/// <param name="minHeight">Sub-rectangles must be strictly taller than this value to be considered.</param>
/// <param name="maxRectangleCount">The search stops as soon as this many rectangles were accepted.
/// Values of zero or less yield an empty result.</param>
/// <param name="whitespaceFuzziness">Overlap allowance handed to every queue entry.</param>
/// <param name="maxBoundQueueSize">Size limit handed to the queue.</param>
/// <returns>The accepted whitespace rectangles.</returns>
private static IReadOnlyList<PdfRectangle> GetMaximalRectangles(PdfRectangle bound,
    HashSet<PdfRectangle> obstacles, double minWidth, double minHeight, int maxRectangleCount,
    double whitespaceFuzziness, int maxBoundQueueSize)
{
    // The count check below only runs after a rectangle was accepted, so without this guard a
    // request for zero (or fewer) rectangles would still return one.
    if (maxRectangleCount <= 0)
    {
        return Array.Empty<PdfRectangle>();
    }

    var queueEntries = new QueueEntries(maxBoundQueueSize);
    queueEntries.Enqueue(new QueueEntry(bound, obstacles, whitespaceFuzziness));

    var selected = new HashSet<PdfRectangle>();
    var holdList = new HashSet<QueueEntry>();

    while (queueEntries.Count > 0)
    {
        var current = queueEntries.Dequeue();

        if (current.IsEmptyEnough(obstacles))
        {
            // Skip candidates that lie completely inside an already accepted rectangle.
            if (selected.Any(c => Inside(c, current.Bound)))
            {
                continue;
            }

            // A check was added which impeded the algorithm from accepting
            // rectangles which were not adjacent to an already accepted
            // rectangle, or to the border of the page.
            bool touchesPage = IsAdjacentToPageBounds(bound, current.Bound);
            if (!touchesPage && !selected.Any(q => IsAdjacentTo(q, current.Bound)))
            {
                // In order to maintain the correctness of the algorithm,
                // rejected rectangles are put in a hold list.
                holdList.Add(current);
                continue;
            }

            selected.Add(current.Bound);

            if (selected.Count >= maxRectangleCount)
            {
                return selected.ToList();
            }

            obstacles.Add(current.Bound);

            // Each time a new rectangle is identified and accepted, this hold list
            // will be added back to the queue in case any of them will have become valid.
            // NOTE(review): the hold list is never cleared, so entries that were already
            // accepted or split are offered to the queue again. This looks like redundant
            // work only - confirm before changing it.
            foreach (var hold in holdList)
            {
                queueEntries.Enqueue(hold);
            }

            // After a maximal rectangle has been found, it is added back to the list
            // of obstacles. Whenever a QueueEntry is dequeued, its list of obstacles
            // can be recomputed to include newly identified whitespace rectangles.
            foreach (var overlapping in queueEntries)
            {
                if (OverlapsHard(current.Bound, overlapping.Bound))
                {
                    overlapping.AddWhitespace(current.Bound);
                }
            }

            continue;
        }

        // Not empty enough: split the candidate around the pivot obstacle. The order
        // (right, left, bottom side, top side) is the order in which the parts are enqueued.
        var pivot = current.GetPivot();
        var b = current.Bound;

        // Part between the pivot's right edge and the bound's right edge.
        var rightSide = new PdfRectangle(pivot.Right, b.Bottom, b.Right, b.Top);
        if (b.Right > pivot.Right)
        {
            EnqueueIfLargeEnough(queueEntries, current, rightSide, minWidth, minHeight, whitespaceFuzziness);
        }

        // Part between the bound's left edge and the pivot's left edge.
        var leftSide = new PdfRectangle(b.Left, b.Bottom, pivot.Left, b.Top);
        if (b.Left < pivot.Left)
        {
            EnqueueIfLargeEnough(queueEntries, current, leftSide, minWidth, minHeight, whitespaceFuzziness);
        }

        // Part between the bound's bottom edge and the pivot's bottom edge.
        var bottomSide = new PdfRectangle(b.Left, b.Bottom, b.Right, pivot.Bottom);
        if (b.Bottom < pivot.Bottom)
        {
            EnqueueIfLargeEnough(queueEntries, current, bottomSide, minWidth, minHeight, whitespaceFuzziness);
        }

        // Part between the pivot's top edge and the bound's top edge.
        var topSide = new PdfRectangle(b.Left, pivot.Top, b.Right, b.Top);
        if (b.Top > pivot.Top)
        {
            EnqueueIfLargeEnough(queueEntries, current, topSide, minWidth, minHeight, whitespaceFuzziness);
        }
    }

    return selected.ToList();
}

/// <summary>
/// Enqueues <paramref name="candidate"/> as a new entry when it is strictly larger than the minimum size.
/// The new entry only keeps the obstacles of <paramref name="parent"/> that overlap the candidate.
/// </summary>
private static void EnqueueIfLargeEnough(QueueEntries queue, QueueEntry parent, PdfRectangle candidate,
    double minWidth, double minHeight, double whitespaceFuzziness)
{
    if (candidate.Height > minHeight && candidate.Width > minWidth)
    {
        var overlappingObstacles = new HashSet<PdfRectangle>(
            parent.Obstacles.Where(o => OverlapsHard(candidate, o)));

        queue.Enqueue(new QueueEntry(candidate, overlappingObstacles, whitespaceFuzziness));
    }
}
/// <summary>
/// Checks whether two rectangles are in contact along an edge: they must not be separated on
/// either axis, and one edge of the first must have exactly the same coordinate as the opposite
/// edge of the second.
/// </summary>
private static bool IsAdjacentTo(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    bool separated = rectangle1.Left > rectangle2.Right ||
                     rectangle2.Left > rectangle1.Right ||
                     rectangle1.Top < rectangle2.Bottom ||
                     rectangle2.Top < rectangle1.Bottom;

    if (separated)
    {
        return false;
    }

    // Exact comparisons on purpose: the coordinates are compared as stored, without tolerance.
    bool sharesVerticalEdge = rectangle1.Left == rectangle2.Right ||
                              rectangle1.Right == rectangle2.Left;

    bool sharesHorizontalEdge = rectangle1.Bottom == rectangle2.Top ||
                                rectangle1.Top == rectangle2.Bottom;

    return sharesVerticalEdge || sharesHorizontalEdge;
}
/// <summary>
/// Checks whether at least one edge of <paramref name="rectangle"/> has exactly the same
/// coordinate as the corresponding edge of <paramref name="pageBound"/>.
/// </summary>
private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle)
{
    if (rectangle.Bottom == pageBound.Bottom)
    {
        return true;
    }

    if (rectangle.Top == pageBound.Top)
    {
        return true;
    }

    if (rectangle.Left == pageBound.Left)
    {
        return true;
    }

    return rectangle.Right == pageBound.Right;
}
/// <summary>
/// Checks whether two rectangles overlap with strict inequalities on both axes, so rectangles
/// that merely share an edge or a corner do not count as overlapping.
/// </summary>
private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    bool overlapsHorizontally = rectangle1.Left < rectangle2.Right &&
                                rectangle2.Left < rectangle1.Right;

    bool overlapsVertically = rectangle1.Top > rectangle2.Bottom &&
                              rectangle2.Top > rectangle1.Bottom;

    return overlapsHorizontally && overlapsVertically;
}
/// <summary>
/// Checks whether <paramref name="rectangle2"/> lies within <paramref name="rectangle1"/>.
/// Shared edges are allowed, so a rectangle is inside itself.
/// </summary>
private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    bool withinHorizontally = rectangle2.Right <= rectangle1.Right &&
                              rectangle2.Left >= rectangle1.Left;

    bool withinVertically = rectangle2.Top <= rectangle1.Top &&
                            rectangle2.Bottom >= rectangle1.Bottom;

    return withinHorizontally && withinVertically;
}
/// <summary>
/// Computes the rectangle spanned by the smallest left and bottom and the largest right and top
/// coordinates found in <paramref name="obstacles"/>.
/// </summary>
private static PdfRectangle GetBound(IEnumerable<PdfRectangle> obstacles)
{
    var left = obstacles.Select(o => o.Left).Min();
    var bottom = obstacles.Select(o => o.Bottom).Min();
    var right = obstacles.Select(o => o.Right).Max();
    var top = obstacles.Select(o => o.Top).Max();

    return new PdfRectangle(left, bottom, right, top);
}
#region Sorted Queue
/// <summary>
/// A priority queue of queue entries, kept sorted by the entries' own comparison and
/// optionally limited to a maximum number of entries.
/// </summary>
private class QueueEntries : SortedSet<QueueEntry>
{
    // Maximum number of entries to keep; values of zero or less disable the limit.
    private readonly int bound;

    /// <summary>
    /// Creates an empty queue.
    /// </summary>
    /// <param name="maximumBound">The maximum number of entries to keep. Zero or less means no limit.</param>
    public QueueEntries(int maximumBound)
    {
        bound = maximumBound;
    }

    /// <summary>
    /// Removes and returns the entry that sorts highest.
    /// </summary>
    public QueueEntry Dequeue()
    {
        var current = Max;
        Remove(current);
        return current;
    }

    /// <summary>
    /// Adds an entry. When a limit is set and the queue would exceed it, the entry that sorts
    /// lowest is dropped, which may be the entry that was just added.
    /// </summary>
    public void Enqueue(QueueEntry queueEntry)
    {
        // Add returns false when an entry that compares equal is already present. Nothing was
        // inserted in that case, so nothing has to be evicted either.
        if (!Add(queueEntry))
        {
            return;
        }

        // Trim after inserting: the queue never holds more than 'bound' entries and it is
        // always the lowest ranked entry that is given up.
        if (bound > 0 && Count > bound)
        {
            Remove(Min);
        }
    }
}
/// <summary>
/// A candidate rectangle together with the obstacles that overlap it and the score used to
/// rank it in the queue.
/// </summary>
private class QueueEntry : IComparable<QueueEntry>
{
    // Score of the bound, computed once at construction.
    private readonly double quality;

    // Overlap allowance used by IsEmptyEnough(IEnumerable<PdfRectangle>).
    private readonly double whitespaceFuzziness;

    /// <summary>
    /// The candidate rectangle.
    /// </summary>
    public PdfRectangle Bound { get; }

    /// <summary>
    /// The obstacles associated with this candidate. The set is modified by <see cref="AddWhitespace"/>.
    /// </summary>
    public HashSet<PdfRectangle> Obstacles { get; }

    /// <summary>
    /// Creates an entry for <paramref name="bound"/>. The given obstacle set is stored as is, not copied.
    /// </summary>
    public QueueEntry(PdfRectangle bound, HashSet<PdfRectangle> obstacles, double whitespaceFuzziness)
    {
        Bound = bound;
        quality = ScoringFunction(Bound);
        Obstacles = obstacles;
        this.whitespaceFuzziness = whitespaceFuzziness;
    }

    /// <summary>
    /// Gets the obstacle whose centroid is nearest (Euclidean distance) to the centroid of the bound.
    /// Falls back to the first obstacle when the search reports no index.
    /// </summary>
    public PdfRectangle GetPivot()
    {
        // One snapshot serves both the search and the lookup, so the index returned by the
        // search always refers to the same ordering.
        var candidates = Obstacles.ToList();

        int indexMiddle = Distances.FindIndexNearest(Bound.Centroid,
            candidates.Select(o => o.Centroid).ToList(),
            p => p, p => p, Distances.Euclidean, out double _);

        return indexMiddle == -1 ? Obstacles.First() : candidates[indexMiddle];
    }

    /// <summary>
    /// Returns <c>true</c> when this entry has no obstacles.
    /// </summary>
    public bool IsEmptyEnough()
    {
        return Obstacles.Count == 0;
    }

    /// <summary>
    /// Returns <c>true</c> when this entry has no obstacles, or when every page obstacle intersects
    /// the bound by no more than the allowed area and the summed intersection area stays below
    /// the bound's area multiplied by the fuzziness.
    /// </summary>
    /// <param name="pageObstacles">The obstacles of the whole page.</param>
    public bool IsEmptyEnough(IEnumerable<PdfRectangle> pageObstacles)
    {
        if (IsEmptyEnough())
        {
            return true;
        }

        double sum = 0;
        foreach (var obstacle in pageObstacles)
        {
            var intersect = Bound.Intersect(obstacle);
            if (!intersect.HasValue)
            {
                // NOTE(review): a page obstacle that does not intersect the bound ends the
                // check with 'false' instead of being skipped - confirm this is intended.
                return false;
            }

            double minimumArea = MinimumOverlappingArea(obstacle, Bound, whitespaceFuzziness);

            if (intersect.Value.Area > minimumArea)
            {
                return false;
            }

            sum += intersect.Value.Area;
        }

        return sum < Bound.Area * whitespaceFuzziness;
    }

    /// <summary>
    /// Returns the score, the number of obstacles and the bound as text.
    /// </summary>
    public override string ToString()
    {
        return "Q=" + quality.ToString("#0.0") + ", O=" + Obstacles.Count + ", " + Bound.ToString();
    }

    /// <summary>
    /// Adds an accepted whitespace rectangle to the obstacles of this entry.
    /// </summary>
    public void AddWhitespace(PdfRectangle rectangle)
    {
        Obstacles.Add(rectangle);
    }

    /// <summary>
    /// Orders entries by score. Entries with the same score are ordered by the left, bottom, right
    /// and top edge of their bound, so only entries covering the very same rectangle compare as equal.
    /// </summary>
    public int CompareTo(QueueEntry entry)
    {
        // By convention any instance sorts after null.
        if (entry is null)
        {
            return 1;
        }

        int result = quality.CompareTo(entry.quality);
        if (result != 0)
        {
            return result;
        }

        // Tie-break on the geometry. The queue is a sorted set that treats a comparison result
        // of zero as "already present": ranking by score alone made it silently discard
        // different rectangles that happen to have the same score.
        result = Bound.Left.CompareTo(entry.Bound.Left);
        if (result != 0)
        {
            return result;
        }

        result = Bound.Bottom.CompareTo(entry.Bound.Bottom);
        if (result != 0)
        {
            return result;
        }

        result = Bound.Right.CompareTo(entry.Bound.Right);
        if (result != 0)
        {
            return result;
        }

        return Bound.Top.CompareTo(entry.Bound.Top);
    }

    /// <summary>
    /// Two entries are equal when their bounds have the same edges and they share the same
    /// obstacle set instance.
    /// </summary>
    public override bool Equals(object obj)
    {
        if (obj is QueueEntry entry)
        {
            return Bound.Left == entry.Bound.Left &&
                   Bound.Right == entry.Bound.Right &&
                   Bound.Top == entry.Bound.Top &&
                   Bound.Bottom == entry.Bound.Bottom &&
                   Obstacles == entry.Obstacles;
        }

        return false;
    }

    /// <summary>
    /// Hash code built from the same members that <see cref="Equals(object)"/> compares.
    /// </summary>
    public override int GetHashCode()
    {
        return (Bound.Left, Bound.Right,
                Bound.Top, Bound.Bottom,
                Obstacles).GetHashCode();
    }

    /// <summary>
    /// The largest intersection area tolerated between two rectangles: the smaller of both
    /// areas multiplied by the fuzziness.
    /// </summary>
    private static double MinimumOverlappingArea(PdfRectangle r1, PdfRectangle r2, double whitespaceFuzziness)
    {
        return Math.Min(r1.Area, r2.Area) * whitespaceFuzziness;
    }

    /// <summary>
    /// The scoring function Q(r) which is subsequently used to sort a priority queue.
    /// </summary>
    /// <param name="rectangle">The rectangle to score.</param>
    private static double ScoringFunction(PdfRectangle rectangle)
    {
        // As can be seen, tall rectangles are preferred. The trick while choosing this Q(r) was
        // to keep that preference while still allowing wide rectangles to be chosen. After having
        // experimented with quite a few variations, this simple function was considered a good
        // solution.
        return rectangle.Area * (rectangle.Height / 4.0);
    }
}
#endregion
}
}