namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
{
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// Algorithm that retrieves the blocks' reading order using both (spatial) Allen’s interval relations and rendering order (TextSequence).
/// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern and 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz.</para>
/// </summary>
public class UnsupervisedReadingOrderDetector : IReadingOrderDetector
{
/// <summary>
/// Shared <see cref="UnsupervisedReadingOrderDetector"/> instance, created with the default tolerance.
/// <para>This detector uses the (spatial) Allen’s interval relations and rendering order (TextSequence).</para>
/// </summary>
public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector();

/// <summary>
/// Tolerance used when comparing block coordinates. Assigned once, by the constructor.
/// </summary>
private readonly double T;

/// <summary>
/// Algorithm that retrieves the blocks' reading order using both (spatial) Allen’s interval relations and rendering order.
/// </summary>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.
/// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the
/// same column might not be exactly aligned.</param>
public UnsupervisedReadingOrderDetector(double T = 5)
{
    this.T = T;
}
/// <summary>
/// Gets the blocks in reading order and assigns each block its zero-based reading order
/// (through <c>SetReadingOrder</c>) as it is yielded.
/// </summary>
/// <param name="textBlocks">The <see cref="TextBlock"/>s to order.</param>
/// <returns>The same blocks, lazily enumerated in reading order.</returns>
public IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks)
{
    int readingOrder = 0;
    var graph = BuildGraph(textBlocks, T);

    while (graph.Count > 0)
    {
        // The next block is the one that is "before" the largest number of remaining blocks.
        // Ties go to the lowest block index so that the result does not depend on the
        // (unspecified) enumeration order of the dictionary.
        int index = -1;
        int maxCount = -1;
        foreach (var node in graph)
        {
            int count = node.Value.Count;
            if (count > maxCount || (count == maxCount && node.Key < index))
            {
                maxCount = count;
                index = node.Key;
            }
        }

        // Drop the selected node and every edge pointing at it.
        graph.Remove(index);
        foreach (var successors in graph.Values)
        {
            successors.Remove(index);
        }

        var block = textBlocks[index];
        block.SetReadingOrder(readingOrder++);
        yield return block;
    }
}
/// <summary>
/// Builds the directed "is before" graph over the blocks.
/// <para>Both relations (spatial and rendering order) are incorporated into a single partial ordering
/// of blocks: there is an edge from block i to block j when at least one of the two relations holds
/// for the pair (i, j).</para>
/// </summary>
/// <param name="textBlocks">The blocks; a block's position in this list is its node key.</param>
/// <param name="T">The tolerance parameter T forwarded to the spatial relation.</param>
/// <returns>For every block index, the list of block indexes it is considered to be before.</returns>
private Dictionary<int, List<int>> BuildGraph(IReadOnlyList<TextBlock> textBlocks, double T)
{
    int count = textBlocks.Count;
    var graph = new Dictionary<int, List<int>>(count);

    for (int i = 0; i < count; i++)
    {
        var a = textBlocks[i];
        var successors = new List<int>();

        for (int j = 0; j < count; j++)
        {
            // A block is never compared with itself.
            if (i != j && GetBeforeInReadingRendering(a, textBlocks[j], T))
            {
                successors.Add(j);
            }
        }

        graph.Add(i, successors);
    }

    return graph;
}
/// <summary>
/// Combined relation: <paramref name="a"/> is before <paramref name="b"/> when the column-wise spatial
/// relation holds or, failing that, when the rendering-order relation holds.
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <param name="T">The tolerance parameter T.</param>
private bool GetBeforeInReadingRendering(TextBlock a, TextBlock b, double T)
{
    if (GetBeforeInReadingVertical(a, b, T))
    {
        return true;
    }

    // Only consulted when the spatial relation does not hold.
    return GetBeforeInRendering(a, b);
}
/// <summary>
/// Rendering-order relation: compares the average <c>TextSequence</c> of all letters of each block.
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <returns>True when the average for <paramref name="a"/> is strictly lower than the one for <paramref name="b"/>.</returns>
private bool GetBeforeInRendering(TextBlock a, TextBlock b)
{
    var meanSequenceOfA = (from line in a.TextLines
                           from word in line.Words
                           from letter in word.Letters
                           select letter.TextSequence).Average();

    var meanSequenceOfB = (from line in b.TextLines
                           from word in line.Words
                           from letter in word.Letters
                           select letter.TextSequence).Average();

    return meanSequenceOfA < meanSequenceOfB;
}
/// <summary>
/// Returns true when the X relation or the Y relation between <paramref name="a"/> and
/// <paramref name="b"/> is Precedes, Meets or Overlaps.
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <param name="T">The tolerance parameter T.</param>
private bool GetBeforeInReading(TextBlock a, TextBlock b, double T)
{
    var alongX = GetIntervalRelationX(a, b, T);
    var alongY = GetIntervalRelationY(a, b, T);

    bool beforeAlongX = alongX == IntervalRelations.Precedes
                        || alongX == IntervalRelations.Meets
                        || alongX == IntervalRelations.Overlaps;

    bool beforeAlongY = alongY == IntervalRelations.Precedes
                        || alongY == IntervalRelations.Meets
                        || alongY == IntervalRelations.Overlaps;

    return beforeAlongX || beforeAlongY;
}
/// <summary>
/// Column-wise: text-blocks are read in columns, from top-to-bottom and from left-to-right.
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <param name="T">The tolerance parameter T.</param>
/// <returns>True when <paramref name="a"/> is before <paramref name="b"/> in column-wise reading.</returns>
private bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T)
{
    var horizontal = GetIntervalRelationX(a, b, T);
    var vertical = GetIntervalRelationY(a, b, T);

    switch (horizontal)
    {
        // Strictly to the left (or touching on the left): before, whatever the Y relation is.
        case IntervalRelations.Precedes:
        case IntervalRelations.Meets:
            return true;

        // Any other X relation that shares horizontal extent: the Y relation decides.
        case IntervalRelations.Overlaps:
        case IntervalRelations.Starts:
        case IntervalRelations.FinishesI:
        case IntervalRelations.Equals:
        case IntervalRelations.During:
        case IntervalRelations.DuringI:
        case IntervalRelations.Finishes:
        case IntervalRelations.StartsI:
        case IntervalRelations.OverlapsI:
            return vertical == IntervalRelations.Precedes
                   || vertical == IntervalRelations.Meets
                   || vertical == IntervalRelations.Overlaps;

        // Unknown, PrecedesI and MeetsI never qualify.
        default:
            return false;
    }
}
/// <summary>
/// Row-wise: text-blocks are read in rows from left-to-right, top-to-bottom.
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <param name="T">The tolerance parameter T.</param>
/// <returns>True when <paramref name="a"/> is before <paramref name="b"/> in row-wise reading.</returns>
private bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
{
    var horizontal = GetIntervalRelationX(a, b, T);
    var vertical = GetIntervalRelationY(a, b, T);

    switch (vertical)
    {
        // Precedes or Meets along Y: before, whatever the X relation is.
        case IntervalRelations.Precedes:
        case IntervalRelations.Meets:
            return true;

        // Any other Y relation that shares vertical extent: the X relation decides.
        case IntervalRelations.Overlaps:
        case IntervalRelations.Starts:
        case IntervalRelations.FinishesI:
        case IntervalRelations.Equals:
        case IntervalRelations.During:
        case IntervalRelations.DuringI:
        case IntervalRelations.Finishes:
        case IntervalRelations.StartsI:
        case IntervalRelations.OverlapsI:
            return horizontal == IntervalRelations.Precedes
                   || horizontal == IntervalRelations.Meets
                   || horizontal == IntervalRelations.Overlaps;

        // Unknown, PrecedesI and MeetsI never qualify.
        default:
            return false;
    }
}
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// </summary>
/// <param name="a">The first block; its interval is [Left, Right] of its bounding box.</param>
/// <param name="b">The second block; its interval is [Left, Right] of its bounding box.</param>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
/// <returns>
/// The relation of <paramref name="a"/> with respect to <paramref name="b"/>. Each inverse relation is the
/// corresponding direct relation evaluated with a and b swapped. <see cref="IntervalRelations.Unknown"/> is
/// only returned when no test matches (e.g. NaN coordinates; a negative T is not expected).
/// </returns>
private IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
{
    double aStart = a.BoundingBox.Left;
    double aEnd = a.BoundingBox.Right;
    double bStart = b.BoundingBox.Left;
    double bEnd = b.BoundingBox.Right;

    // The tests below are evaluated in order; the first one that holds wins.

    // a ends more than T before b starts.
    if (aEnd < bStart - T)
    {
        return IntervalRelations.Precedes;
    }

    // b ends more than T before a starts.
    if (bEnd < aStart - T)
    {
        return IntervalRelations.PrecedesI;
    }

    // The end of a coincides (within T) with the start of b.
    if (bStart - T <= aEnd && aEnd <= bStart + T)
    {
        return IntervalRelations.Meets;
    }

    // The end of b coincides (within T) with the start of a.
    if (aStart - T <= bEnd && bEnd <= aStart + T)
    {
        return IntervalRelations.MeetsI;
    }

    // From here on the two intervals share more than T of extent.
    if (aStart < bStart - T && bStart + T < aEnd && aEnd < bEnd - T)
    {
        return IntervalRelations.Overlaps;
    }

    if (bStart < aStart - T && aStart + T < bEnd && bEnd < aEnd - T)
    {
        return IntervalRelations.OverlapsI;
    }

    // Same start, a ends first.
    if (bStart - T <= aStart && aStart <= bStart + T && aEnd < bEnd - T)
    {
        return IntervalRelations.Starts;
    }

    // Same start, b ends first.
    if (aStart - T <= bStart && bStart <= aStart + T && bEnd < aEnd - T)
    {
        return IntervalRelations.StartsI;
    }

    // a lies strictly inside b.
    if (aStart > bStart + T && aEnd < bEnd - T)
    {
        return IntervalRelations.During;
    }

    // b lies strictly inside a.
    if (bStart > aStart + T && bEnd < aEnd - T)
    {
        return IntervalRelations.DuringI;
    }

    // Same end, a starts later.
    if (aStart > bStart + T && bEnd - T <= aEnd && aEnd <= bEnd + T)
    {
        return IntervalRelations.Finishes;
    }

    // Same end, b starts later.
    if (bStart > aStart + T && aEnd - T <= bEnd && bEnd <= aEnd + T)
    {
        return IntervalRelations.FinishesI;
    }

    // Same start and same end.
    if (bStart - T <= aStart && aStart <= bStart + T
        && bEnd - T <= aEnd && aEnd <= bEnd + T)
    {
        return IntervalRelations.Equals;
    }

    return IntervalRelations.Unknown;
}
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// </summary>
/// <param name="a">The first block; its interval runs from the Top to the Bottom of its bounding box.</param>
/// <param name="b">The second block; its interval runs from the Top to the Bottom of its bounding box.</param>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
/// <returns>
/// The relation of <paramref name="a"/> with respect to <paramref name="b"/> in reading direction (top to bottom).
/// Each inverse relation is the corresponding direct relation evaluated with a and b swapped.
/// <see cref="IntervalRelations.Unknown"/> is only returned when no test matches (e.g. NaN coordinates;
/// a negative T is not expected).
/// </returns>
private IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
{
    // Reading goes from the top of the page to the bottom. The coordinates are negated so that
    // an interval starts at the block's top edge and grows towards its bottom edge.
    // NOTE(review): assumes Y grows upwards (Top >= Bottom), as in PDF user space — confirm.
    double aStart = -a.BoundingBox.Top;
    double aEnd = -a.BoundingBox.Bottom;
    double bStart = -b.BoundingBox.Top;
    double bEnd = -b.BoundingBox.Bottom;

    // The tests below are evaluated in order; the first one that holds wins.

    // a ends more than T above the start of b.
    if (aEnd < bStart - T)
    {
        return IntervalRelations.Precedes;
    }

    // b ends more than T above the start of a.
    if (bEnd < aStart - T)
    {
        return IntervalRelations.PrecedesI;
    }

    // The bottom of a coincides (within T) with the top of b.
    if (bStart - T <= aEnd && aEnd <= bStart + T)
    {
        return IntervalRelations.Meets;
    }

    // The bottom of b coincides (within T) with the top of a.
    if (aStart - T <= bEnd && bEnd <= aStart + T)
    {
        return IntervalRelations.MeetsI;
    }

    // From here on the two intervals share more than T of extent.
    if (aStart < bStart - T && bStart + T < aEnd && aEnd < bEnd - T)
    {
        return IntervalRelations.Overlaps;
    }

    if (bStart < aStart - T && aStart + T < bEnd && bEnd < aEnd - T)
    {
        return IntervalRelations.OverlapsI;
    }

    // Same top, a ends first.
    if (bStart - T <= aStart && aStart <= bStart + T && aEnd < bEnd - T)
    {
        return IntervalRelations.Starts;
    }

    // Same top, b ends first.
    if (aStart - T <= bStart && bStart <= aStart + T && bEnd < aEnd - T)
    {
        return IntervalRelations.StartsI;
    }

    // a lies strictly inside b.
    if (aStart > bStart + T && aEnd < bEnd - T)
    {
        return IntervalRelations.During;
    }

    // b lies strictly inside a.
    if (bStart > aStart + T && bEnd < aEnd - T)
    {
        return IntervalRelations.DuringI;
    }

    // Same bottom, a starts later.
    if (aStart > bStart + T && bEnd - T <= aEnd && aEnd <= bEnd + T)
    {
        return IntervalRelations.Finishes;
    }

    // Same bottom, b starts later.
    if (bStart > aStart + T && aEnd - T <= bEnd && bEnd <= aEnd + T)
    {
        return IntervalRelations.FinishesI;
    }

    // Same top and same bottom.
    if (bStart - T <= aStart && aStart <= bStart + T
        && bEnd - T <= aEnd && aEnd <= bEnd + T)
    {
        return IntervalRelations.Equals;
    }

    return IntervalRelations.Unknown;
}
/// <summary>
/// Allen’s interval thirteen relations, plus an <see cref="Unknown"/> placeholder.
/// <para>See https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para>
/// </summary>
private enum IntervalRelations
{
/// <summary>
/// Unknown interval relations.
/// </summary>
Unknown,
/// <summary>
/// X takes place before Y.
/// <para>|____X____|......................</para>
/// <para>......................|____Y____|</para>
/// </summary>
Precedes,
/// <summary>
/// X meets Y.
/// <para>|____X____|.................</para>
/// <para>.................|____Y____|</para>
/// </summary>
Meets,
/// <summary>
/// X overlaps with Y.
/// <para>|______X______|.................</para>
/// <para>.................|______Y______|</para>
/// </summary>
Overlaps,
/// <summary>
/// X starts Y.
/// <para>|____X____|.................</para>
/// <para>|_____Y_____|..............</para>
/// </summary>
Starts,
/// <summary>
/// X during Y.
/// <para>........|____X____|.........</para>
/// <para>.....|______Y______|.....</para>
/// </summary>
During,
/// <summary>
/// X finishes Y.
/// <para>.................|____X____|</para>
/// <para>..............|_____Y_____|</para>
/// </summary>
Finishes,
/// <summary>
/// Inverse precedes: Y takes place before X.
/// </summary>
PrecedesI,
/// <summary>
/// Inverse meets: Y meets X.
/// </summary>
MeetsI,
/// <summary>
/// Inverse overlaps: Y overlaps with X.
/// </summary>
OverlapsI,
/// <summary>
/// Inverse starts: Y starts X.
/// </summary>
StartsI,
/// <summary>
/// Inverse during: Y during X.
/// </summary>
DuringI,
/// <summary>
/// Inverse finishes: Y finishes X.
/// </summary>
FinishesI,
/// <summary>
/// X is equal to Y.
/// <para>..........|____X____|............</para>
/// <para>..........|____Y____|............</para>
/// </summary>
Equals
}
/// <summary>
/// Compares two graph entries (key, list) by the number of items in their list, ascending.
/// </summary>
private class NodeComparer : IComparer<KeyValuePair<int, List<int>>>
{
    /// <summary>
    /// Compares the <c>Count</c> of the two entries' lists.
    /// </summary>
    /// <param name="x">The first entry.</param>
    /// <param name="y">The second entry.</param>
    /// <returns>A negative value, zero or a positive value when the list of <paramref name="x"/> holds
    /// fewer, as many or more items than the list of <paramref name="y"/>.</returns>
    public int Compare(KeyValuePair<int, List<int>> x, KeyValuePair<int, List<int>> y)
    {
        return x.Value.Count.CompareTo(y.Value.Count);
    }
}
}
}