diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs
index 948c10b1..8fd1c736 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs
@@ -1,31 +1,115 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
{
+ using System;
using System.Collections.Generic;
using System.Linq;
///
- /// Algorithm that retrieve the blocks' reading order using both (spatial) Allen’s interval relations and rendering order (TextSequence).
+ /// Algorithm that retrieves the blocks' reading order using spatial reasoning (Allen’s interval relations) and, optionally, the rendering order (TextSequence).
/// See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern and 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz.
///
public class UnsupervisedReadingOrderDetector : IReadingOrderDetector
{
+ ///
+ /// The rules encoding the spatial reasoning constraints.
+ /// See 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz.
+ ///
+ public enum SpatialReasoningRules
+ {
+ ///
+ /// Basic spatial reasoning.
+ /// In western culture the reading order is from left to right and from top to bottom.
+ ///
+ Basic = 0,
+
+ ///
+ /// Text blocks are read in rows: from left to right, then from top to bottom.
+ /// The diagonal direction 'left-bottom to top-right' cannot be present among the Basic relations allowed.
+ ///
+ RowWise = 1,
+
+ ///
+ /// Text blocks are read in columns: from top to bottom, then from left to right.
+ /// The diagonal direction 'right-top to bottom-left' cannot be present among the Basic relations allowed.
+ ///
+ ColumnWise = 2
+ }
+
///
/// Create an instance of unsupervised reading order detector, .
- /// This detector uses the (spatial) Allen’s interval relations and rendering order (TextSequence).
+ /// This detector uses spatial reasoning (Allen’s interval relations) and possibly the rendering order (TextSequence).
///
public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector();
- private readonly double T;
+ ///
+ /// Whether or not to also use the rendering order, as indicated by the TextSequence.
+ ///
+ public bool UseRenderingOrder { get; }
///
- /// Algorithm that retrieve the blocks' reading order using both (spatial) Allen’s interval relations and rendering order.
+ /// The rule to be used that encodes the spatial reasoning constraints.
+ ///
+ public SpatialReasoningRules SpatialReasoningRule { get; }
+
+ ///
+ /// The tolerance parameter T. If two coordinates are closer than T they are considered equal.
+ /// This flexibility is necessary because, due to the inherent noise in PDF extraction, text blocks in the
+ /// same column might not be exactly aligned.
+ ///
+ public double T { get; }
+
+ private readonly Func<TextBlock, TextBlock, double, bool> getBeforeInMethod; // 'a is read before b' predicate taking (a, b, tolerance); set once in the constructor.
+
+ ///
+ /// Create a reading order detector that retrieves the blocks' reading order using spatial reasoning (Allen’s interval relations) and, optionally, the rendering order (TextSequence).
///
/// The tolerance parameter T. If two coordinates are closer than T they are considered equal.
/// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the
/// same column might not be exactly aligned.
- public UnsupervisedReadingOrderDetector(double T = 5)
+ /// The rule to be used that encodes the spatial reasoning constraints.
+ /// Whether or not to also use the rendering order, as indicated by the TextSequence.
+ public UnsupervisedReadingOrderDetector(double T = 5, SpatialReasoningRules spatialReasoningRule = SpatialReasoningRules.ColumnWise, bool useRenderingOrder = true)
{
this.T = T;
+ SpatialReasoningRule = spatialReasoningRule;
+ UseRenderingOrder = useRenderingOrder;
+
+ // Pick the pairwise 'a is read before b' predicate once, from the two options,
+ // and store it in getBeforeInMethod for later use.
+ if (useRenderingOrder)
+ {
+ // Spatial rule first; '||' only falls back to the rendering order when the spatial rule returns false.
+ switch (spatialReasoningRule)
+ {
+ case SpatialReasoningRules.ColumnWise:
+ getBeforeInMethod = (TextBlock first, TextBlock second, double tolerance) => GetBeforeInReadingVertical(first, second, tolerance) || GetBeforeInRendering(first, second);
+ break;
+ case SpatialReasoningRules.RowWise:
+ getBeforeInMethod = (TextBlock first, TextBlock second, double tolerance) => GetBeforeInReadingHorizontal(first, second, tolerance) || GetBeforeInRendering(first, second);
+ break;
+ default:
+ // SpatialReasoningRules.Basic, and any value that is not a named member of the enum.
+ getBeforeInMethod = (TextBlock first, TextBlock second, double tolerance) => GetBeforeInReading(first, second, tolerance) || GetBeforeInRendering(first, second);
+ break;
+ }
+ }
+ else
+ {
+ // Spatial reasoning only: bind the selected rule's method group directly.
+ switch (spatialReasoningRule)
+ {
+ case SpatialReasoningRules.ColumnWise:
+ getBeforeInMethod = GetBeforeInReadingVertical;
+ break;
+ case SpatialReasoningRules.RowWise:
+ getBeforeInMethod = GetBeforeInReadingHorizontal;
+ break;
+ default:
+ // SpatialReasoningRules.Basic, and any value that is not a named member of the enum.
+ getBeforeInMethod = GetBeforeInReading;
+ break;
+ }
+ }
}
///
@@ -78,7 +162,8 @@
if (i == j) continue;
var b = textBlocks[j];
- if (GetBeforeInReadingRendering(a, b, T))
+ // Add an edge i -> j when the configured rule says block a comes before block b.
+ if (getBeforeInMethod(a, b, T))
{
graph[i].Add(j);
}
@@ -88,19 +173,20 @@
return graph;
}
- private bool GetBeforeInReadingRendering(TextBlock a, TextBlock b, double T)
- {
- return GetBeforeInReadingVertical(a, b, T) || GetBeforeInRendering(a, b);
- }
-
- private bool GetBeforeInRendering(TextBlock a, TextBlock b)
+ private static bool GetBeforeInRendering(TextBlock a, TextBlock b)
{
var avgTextSequenceA = a.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average();
var avgTextSequenceB = b.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average();
return avgTextSequenceA < avgTextSequenceB;
}
- private bool GetBeforeInReading(TextBlock a, TextBlock b, double T)
+ ///
+ /// Rule encoding the fact that in western culture the reading order is from left to right and from top to bottom.
+ ///
+ ///
+ ///
+ /// The tolerance parameter T.
+ private static bool GetBeforeInReading(TextBlock a, TextBlock b, double T)
{
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
@@ -119,8 +205,7 @@
///
///
/// The tolerance parameter T.
- ///
- private bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T)
+ private static bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T)
{
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
@@ -150,7 +235,7 @@
///
///
/// The tolerance parameter T.
- private bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
+ private static bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
{
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
@@ -183,7 +268,7 @@
///
///
/// The tolerance parameter T. If two coordinates are closer than T they are considered equal.
- private IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
+ private static IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
{
if (a.BoundingBox.Right < b.BoundingBox.Left - T)
{
@@ -267,7 +352,7 @@
///
///
/// The tolerance parameter T. If two coordinates are closer than T they are considered equal.
- private IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
+ private static IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
{
if (a.BoundingBox.Bottom < b.BoundingBox.Top - T)
{