diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs index 948c10b1..8fd1c736 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs @@ -1,31 +1,115 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector { + using System; using System.Collections.Generic; using System.Linq; /// - /// Algorithm that retrieve the blocks' reading order using both (spatial) Allen’s interval relations and rendering order (TextSequence). + /// Algorithm that retrieves the blocks' reading order using spatial reasoning (Allen’s interval relations) and possibly the rendering order (TextSequence). /// See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern and 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz. /// public class UnsupervisedReadingOrderDetector : IReadingOrderDetector { + /// + /// The rules encoding the spatial reasoning constraints. + /// See 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz. + /// + public enum SpatialReasoningRules + { + /// + /// Basic spatial reasoning. + /// In western culture the reading order is from left to right and from top to bottom. + /// + Basic = 0, + + /// + /// Text-blocks are read in rows from left-to-right, top-to-bottom. + /// The diagonal direction 'left-bottom to top-right' cannot be present among the Basic relations allowed. + /// + RowWise = 1, + + /// + /// Text-blocks are read in columns, from top-to-bottom and from left-to-right.
+ /// The diagonal direction 'right-top to bottom-left' cannot be present among the Basic relations allowed. + /// + ColumnWise = 2 + } + /// /// Create an instance of unsupervised reading order detector, . - /// This detector uses the (spatial) Allen’s interval relations and rendering order (TextSequence). + /// This detector uses spatial reasoning (Allen’s interval relations) and possibly the rendering order (TextSequence). /// public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector(); - private readonly double T; + /// + /// Whether or not to also use the rendering order, as indicated by the TextSequence. + /// + public bool UseRenderingOrder { get; } /// - /// Algorithm that retrieve the blocks' reading order using both (spatial) Allen’s interval relations and rendering order. + /// The rule to be used that encodes the spatial reasoning constraints. + /// + public SpatialReasoningRules SpatialReasoningRule { get; } + + /// + /// The tolerance parameter T. If two coordinates are closer than T they are considered equal. + /// This flexibility is necessary because, due to the inherent noise in the PDF extraction, text blocks in the + /// same column might not be exactly aligned. + /// + public double T { get; } + + private Func getBeforeInMethod; + + /// + /// Algorithm that retrieves the blocks' reading order using spatial reasoning (Allen’s interval relations) and possibly the rendering order (TextSequence). /// /// The tolerance parameter T. If two coordinates are closer than T they are considered equal. /// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the /// same column might not be exactly aligned. - public UnsupervisedReadingOrderDetector(double T = 5) + /// The rule to be used that encodes the spatial reasoning constraints. + /// Whether or not to also use the rendering order, as indicated by the TextSequence.
+ public UnsupervisedReadingOrderDetector(double T = 5, SpatialReasoningRules spatialReasoningRule = SpatialReasoningRules.ColumnWise, bool useRenderingOrder = true) { this.T = T; + this.SpatialReasoningRule = spatialReasoningRule; + this.UseRenderingOrder = useRenderingOrder; + + switch (SpatialReasoningRule) + { + case SpatialReasoningRules.ColumnWise: + if (UseRenderingOrder) + { + getBeforeInMethod = (TextBlock a, TextBlock b, double T) => GetBeforeInReadingVertical(a, b, T) || GetBeforeInRendering(a, b); + } + else + { + getBeforeInMethod = GetBeforeInReadingVertical; + } + break; + + case SpatialReasoningRules.RowWise: + if (UseRenderingOrder) + { + getBeforeInMethod = (TextBlock a, TextBlock b, double T) => GetBeforeInReadingHorizontal(a, b, T) || GetBeforeInRendering(a, b); + } + else + { + getBeforeInMethod = GetBeforeInReadingHorizontal; + } + break; + + case SpatialReasoningRules.Basic: + default: + if (UseRenderingOrder) + { + getBeforeInMethod = (TextBlock a, TextBlock b, double T) => GetBeforeInReading(a, b, T) || GetBeforeInRendering(a, b); + } + else + { + getBeforeInMethod = GetBeforeInReading; + } + break; + } } /// @@ -78,7 +162,8 @@ if (i == j) continue; var b = textBlocks[j]; - if (GetBeforeInReadingRendering(a, b, T)) + //if (GetBeforeInReadingRendering(a, b, T)) + if (getBeforeInMethod(a, b, T)) { graph[i].Add(j); } @@ -88,19 +173,20 @@ return graph; } - private bool GetBeforeInReadingRendering(TextBlock a, TextBlock b, double T) - { - return GetBeforeInReadingVertical(a, b, T) || GetBeforeInRendering(a, b); - } - - private bool GetBeforeInRendering(TextBlock a, TextBlock b) + private static bool GetBeforeInRendering(TextBlock a, TextBlock b) { var avgTextSequenceA = a.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average(); var avgTextSequenceB = b.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average(); return avgTextSequenceA < avgTextSequenceB; } - 
private bool GetBeforeInReading(TextBlock a, TextBlock b, double T) + /// + /// Rule encoding the fact that in western culture the reading order is from left to right and from top to bottom. + /// + /// + /// + /// The tolerance parameter T. + private static bool GetBeforeInReading(TextBlock a, TextBlock b, double T) { IntervalRelations xRelation = GetIntervalRelationX(a, b, T); IntervalRelations yRelation = GetIntervalRelationY(a, b, T); @@ -119,8 +205,7 @@ /// /// /// The tolerance parameter T. - /// - private bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T) + private static bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T) { IntervalRelations xRelation = GetIntervalRelationX(a, b, T); IntervalRelations yRelation = GetIntervalRelationY(a, b, T); @@ -150,7 +235,7 @@ /// /// /// The tolerance parameter T. - private bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T) + private static bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T) { IntervalRelations xRelation = GetIntervalRelationX(a, b, T); IntervalRelations yRelation = GetIntervalRelationY(a, b, T); @@ -183,7 +268,7 @@ /// /// /// The tolerance parameter T. If two coordinates are closer than T they are considered equal. - private IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T) + private static IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T) { if (a.BoundingBox.Right < b.BoundingBox.Left - T) { @@ -267,7 +352,7 @@ /// /// /// The tolerance parameter T. If two coordinates are closer than T they are considered equal. - private IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T) + private static IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T) { if (a.BoundingBox.Bottom < b.BoundingBox.Top - T) {