namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
{
    using System;
    using System.Collections.Generic;
    using System.Linq;

    /// <summary>
    /// Algorithm that retrieves the blocks' reading order using both (spatial) Allen's interval relations
    /// and rendering order (TextSequence).
    /// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles'
    /// by S. Klampfl, M. Granitzer, K. Jack, R. Kern and 'Document Understanding for a Broad Class of Documents'
    /// by L. Todoran, M. Worring, M. Aiello and C. Monz.</para>
    /// </summary>
    public class UnsupervisedReadingOrderDetector : IReadingOrderDetector
    {
        /// <summary>
        /// Shared instance of the unsupervised reading order detector, built with the default tolerance.
        /// <para>This detector uses the (spatial) Allen's interval relations and rendering order (TextSequence).</para>
        /// </summary>
        public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector();

        /// <summary>
        /// Tolerance below which two coordinates are considered equal. Only assigned in the constructor.
        /// </summary>
        private readonly double tolerance;

        /// <summary>
        /// Algorithm that retrieves the blocks' reading order using both (spatial) Allen's interval relations
        /// and rendering order.
        /// </summary>
        /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.
        /// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the
        /// same column might not be exactly aligned.</param>
        public UnsupervisedReadingOrderDetector(double T = 5)
        {
            tolerance = T;
        }

        /// <summary>
        /// Gets the blocks in reading order and assigns each block its reading order index
        /// (through <c>SetReadingOrder</c>) as it is yielded.
        /// </summary>
        /// <param name="textBlocks">The text blocks to order.</param>
        /// <returns>The same blocks, lazily enumerated in the detected reading order.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="textBlocks"/> is null.</exception>
        public IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks)
        {
            // Validate outside of the iterator so that the failure surfaces at the call site
            // instead of at the first MoveNext().
            if (textBlocks == null)
            {
                throw new ArgumentNullException(nameof(textBlocks));
            }

            return OrderBlocks(textBlocks);
        }

        /// <summary>
        /// Iterator behind <see cref="Get"/>: repeatedly extracts the node that is 'before' the largest
        /// number of remaining nodes.
        /// </summary>
        private IEnumerable<TextBlock> OrderBlocks(IReadOnlyList<TextBlock> textBlocks)
        {
            var graph = BuildGraph(textBlocks, tolerance);
            int readingOrder = 0;

            while (graph.Count > 0)
            {
                // Single pass: pick the first node (in enumeration order) with the highest out-degree.
                // The strict '>' keeps the first maximum, which is what Max + Where + First selected.
                int index = -1;
                int maxCount = -1;
                foreach (var node in graph)
                {
                    if (node.Value.Count > maxCount)
                    {
                        maxCount = node.Value.Count;
                        index = node.Key;
                    }
                }

                graph.Remove(index);

                // The extracted node must no longer count as a successor of the remaining nodes.
                foreach (var successors in graph.Values)
                {
                    successors.Remove(index);
                }

                var block = textBlocks[index];
                block.SetReadingOrder(readingOrder++);
                yield return block;
            }
        }

        /// <summary>
        /// Builds the directed 'is before' graph: key = block index, value = indexes of the blocks that
        /// come after it according to the spatial relation or the rendering order.
        /// </summary>
        /// <param name="textBlocks">The blocks to relate to each other.</param>
        /// <param name="T">The tolerance parameter T.</param>
        private static Dictionary<int, HashSet<int>> BuildGraph(IReadOnlyList<TextBlock> textBlocks, double T)
        {
            // We incorporate both relations into a single partial ordering of blocks by specifying a
            // directed graph with an edge between every pair of blocks for which at least one of the
            // two relations hold.
            int count = textBlocks.Count;
            var graph = new Dictionary<int, HashSet<int>>(count);

            // The average text sequence only depends on the block itself: compute it once per block
            // rather than once per ordered pair.
            var averageTextSequences = new double[count];

            for (int i = 0; i < count; i++)
            {
                graph.Add(i, new HashSet<int>());
                averageTextSequences[i] = GetAverageTextSequence(textBlocks[i]);
            }

            for (int i = 0; i < count; i++)
            {
                var a = textBlocks[i];

                for (int j = 0; j < count; j++)
                {
                    if (i == j)
                    {
                        continue;
                    }

                    // Any comparison involving NaN (block without letters) is false, so such a block
                    // simply has no rendering-order relation with the others.
                    if (GetBeforeInReadingVertical(a, textBlocks[j], T)
                        || averageTextSequences[i] < averageTextSequences[j])
                    {
                        graph[i].Add(j);
                    }
                }
            }

            return graph;
        }

        /// <summary>
        /// Gets the average TextSequence of all the letters of the block, or <see cref="double.NaN"/>
        /// if the block contains no letter (a plain Average() would throw on an empty sequence).
        /// </summary>
        private static double GetAverageTextSequence(TextBlock block)
        {
            return block.TextLines
                .SelectMany(tl => tl.Words)
                .SelectMany(w => w.Letters)
                .Select(l => (double)l.TextSequence)
                .DefaultIfEmpty(double.NaN)
                .Average();
        }

        /// <summary>
        /// Whether the relation is one of Precedes, Meets or Overlaps.
        /// </summary>
        private static bool IsPrecedesMeetsOrOverlaps(IntervalRelations relation)
        {
            return relation == IntervalRelations.Precedes
                || relation == IntervalRelations.Meets
                || relation == IntervalRelations.Overlaps;
        }

        /// <summary>
        /// Whether the relation is neither an inverse precedes / inverse meets nor unknown, i.e. the
        /// two intervals share some extent or the first one comes first.
        /// </summary>
        private static bool IsNotAfter(IntervalRelations relation)
        {
            return relation != IntervalRelations.PrecedesI
                && relation != IntervalRelations.MeetsI
                && relation != IntervalRelations.Unknown;
        }

        /// <summary>
        /// Direction agnostic: a is before b if it precedes, meets or overlaps b on either axis.
        /// Not referenced by <see cref="Get"/>; kept as an alternative strategy.
        /// </summary>
        /// <param name="a">The first block.</param>
        /// <param name="b">The second block.</param>
        /// <param name="T">The tolerance parameter T.</param>
        private static bool GetBeforeInReading(TextBlock a, TextBlock b, double T)
        {
            return IsPrecedesMeetsOrOverlaps(GetIntervalRelationX(a, b, T))
                || IsPrecedesMeetsOrOverlaps(GetIntervalRelationY(a, b, T));
        }

        /// <summary>
        /// Column-wise: text-blocks are read in columns, from top-to-bottom and from left-to-right.
        /// </summary>
        /// <param name="a">The first block.</param>
        /// <param name="b">The second block.</param>
        /// <param name="T">The tolerance parameter T.</param>
        /// <returns>True if a comes before b.</returns>
        private static bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T)
        {
            IntervalRelations xRelation = GetIntervalRelationX(a, b, T);

            // a is entirely to the left of b (or touches it from the left).
            if (xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets)
            {
                return true;
            }

            // Otherwise a must start above b while not being to the right of b.
            // (The 'x overlaps and y precedes/meets/overlaps' case is covered by this test.)
            return IsNotAfter(xRelation) && IsPrecedesMeetsOrOverlaps(GetIntervalRelationY(a, b, T));
        }

        /// <summary>
        /// Row-wise: text-blocks are read in rows from left-to-right, top-to-bottom.
        /// Not referenced by <see cref="Get"/>; kept as an alternative strategy.
        /// </summary>
        /// <param name="a">The first block.</param>
        /// <param name="b">The second block.</param>
        /// <param name="T">The tolerance parameter T.</param>
        /// <returns>True if a comes before b.</returns>
        private static bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
        {
            IntervalRelations yRelation = GetIntervalRelationY(a, b, T);

            // a is entirely above b (or touches it from above).
            if (yRelation == IntervalRelations.Precedes || yRelation == IntervalRelations.Meets)
            {
                return true;
            }

            // Otherwise a must start to the left of b while not being below b.
            return IsNotAfter(yRelation) && IsPrecedesMeetsOrOverlaps(GetIntervalRelationX(a, b, T));
        }

        /// <summary>
        /// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate.
        /// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
        /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
        /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
        /// </summary>
        /// <param name="a">The first block.</param>
        /// <param name="b">The second block.</param>
        /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
        private static IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
        {
            return GetIntervalRelation(
                a.BoundingBox.Left, a.BoundingBox.Right,
                b.BoundingBox.Left, b.BoundingBox.Right,
                T);
        }

        /// <summary>
        /// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate.
        /// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
        /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
        /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
        /// </summary>
        /// <param name="a">The first block.</param>
        /// <param name="b">The second block.</param>
        /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
        private static IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
        {
            // Reading goes from top to bottom, so the axis is negated to make the top edge the
            // interval start.
            // NOTE(review): assumes Y grows upwards, i.e. BoundingBox.Top >= BoundingBox.Bottom - confirm
            // against PdfRectangle.
            return GetIntervalRelation(
                -a.BoundingBox.Top, -a.BoundingBox.Bottom,
                -b.BoundingBox.Top, -b.BoundingBox.Bottom,
                T);
        }

        /// <summary>
        /// Classifies the interval [aStart, aEnd] against [bStart, bEnd] using Allen's thirteen relations,
        /// where two coordinates closer than <paramref name="T"/> are considered equal.
        /// <para>Each inverse relation is the base relation of the swapped pair. The checks are ordered so
        /// that exactly one relation is returned even when the intervals are shorter than the tolerance.</para>
        /// </summary>
        /// <param name="aStart">Start of the first interval (expected to be lower than or equal to its end).</param>
        /// <param name="aEnd">End of the first interval.</param>
        /// <param name="bStart">Start of the second interval (expected to be lower than or equal to its end).</param>
        /// <param name="bEnd">End of the second interval.</param>
        /// <param name="T">The tolerance parameter T.</param>
        private static IntervalRelations GetIntervalRelation(double aStart, double aEnd, double bStart, double bEnd, double T)
        {
            if (aEnd < bStart - T)
            {
                return IntervalRelations.Precedes;
            }

            if (aStart > bEnd + T)
            {
                return IntervalRelations.PrecedesI;
            }

            // From here on aEnd >= bStart - T, so this is |aEnd - bStart| <= T.
            if (aEnd <= bStart + T)
            {
                return IntervalRelations.Meets;
            }

            // From here on aStart <= bEnd + T, so this is |aStart - bEnd| <= T.
            if (aStart >= bEnd - T)
            {
                return IntervalRelations.MeetsI;
            }

            // The intervals share some extent: the relation is fully determined by how the two
            // starts and the two ends compare.
            int starts = CompareWithTolerance(aStart, bStart, T);
            int ends = CompareWithTolerance(aEnd, bEnd, T);

            if (starts < 0)
            {
                if (ends < 0)
                {
                    return IntervalRelations.Overlaps;
                }

                return ends == 0 ? IntervalRelations.FinishesI : IntervalRelations.DuringI;
            }

            if (starts > 0)
            {
                if (ends < 0)
                {
                    return IntervalRelations.During;
                }

                return ends == 0 ? IntervalRelations.Finishes : IntervalRelations.OverlapsI;
            }

            if (ends < 0)
            {
                return IntervalRelations.Starts;
            }

            return ends == 0 ? IntervalRelations.Equals : IntervalRelations.StartsI;
        }

        /// <summary>
        /// Three-way comparison with tolerance: -1 if x is lower than y by more than T, 1 if x is greater
        /// than y by more than T, 0 otherwise.
        /// </summary>
        private static int CompareWithTolerance(double x, double y, double T)
        {
            if (x < y - T)
            {
                return -1;
            }

            return x > y + T ? 1 : 0;
        }

        /// <summary>
        /// Allen's interval thirteen relations.
        /// <para>See https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para>
        /// </summary>
        private enum IntervalRelations
        {
            /// <summary>
            /// Unknown interval relations.
            /// </summary>
            Unknown,

            /// <summary>
            /// X takes place before Y.
            /// <para>|____X____|......................</para>
            /// <para>......................|____Y____|</para>
            /// </summary>
            Precedes,

            /// <summary>
            /// X meets Y.
            /// <para>|____X____|.................</para>
            /// <para>.................|____Y____|</para>
            /// </summary>
            Meets,

            /// <summary>
            /// X overlaps with Y.
            /// <para>|______X______|.................</para>
            /// <para>.................|______Y______|</para>
            /// </summary>
            Overlaps,

            /// <summary>
            /// X starts Y.
            /// <para>|____X____|.................</para>
            /// <para>|_____Y_____|..............</para>
            /// </summary>
            Starts,

            /// <summary>
            /// X during Y.
            /// <para>........|____X____|.........</para>
            /// <para>.....|______Y______|.....</para>
            /// </summary>
            During,

            /// <summary>
            /// X finishes Y.
            /// <para>.................|____X____|</para>
            /// <para>..............|_____Y_____|</para>
            /// </summary>
            Finishes,

            /// <summary>
            /// Inverse precedes.
            /// </summary>
            PrecedesI,

            /// <summary>
            /// Inverse meets.
            /// </summary>
            MeetsI,

            /// <summary>
            /// Inverse overlaps.
            /// </summary>
            OverlapsI,

            /// <summary>
            /// Inverse Starts.
            /// </summary>
            StartsI,

            /// <summary>
            /// Inverse during.
            /// </summary>
            DuringI,

            /// <summary>
            /// Inverse finishes.
            /// </summary>
            FinishesI,

            /// <summary>
            /// X is equal to Y.
            /// <para>..........|____X____|............</para>
            /// <para>..........|____Y____|............</para>
            /// </summary>
            Equals
        }

        /// <summary>
        /// Orders graph nodes by their number of successors (ascending).
        /// Not referenced elsewhere in this file.
        /// </summary>
        private class NodeComparer : IComparer<KeyValuePair<int, HashSet<int>>>
        {
            public int Compare(KeyValuePair<int, HashSet<int>> x, KeyValuePair<int, HashSet<int>> y)
            {
                return x.Value.Count.CompareTo(y.Value.Count);
            }
        }
    }
}