diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs index 7aafb60f..4effd98a 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs @@ -1,13 +1,14 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { + using Content; + using Geometry; using System; using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Text.RegularExpressions; using System.Threading.Tasks; - using Content; - using Geometry; + using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; using Util; /// @@ -15,7 +16,7 @@ /// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc. /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the /// left or right edge of the page. - /// See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern. + /// See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern. /// public static class DecorationTextBlockClassifier { diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs index d87e9675..2ee54379 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs @@ -1,10 +1,9 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { + using Core; using System; using System.Collections.Generic; using System.Linq; - using Core; - using Geometry; /// /// Contains helpful tools for distance measures. diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs index 52fd2097..d7010045 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs @@ -1,15 +1,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export { + using Alto; + using Content; + using Core; + using DocumentLayoutAnalysis; using System; using System.Globalization; using System.Linq; using System.Xml; using System.Xml.Serialization; - using Alto; - using Content; - using Core; - using DocumentLayoutAnalysis; - using Geometry; + using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; using Util; /// diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs index 9000e21f..1b0a40f0 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs @@ -1,11 +1,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export { - using System; - using System.Linq; using Content; using Core; using DocumentLayoutAnalysis; - using Geometry; + using System; + using System.Linq; + using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; using Util; /// diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs index 4572517d..43abea69 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs @@ -1,16 +1,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export { + using Content; + using Core; + using DocumentLayoutAnalysis; + using Graphics.Colors; + using PAGE; using System; using System.Collections.Generic; using System.Linq; using System.Xml; using System.Xml.Serialization; - using Content; - using Core; - using DocumentLayoutAnalysis; - using Geometry; - using Graphics.Colors; - using PAGE; + using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; using Util; /// diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DefaultPageSegmenter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs similarity index 93% rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/DefaultPageSegmenter.cs rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs index 1e1391c9..bd8fe89a 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DefaultPageSegmenter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs @@ -1,9 +1,9 @@ -namespace UglyToad.PdfPig.DocumentLayoutAnalysis +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter { - using System.Collections.Generic; - using System.Linq; using Content; using Core; + using System.Collections.Generic; + using System.Linq; /// /// Default Page Segmenter. All words are included in one block. diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs similarity index 99% rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs index 67f86303..34df2ade 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs @@ -1,13 +1,13 @@ -namespace UglyToad.PdfPig.DocumentLayoutAnalysis +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter { + using Content; + using Core; + using Geometry; using System; using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; - using Geometry; - using Content; - using Core; /// /// diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/IPageSegmenter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenter.cs similarity index 91% rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/IPageSegmenter.cs rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenter.cs index d4c02223..953203f5 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/IPageSegmenter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenter.cs @@ -1,7 +1,7 @@ -namespace UglyToad.PdfPig.DocumentLayoutAnalysis +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter { - using System.Collections.Generic; using Content; + using System.Collections.Generic; /// /// Page segmentation divides a page into areas, each consisting of a layout structure (blocks, lines, etc.). diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs similarity index 99% rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/RecursiveXYCut.cs rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs index 63cdc553..9f320ad7 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs @@ -1,10 +1,10 @@ -namespace UglyToad.PdfPig.DocumentLayoutAnalysis +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter { + using Content; + using Core; using System; using System.Collections.Generic; using System.Linq; - using Content; - using Core; /// /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYLeaf.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYLeaf.cs similarity index 97% rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYLeaf.cs rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYLeaf.cs index cc8c0595..784e47ea 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYLeaf.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYLeaf.cs @@ -1,10 +1,10 @@ -namespace UglyToad.PdfPig.DocumentLayoutAnalysis +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter { + using Content; + using Core; using System; using System.Collections.Generic; using System.Linq; - using Content; - using Core; /// /// A Leaf node used in the algorithm, i.e. a block. diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYNode.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYNode.cs similarity index 98% rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYNode.cs rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYNode.cs index 680b6184..bf5422c2 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYNode.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYNode.cs @@ -1,8 +1,8 @@ -namespace UglyToad.PdfPig.DocumentLayoutAnalysis +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter { + using Core; using System.Collections.Generic; using System.Linq; - using Core; /// /// A Node used in the algorithm. diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/DefaultReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/DefaultReadingOrderDetector.cs new file mode 100644 index 00000000..5b3ba78c --- /dev/null +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/DefaultReadingOrderDetector.cs @@ -0,0 +1,25 @@ +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector +{ + using System.Collections.Generic; + + /// + /// This detector does nothing, no ordering takes place. + /// + public class DefaultReadingOrderDetector : IReadingOrderDetector + { + /// + /// Create an instance of default reading order detector, . + /// This detector does nothing, no ordering takes place. + /// + public static DefaultReadingOrderDetector Instance { get; } = new DefaultReadingOrderDetector(); + + /// + /// Gets the blocks in reading order and sets the . + /// + /// The s to order. + public IEnumerable Get(IReadOnlyList textBlocks) + { + return textBlocks; + } + } +} diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/IReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/IReadingOrderDetector.cs new file mode 100644 index 00000000..08f62d69 --- /dev/null +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/IReadingOrderDetector.cs @@ -0,0 +1,16 @@ +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector +{ + using System.Collections.Generic; + + /// + /// Reading order detector determines the page's blocks reading order. + /// + public interface IReadingOrderDetector + { + /// + /// Gets the blocks in reading order and sets the . + /// + /// The s to order. + IEnumerable Get(IReadOnlyList textBlocks); + } +} diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/RenderingReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/RenderingReadingOrderDetector.cs new file mode 100644 index 00000000..f5e34448 --- /dev/null +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/RenderingReadingOrderDetector.cs @@ -0,0 +1,37 @@ +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector +{ + using System.Collections.Generic; + using System.Linq; + + /// + /// Algorithm that retrieve the blocks' reading order using rendering order (TextSequence). + /// + public class RenderingReadingOrderDetector : IReadingOrderDetector + { + /// + /// Create an instance of rendering reading order detector, . + /// This detector uses the rendering order (TextSequence). + /// + public static RenderingReadingOrderDetector Instance { get; } = new RenderingReadingOrderDetector(); + + /// + /// Gets the blocks in reading order and sets the . + /// + /// The s to order. + public IEnumerable Get(IReadOnlyList textBlocks) + { + int readingOrder = 0; + + foreach (var block in textBlocks.OrderBy(b => AvgTextSequence(b))) + { + block.SetReadingOrder(readingOrder++); + yield return block; + } + } + + private double AvgTextSequence(TextBlock textBlock) + { + return textBlock.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average(); + } + } +} diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs new file mode 100644 index 00000000..25a2ba96 --- /dev/null +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs @@ -0,0 +1,464 @@ +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector +{ + using System.Collections.Generic; + using System.Linq; + + /// + /// Algorithm that retrieve the blocks' reading order using both (spatial) Allen’s interval relations and rendering order (TextSequence). + /// See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern and 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz. + /// + public class UnsupervisedReadingOrderDetector : IReadingOrderDetector + { + /// + /// Create an instance of unsupervised reading order detector, . + /// This detector uses the (spatial) Allen’s interval relations and rendering order (TextSequence). + /// + public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector(); + + private double T; + + /// + /// Algorithm that retrieve the blocks' reading order using both (spatial) Allen’s interval relations and rendering order. + /// + /// The tolerance parameter T. If two coordinates are closer than T they are considered equal. + /// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the + /// same column might not be exactly aligned. + public UnsupervisedReadingOrderDetector(double T = 5) + { + this.T = T; + } + + /// + /// Gets the blocks in reading order and sets the . + /// + /// The s to order. + public IEnumerable Get(IReadOnlyList textBlocks) + { + int readingOrder = 0; + + var graph = BuildGraph(textBlocks, T); + + while (graph.Any()) + { + var maxCount = graph.Max(kvp => kvp.Value.Count); + var current = graph.Where(kvp => kvp.Value.Count == maxCount).FirstOrDefault(); + graph.Remove(current.Key); + int index = current.Key; + + foreach (var g in graph) + { + g.Value.Remove(index); + } + + var block = textBlocks[index]; + block.SetReadingOrder(readingOrder++); + + yield return block; + } + } + + private Dictionary> BuildGraph(IReadOnlyList textBlocks, double T) + { + // We incorporate both relations into a single partial ordering of blocks by specifying a + // directed graph with an edge between every pair of blocks for which at least one of the + // two relations hold. + + var graph = new Dictionary>(); + + for (int i = 0; i < textBlocks.Count; i++) + { + graph.Add(i, new List()); + } + + for (int i = 0; i < textBlocks.Count; i++) + { + var a = textBlocks[i]; + for (int j = 0; j < textBlocks.Count; j++) + { + if (i == j) continue; + var b = textBlocks[j]; + + if (GetBeforeInReadingRendering(a, b, T)) + { + graph[i].Add(j); + } + } + } + + return graph; + } + + private bool GetBeforeInReadingRendering(TextBlock a, TextBlock b, double T) + { + return GetBeforeInReadingVertical(a, b, T) || GetBeforeInRendering(a, b); + } + + private bool GetBeforeInRendering(TextBlock a, TextBlock b) + { + var avgTextSequenceA = a.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average(); + var avgTextSequenceB = b.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average(); + return avgTextSequenceA < avgTextSequenceB; + } + + private bool GetBeforeInReading(TextBlock a, TextBlock b, double T) + { + IntervalRelations xRelation = GetIntervalRelationX(a, b, T); + IntervalRelations yRelation = GetIntervalRelationY(a, b, T); + + if (xRelation == IntervalRelations.Precedes || + yRelation == IntervalRelations.Precedes || + xRelation == IntervalRelations.Meets || + yRelation == IntervalRelations.Meets || + xRelation == IntervalRelations.Overlaps || + yRelation == IntervalRelations.Overlaps) + { + return true; + } + + return false; + } + + /// + /// Column-wise: text-blocks are read in columns, from top-to-bottom and from left-to-right. + /// + /// + /// + /// The tolerance parameter T. + /// + private bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T) + { + IntervalRelations xRelation = GetIntervalRelationX(a, b, T); + IntervalRelations yRelation = GetIntervalRelationY(a, b, T); + + if (xRelation == IntervalRelations.Precedes || + xRelation == IntervalRelations.Meets || + (xRelation == IntervalRelations.Overlaps && (yRelation == IntervalRelations.Precedes || + yRelation == IntervalRelations.Meets || + yRelation == IntervalRelations.Overlaps)) || + ((yRelation == IntervalRelations.Precedes || yRelation == IntervalRelations.Meets || yRelation == IntervalRelations.Overlaps) && + (xRelation == IntervalRelations.Precedes || + xRelation == IntervalRelations.Meets || + xRelation == IntervalRelations.Overlaps || + xRelation == IntervalRelations.Starts || + xRelation == IntervalRelations.FinishesI || + xRelation == IntervalRelations.Equals || + xRelation == IntervalRelations.During || + xRelation == IntervalRelations.DuringI || + xRelation == IntervalRelations.Finishes || + xRelation == IntervalRelations.StartsI || + xRelation == IntervalRelations.OverlapsI))) + { + return true; + } + + return false; + } + + /// + /// Row-wise: text-blocks are read in rows from left-to-right, top- to-bottom. + /// + /// + /// + /// The tolerance parameter T. + /// + private bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T) + { + IntervalRelations xRelation = GetIntervalRelationX(a, b, T); + IntervalRelations yRelation = GetIntervalRelationY(a, b, T); + + if (yRelation == IntervalRelations.Precedes || + yRelation == IntervalRelations.Meets || + (yRelation == IntervalRelations.Overlaps && (xRelation == IntervalRelations.Precedes || + xRelation == IntervalRelations.Meets || + xRelation == IntervalRelations.Overlaps)) || + ((xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets || xRelation == IntervalRelations.Overlaps) && + (yRelation == IntervalRelations.Precedes || + yRelation == IntervalRelations.Meets || + yRelation == IntervalRelations.Overlaps || + yRelation == IntervalRelations.Starts || + yRelation == IntervalRelations.FinishesI || + yRelation == IntervalRelations.Equals || + yRelation == IntervalRelations.During || + yRelation == IntervalRelations.DuringI || + yRelation == IntervalRelations.Finishes || + yRelation == IntervalRelations.StartsI || + yRelation == IntervalRelations.OverlapsI))) + { + return true; + } + + return false; + } + + /// + /// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate. + /// The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page. + /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed + /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete. + /// + /// + /// + /// The tolerance parameter T. If two coordinates are closer than T they are considered equal. + private IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T) + { + IntervalRelations xRelation = IntervalRelations.Unknown; + + if (a.BoundingBox.Right < b.BoundingBox.Left - T) + { + xRelation = IntervalRelations.Precedes; + } + else if (a.BoundingBox.Right >= b.BoundingBox.Left - T) + { + xRelation = IntervalRelations.PrecedesI; + } + + else if (b.BoundingBox.Left - T <= a.BoundingBox.Right + && a.BoundingBox.Right <= b.BoundingBox.Left + T) + { + xRelation = IntervalRelations.Meets; + } + else if (b.BoundingBox.Left - T > a.BoundingBox.Right + && a.BoundingBox.Right > b.BoundingBox.Left + T) + { + xRelation = IntervalRelations.MeetsI; + } + + else if (a.BoundingBox.Left < b.BoundingBox.Left - T + && (b.BoundingBox.Left + T < a.BoundingBox.Right && a.BoundingBox.Right < b.BoundingBox.Right - T)) + { + xRelation = IntervalRelations.Overlaps; + } + else if (a.BoundingBox.Left >= b.BoundingBox.Left - T + && (b.BoundingBox.Left + T >= a.BoundingBox.Right && a.BoundingBox.Right >= b.BoundingBox.Right - T)) + { + xRelation = IntervalRelations.OverlapsI; + } + + else if ((b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T) + && a.BoundingBox.Right < b.BoundingBox.Right - T) + { + xRelation = IntervalRelations.Starts; + } + else if ((b.BoundingBox.Left - T > a.BoundingBox.Left && a.BoundingBox.Left > b.BoundingBox.Left + T) + && a.BoundingBox.Right >= b.BoundingBox.Right - T) + { + xRelation = IntervalRelations.StartsI; + } + + else if (a.BoundingBox.Left > b.BoundingBox.Left + T + && a.BoundingBox.Right < b.BoundingBox.Right - T) + { + xRelation = IntervalRelations.During; + } + else if (a.BoundingBox.Left <= b.BoundingBox.Left + T + && a.BoundingBox.Right >= b.BoundingBox.Right - T) + { + xRelation = IntervalRelations.DuringI; + } + + else if (a.BoundingBox.Left > b.BoundingBox.Left + T + && (b.BoundingBox.Right - T <= a.BoundingBox.Right && a.BoundingBox.Right <= b.BoundingBox.Right + T)) + { + xRelation = IntervalRelations.Finishes; + } + else if (a.BoundingBox.Left <= b.BoundingBox.Left + T + && (b.BoundingBox.Right - T > a.BoundingBox.Right && a.BoundingBox.Right > b.BoundingBox.Right + T)) + { + xRelation = IntervalRelations.FinishesI; + } + + else if ((b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T) + && (b.BoundingBox.Right - T <= a.BoundingBox.Right && a.BoundingBox.Right <= b.BoundingBox.Right + T)) + { + xRelation = IntervalRelations.Equals; + } + + return xRelation; + } + + /// + /// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate. + /// The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page. + /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed + /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete. + /// + /// + /// + /// The tolerance parameter T. If two coordinates are closer than T they are considered equal. + private IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T) + { + IntervalRelations yRelation = IntervalRelations.Unknown; + + if (a.BoundingBox.Bottom < b.BoundingBox.Top - T) + { + yRelation = IntervalRelations.PrecedesI; + } + else if (a.BoundingBox.Bottom >= b.BoundingBox.Top - T) + { + yRelation = IntervalRelations.Precedes; + } + + else if (b.BoundingBox.Top - T <= a.BoundingBox.Bottom + && a.BoundingBox.Bottom <= b.BoundingBox.Top + T) + { + yRelation = IntervalRelations.MeetsI; + } + else if (b.BoundingBox.Top - T > a.BoundingBox.Bottom + && a.BoundingBox.Bottom > b.BoundingBox.Top + T) + { + yRelation = IntervalRelations.Meets; + } + + else if (a.BoundingBox.Top < b.BoundingBox.Top - T + && (b.BoundingBox.Top + T < a.BoundingBox.Bottom && a.BoundingBox.Bottom < b.BoundingBox.Bottom - T)) + { + yRelation = IntervalRelations.OverlapsI; + } + else if (a.BoundingBox.Top >= b.BoundingBox.Top - T + && (b.BoundingBox.Top + T >= a.BoundingBox.Bottom && a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T)) + { + yRelation = IntervalRelations.Overlaps; + } + + else if ((b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T) + && a.BoundingBox.Bottom < b.BoundingBox.Bottom - T) + { + yRelation = IntervalRelations.StartsI; + } + else if ((b.BoundingBox.Top - T > a.BoundingBox.Top && a.BoundingBox.Top > b.BoundingBox.Top + T) + && a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T) + { + yRelation = IntervalRelations.Starts; + } + + else if (a.BoundingBox.Top > b.BoundingBox.Top + T + && a.BoundingBox.Bottom < b.BoundingBox.Bottom - T) + { + yRelation = IntervalRelations.DuringI; + } + else if (a.BoundingBox.Top <= b.BoundingBox.Top + T + && a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T) + { + yRelation = IntervalRelations.During; + } + + else if (a.BoundingBox.Top > b.BoundingBox.Top + T + && (b.BoundingBox.Bottom - T <= a.BoundingBox.Bottom && a.BoundingBox.Bottom <= b.BoundingBox.Bottom + T)) + { + yRelation = IntervalRelations.FinishesI; + } + else if (a.BoundingBox.Top <= b.BoundingBox.Top + T + && (b.BoundingBox.Bottom - T > a.BoundingBox.Bottom && a.BoundingBox.Bottom > b.BoundingBox.Bottom + T)) + { + yRelation = IntervalRelations.Finishes; + } + + else if ((b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T) + && (b.BoundingBox.Bottom - T <= a.BoundingBox.Bottom && a.BoundingBox.Bottom <= b.BoundingBox.Bottom + T)) + { + yRelation = IntervalRelations.Equals; + } + + return yRelation; + } + + /// + /// Allen’s interval thirteen relations. + /// See https://en.wikipedia.org/wiki/Allen%27s_interval_algebra + /// + private enum IntervalRelations + { + /// + /// Unknown interval relations. + /// + Unknown, + + /// + /// X takes place before Y. + /// |____X____|...................... + /// ......................|____Y____| + /// + Precedes, + + /// + /// X meets Y. + /// |____X____|................. + /// .................|____Y____| + /// + Meets, + + /// + /// X overlaps with Y. + /// |______X______|................. + /// .................|______Y______| + /// + Overlaps, + + /// + /// X starts Y. + /// |____X____|................. + /// |_____Y_____|.............. + /// + Starts, + + /// + /// X during Y. + /// ........|____X____|......... + /// .....|______Y______|..... + /// + During, + + /// + /// X finishes Y. + /// .................|____X____| + /// ..............|_____Y_____| + /// + Finishes, + + /// + /// Inverse precedes. + /// + PrecedesI, + + /// + /// Inverse meets. + /// + MeetsI, + + /// + /// Inverse overlaps. + /// + OverlapsI, + + /// + /// Inverse Starts. + /// + StartsI, + + /// + /// Inverse during. + /// + DuringI, + + /// + /// Inverse finishes. + /// + FinishesI, + + /// + /// X is equal to Y. + /// ..........|____X____|............ + /// ..........|____Y____|............ + /// + Equals + } + + private class NodeComparer : IComparer>> + { + public int Compare(KeyValuePair> x, KeyValuePair> y) + { + return x.Value.Count.CompareTo(y.Value.Count); + } + } + } +} diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs index 04808b87..ffba3a21 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs @@ -1,10 +1,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { + using Content; + using Core; using System; using System.Collections.Generic; using System.Linq; - using Content; - using Core; /// /// A block of text. @@ -31,6 +31,11 @@ /// public IReadOnlyList TextLines { get; } + /// + /// The reading order index. Starts at 0. A value of -1 means the block is not ordered. + /// + public int ReadingOrder { get; private set; } + /// /// Create a new . /// @@ -47,6 +52,8 @@ throw new ArgumentException("Empty lines provided.", nameof(lines)); } + ReadingOrder = -1; + TextLines = lines; Text = string.Join(" ", lines.Select(x => x.Text)); @@ -60,6 +67,15 @@ TextDirection = lines[0].TextDirection; } + internal void SetReadingOrder(int readingOrder) + { + if (readingOrder < -1) + { + throw new ArgumentException("The reading order should be more or equal to -1. A value of -1 means the block is not ordered.", nameof(readingOrder)); + } + this.ReadingOrder = readingOrder; + } + /// public override string ToString() { diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs index 7437700c..c0979f26 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -1,13 +1,12 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { + using Content; + using Core; using System; using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; - using Content; - using Core; - using Geometry; /// /// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line. diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs index 7a4b4ada..6259fd76 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs @@ -1,10 +1,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { + using Content; + using Core; using System; using System.Collections.Generic; using System.Linq; - using Content; - using Core; /// /// A line of text. diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs index 62068f33..b08164a0 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs @@ -1,11 +1,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { - using System; - using System.Collections.Generic; - using System.Linq; using Content; using Core; using Geometry; + using System; + using System.Collections.Generic; + using System.Linq; /// /// A top-down algorithm that finds a cover of the background whitespace of a document in terms of maximal empty rectangles. diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs similarity index 98% rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/NearestNeighbourWordExtractor.cs rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs index ad250064..6ba4edea 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/NearestNeighbourWordExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs @@ -1,10 +1,10 @@ -namespace UglyToad.PdfPig.DocumentLayoutAnalysis +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor { + using Content; + using Core; using System; using System.Collections.Generic; using System.Linq; - using Content; - using Core; using Util; ///