diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
index 7aafb60f..4effd98a 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
@@ -1,13 +1,14 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
+ using Content;
+ using Geometry;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
- using Content;
- using Geometry;
+ using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using Util;
///
@@ -15,7 +16,7 @@
/// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.
- /// See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.
+ /// See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.
///
public static class DecorationTextBlockClassifier
{
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs
index d87e9675..2ee54379 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs
@@ -1,10 +1,9 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
+ using Core;
using System;
using System.Collections.Generic;
using System.Linq;
- using Core;
- using Geometry;
///
/// Contains helpful tools for distance measures.
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs
index 52fd2097..d7010045 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs
@@ -1,15 +1,15 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export
{
+ using Alto;
+ using Content;
+ using Core;
+ using DocumentLayoutAnalysis;
using System;
using System.Globalization;
using System.Linq;
using System.Xml;
using System.Xml.Serialization;
- using Alto;
- using Content;
- using Core;
- using DocumentLayoutAnalysis;
- using Geometry;
+ using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using Util;
///
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs
index 9000e21f..1b0a40f0 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs
@@ -1,11 +1,11 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export
{
- using System;
- using System.Linq;
using Content;
using Core;
using DocumentLayoutAnalysis;
- using Geometry;
+ using System;
+ using System.Linq;
+ using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using Util;
///
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs
index 4572517d..43abea69 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs
@@ -1,16 +1,16 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export
{
+ using Content;
+ using Core;
+ using DocumentLayoutAnalysis;
+ using Graphics.Colors;
+ using PAGE;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Xml;
using System.Xml.Serialization;
- using Content;
- using Core;
- using DocumentLayoutAnalysis;
- using Geometry;
- using Graphics.Colors;
- using PAGE;
+ using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using Util;
///
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DefaultPageSegmenter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs
similarity index 93%
rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/DefaultPageSegmenter.cs
rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs
index 1e1391c9..bd8fe89a 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DefaultPageSegmenter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs
@@ -1,9 +1,9 @@
-namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
{
- using System.Collections.Generic;
- using System.Linq;
using Content;
using Core;
+ using System.Collections.Generic;
+ using System.Linq;
///
/// Default Page Segmenter. All words are included in one block.
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
similarity index 99%
rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs
rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
index 67f86303..34df2ade 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DocstrumBoundingBoxes.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
@@ -1,13 +1,13 @@
-namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
{
+ using Content;
+ using Core;
+ using Geometry;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
- using Geometry;
- using Content;
- using Core;
///
///
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/IPageSegmenter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenter.cs
similarity index 91%
rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/IPageSegmenter.cs
rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenter.cs
index d4c02223..953203f5 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/IPageSegmenter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenter.cs
@@ -1,7 +1,7 @@
-namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
{
- using System.Collections.Generic;
using Content;
+ using System.Collections.Generic;
///
/// Page segmentation divides a page into areas, each consisting of a layout structure (blocks, lines, etc.).
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs
similarity index 99%
rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/RecursiveXYCut.cs
rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs
index 63cdc553..9f320ad7 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/RecursiveXYCut.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs
@@ -1,10 +1,10 @@
-namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
{
+ using Content;
+ using Core;
using System;
using System.Collections.Generic;
using System.Linq;
- using Content;
- using Core;
///
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYLeaf.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYLeaf.cs
similarity index 97%
rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYLeaf.cs
rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYLeaf.cs
index cc8c0595..784e47ea 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYLeaf.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYLeaf.cs
@@ -1,10 +1,10 @@
-namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
{
+ using Content;
+ using Core;
using System;
using System.Collections.Generic;
using System.Linq;
- using Content;
- using Core;
///
/// A Leaf node used in the algorithm, i.e. a block.
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYNode.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYNode.cs
similarity index 98%
rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYNode.cs
rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYNode.cs
index 680b6184..bf5422c2 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/XYNode.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYNode.cs
@@ -1,8 +1,8 @@
-namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
{
+ using Core;
using System.Collections.Generic;
using System.Linq;
- using Core;
///
/// A Node used in the algorithm.
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/DefaultReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/DefaultReadingOrderDetector.cs
new file mode 100644
index 00000000..5b3ba78c
--- /dev/null
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/DefaultReadingOrderDetector.cs
@@ -0,0 +1,25 @@
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
+{
+ using System.Collections.Generic;
+
+    /// <summary>
+    /// Pass-through reading order detector: the blocks are returned exactly as supplied.
+    /// </summary>
+    public class DefaultReadingOrderDetector : IReadingOrderDetector
+    {
+        /// <summary>
+        /// Shared instance of <see cref="DefaultReadingOrderDetector"/>.
+        /// This detector does nothing, no ordering takes place.
+        /// </summary>
+        public static DefaultReadingOrderDetector Instance { get; } = new DefaultReadingOrderDetector();
+
+        /// <summary>
+        /// Returns <paramref name="textBlocks"/> unchanged; <see cref="TextBlock.ReadingOrder"/> is not modified.
+        /// </summary>
+        /// <param name="textBlocks">The <see cref="TextBlock"/>s to return.</param>
+        public IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks)
+        {
+            return textBlocks;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/IReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/IReadingOrderDetector.cs
new file mode 100644
index 00000000..08f62d69
--- /dev/null
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/IReadingOrderDetector.cs
@@ -0,0 +1,16 @@
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
+{
+ using System.Collections.Generic;
+
+    /// <summary>
+    /// A reading order detector determines the order in which the blocks of a page should be read.
+    /// </summary>
+    public interface IReadingOrderDetector
+    {
+        /// <summary>
+        /// Gets the blocks in reading order; implementations may also set <see cref="TextBlock.ReadingOrder"/>.
+        /// </summary>
+        /// <param name="textBlocks">The <see cref="TextBlock"/>s to order.</param>
+        IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks);
+    }
+}
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/RenderingReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/RenderingReadingOrderDetector.cs
new file mode 100644
index 00000000..f5e34448
--- /dev/null
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/RenderingReadingOrderDetector.cs
@@ -0,0 +1,37 @@
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
+{
+ using System.Collections.Generic;
+ using System.Linq;
+
+    /// <summary>
+    /// Algorithm that retrieves the blocks' reading order using the rendering order (TextSequence).
+    /// </summary>
+    public class RenderingReadingOrderDetector : IReadingOrderDetector
+    {
+        /// <summary>
+        /// Shared instance of <see cref="RenderingReadingOrderDetector"/>.
+        /// This detector uses the rendering order (TextSequence).
+        /// </summary>
+        public static RenderingReadingOrderDetector Instance { get; } = new RenderingReadingOrderDetector();
+
+        /// <summary>
+        /// Lazily yields the blocks by ascending average TextSequence, setting <see cref="TextBlock.ReadingOrder"/> as each one is yielded.
+        /// </summary>
+        /// <param name="textBlocks">The <see cref="TextBlock"/>s to order.</param>
+        public IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks)
+        {
+            int readingOrder = 0;
+            foreach (var block in textBlocks.OrderBy(b => AvgTextSequence(b)))
+            {
+                block.SetReadingOrder(readingOrder++);
+                yield return block;
+            }
+        }
+
+        // Mean TextSequence over every letter of every word of every line of the block.
+        private double AvgTextSequence(TextBlock textBlock)
+        {
+            return textBlock.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average();
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs
new file mode 100644
index 00000000..25a2ba96
--- /dev/null
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs
@@ -0,0 +1,464 @@
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
+{
+ using System.Collections.Generic;
+ using System.Linq;
+
+    /// <summary>
+    /// Algorithm that retrieves the blocks' reading order using both (spatial) Allen’s interval relations and rendering order (TextSequence).
+    /// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern and 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz.</para>
+    /// </summary>
+ public class UnsupervisedReadingOrderDetector : IReadingOrderDetector
+ {
+        /// <summary>
+        /// Shared instance of <see cref="UnsupervisedReadingOrderDetector"/>, created with the default tolerance.
+        /// This detector uses the (spatial) Allen’s interval relations and rendering order (TextSequence).
+        /// </summary>
+        public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector();
+
+        private readonly double T; // tolerance; assigned once, in the constructor
+
+ /// <summary>
+ /// Algorithm that retrieves the blocks' reading order using both (spatial) Allen’s interval relations and rendering order.
+ /// </summary>
+ /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.
+ /// This flexibility is necessary because, due to the inherent noise in the PDF extraction, text blocks in the
+ /// same column might not be exactly aligned.</param>
+ public UnsupervisedReadingOrderDetector(double T = 5)
+ {
+ this.T = T;
+ }
+
+        /// <summary>
+        /// Lazily yields the blocks in reading order and sets <see cref="TextBlock.ReadingOrder"/> on each one as it is yielded.
+        /// </summary>
+        /// <param name="textBlocks">The <see cref="TextBlock"/>s to order.</param>
+        public IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks)
+        {
+            int readingOrder = 0;
+            // graph[i] lists the indices of the blocks that block i is considered to come before.
+            var graph = BuildGraph(textBlocks, T);
+
+            while (graph.Count > 0)
+            {
+                // The next block is the one that still comes before the most remaining blocks.
+                var maxCount = graph.Max(kvp => kvp.Value.Count);
+                int index = graph.First(kvp => kvp.Value.Count == maxCount).Key;
+                graph.Remove(index);
+
+                // Drop the emitted block from the successor lists of the remaining nodes.
+                foreach (var successors in graph.Values)
+                {
+                    successors.Remove(index);
+                }
+
+                var block = textBlocks[index];
+                block.SetReadingOrder(readingOrder++);
+                yield return block;
+            }
+        }
+
+        /// <summary>
+        /// Builds a directed graph with an edge from i to j for every pair of blocks where at least one of the two
+        /// relations (spatial reading order, rendering order) says block i comes before block j. Keys are block indices.
+        /// </summary>
+        private Dictionary<int, List<int>> BuildGraph(IReadOnlyList<TextBlock> textBlocks, double T)
+        {
+            var graph = new Dictionary<int, List<int>>(textBlocks.Count);
+
+            for (int i = 0; i < textBlocks.Count; i++)
+            {
+                var a = textBlocks[i];
+                var successors = new List<int>();
+                for (int j = 0; j < textBlocks.Count; j++)
+                {
+                    if (i == j)
+                    {
+                        continue;
+                    }
+
+                    if (GetBeforeInReadingRendering(a, textBlocks[j], T))
+                    {
+                        successors.Add(j);
+                    }
+                }
+
+                graph.Add(i, successors);
+            }
+
+            return graph;
+        }
+
+        // True when a comes before b according to the column-wise spatial test or,
+        // failing that, according to the rendering order.
+        private bool GetBeforeInReadingRendering(TextBlock a, TextBlock b, double T)
+            => GetBeforeInReadingVertical(a, b, T) || GetBeforeInRendering(a, b);
+
+        // a is before b in rendering order when the mean TextSequence of its letters is the smaller one.
+        private bool GetBeforeInRendering(TextBlock a, TextBlock b)
+        {
+            return a.TextLines.SelectMany(line => line.Words).SelectMany(word => word.Letters).Average(letter => letter.TextSequence)
+                 < b.TextLines.SelectMany(line => line.Words).SelectMany(word => word.Letters).Average(letter => letter.TextSequence);
+        }
+
+        // True when a precedes, meets or overlaps b on at least one of the two axes.
+        private bool GetBeforeInReading(TextBlock a, TextBlock b, double T)
+        {
+            var relations = new[] { GetIntervalRelationX(a, b, T), GetIntervalRelationY(a, b, T) };
+            foreach (IntervalRelations relation in relations)
+            {
+                switch (relation)
+                {
+                    case IntervalRelations.Precedes:
+                    case IntervalRelations.Meets:
+                    case IntervalRelations.Overlaps:
+                        return true;
+                }
+            }
+
+            return false;
+        }
+
+        /// <summary>
+        /// Column-wise: text-blocks are read in columns, from top-to-bottom and from left-to-right.
+        /// </summary>
+        /// <param name="a">The first text block.</param>
+        /// <param name="b">The second text block.</param>
+        /// <param name="T">The tolerance parameter T.</param>
+        /// <returns>True if <paramref name="a"/> is considered to be read before <paramref name="b"/>.</returns>
+        private bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T)
+        {
+            IntervalRelations horizontal = GetIntervalRelationX(a, b, T);
+            IntervalRelations vertical = GetIntervalRelationY(a, b, T);
+
+            // Precedes or Meets on the x axis settles it: a is read first.
+            if (horizontal == IntervalRelations.Precedes || horizontal == IntervalRelations.Meets)
+            {
+                return true;
+            }
+
+            // Otherwise a has to precede, meet or overlap b on the y axis ...
+            bool verticallyBefore = vertical == IntervalRelations.Precedes
+                                    || vertical == IntervalRelations.Meets
+                                    || vertical == IntervalRelations.Overlaps;
+
+            // ... while its x relation is anything except PrecedesI, MeetsI or Unknown,
+            // i.e. one of the eleven relations in which a is not entirely after b.
+            switch (horizontal)
+            {
+                case IntervalRelations.PrecedesI:
+                case IntervalRelations.MeetsI:
+                case IntervalRelations.Unknown:
+                    return false;
+                default:
+                    return verticallyBefore;
+            }
+        }
+
+        /// <summary>
+        /// Row-wise: text-blocks are read in rows from left-to-right, top-to-bottom.
+        /// </summary>
+        /// <param name="a">The first text block.</param>
+        /// <param name="b">The second text block.</param>
+        /// <param name="T">The tolerance parameter T.</param>
+        /// <returns>True if <paramref name="a"/> is considered to be read before <paramref name="b"/>.</returns>
+        private bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
+        {
+            IntervalRelations horizontal = GetIntervalRelationX(a, b, T);
+            IntervalRelations vertical = GetIntervalRelationY(a, b, T);
+
+            // Precedes or Meets on the y axis settles it: a is read first.
+            if (vertical == IntervalRelations.Precedes || vertical == IntervalRelations.Meets)
+            {
+                return true;
+            }
+
+            // Otherwise a has to precede, meet or overlap b on the x axis ...
+            bool horizontallyBefore = horizontal == IntervalRelations.Precedes
+                                      || horizontal == IntervalRelations.Meets
+                                      || horizontal == IntervalRelations.Overlaps;
+
+            // ... while its y relation is anything except PrecedesI, MeetsI or Unknown,
+            // i.e. one of the eleven relations in which a is not entirely after b.
+            switch (vertical)
+            {
+                case IntervalRelations.PrecedesI:
+                case IntervalRelations.MeetsI:
+                case IntervalRelations.Unknown:
+                    return false;
+                default:
+                    return horizontallyBefore;
+            }
+        }
+
+        /// <summary>
+        /// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate.
+        /// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
+        /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
+        /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
+        /// </summary>
+        /// <param name="a">The first text block; its [Left, Right] interval is compared with that of <paramref name="b"/>.</param>
+        /// <param name="b">The second text block.</param>
+        /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
+        /// <returns>The relation of a to b, or <see cref="IntervalRelations.Unknown"/> when none of the tests matches.</returns>
+        private IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
+        {
+            // Interval [a1, a2] of block a and [b1, b2] of block b along the x axis.
+            double a1 = a.BoundingBox.Left;
+            double a2 = a.BoundingBox.Right;
+            double b1 = b.BoundingBox.Left;
+            double b2 = b.BoundingBox.Right;
+
+            // Thick boundaries: two coordinates that differ by at most T are treated as equal.
+            bool sameStart = b1 - T <= a1 && a1 <= b1 + T;
+            bool sameEnd = b2 - T <= a2 && a2 <= b2 + T;
+
+            // Every inverse relation is the direct test with the roles of a and b exchanged.
+            // The tests run in a fixed order; for very narrow blocks several may hold and the first match wins.
+            if (a2 < b1 - T)
+            {
+                return IntervalRelations.Precedes;
+            }
+            if (b2 < a1 - T)
+            {
+                return IntervalRelations.PrecedesI;
+            }
+
+            if (b1 - T <= a2 && a2 <= b1 + T)
+            {
+                return IntervalRelations.Meets;
+            }
+            if (a1 - T <= b2 && b2 <= a1 + T)
+            {
+                return IntervalRelations.MeetsI;
+            }
+
+            if (a1 < b1 - T && b1 + T < a2 && a2 < b2 - T)
+            {
+                return IntervalRelations.Overlaps;
+            }
+            if (b1 < a1 - T && a1 + T < b2 && b2 < a2 - T)
+            {
+                return IntervalRelations.OverlapsI;
+            }
+
+            if (sameStart && a2 < b2 - T)
+            {
+                return IntervalRelations.Starts;
+            }
+            if (sameStart && b2 < a2 - T)
+            {
+                return IntervalRelations.StartsI;
+            }
+
+            if (a1 > b1 + T && a2 < b2 - T)
+            {
+                return IntervalRelations.During;
+            }
+            if (b1 > a1 + T && b2 < a2 - T)
+            {
+                return IntervalRelations.DuringI;
+            }
+
+            if (a1 > b1 + T && sameEnd)
+            {
+                return IntervalRelations.Finishes;
+            }
+            if (b1 > a1 + T && sameEnd)
+            {
+                return IntervalRelations.FinishesI;
+            }
+
+            if (sameStart && sameEnd)
+            {
+                return IntervalRelations.Equals;
+            }
+
+            return IntervalRelations.Unknown;
+        }
+
+        /// <summary>
+        /// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate.
+        /// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
+        /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
+        /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
+        /// </summary>
+        /// <param name="a">The first text block; its [Top, Bottom] interval is compared with that of <paramref name="b"/>.</param>
+        /// <param name="b">The second text block.</param>
+        /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
+        /// <returns>The relation of a to b, or <see cref="IntervalRelations.Unknown"/> when none of the tests matches.</returns>
+        private IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
+        {
+            // y is negated so that smaller means read first. NOTE(review): assumes Top > Bottom (y grows upwards) - confirm.
+            double a1 = -a.BoundingBox.Top;
+            double a2 = -a.BoundingBox.Bottom;
+            double b1 = -b.BoundingBox.Top;
+            double b2 = -b.BoundingBox.Bottom;
+
+            // Thick boundaries: two coordinates that differ by at most T are treated as equal.
+            bool sameStart = b1 - T <= a1 && a1 <= b1 + T;
+            bool sameEnd = b2 - T <= a2 && a2 <= b2 + T;
+
+            // Every inverse relation is the direct test with the roles of a and b exchanged.
+            // The tests run in a fixed order; for very short blocks several may hold and the first match wins.
+            if (a2 < b1 - T)
+            {
+                return IntervalRelations.Precedes;
+            }
+            if (b2 < a1 - T)
+            {
+                return IntervalRelations.PrecedesI;
+            }
+
+            if (b1 - T <= a2 && a2 <= b1 + T)
+            {
+                return IntervalRelations.Meets;
+            }
+            if (a1 - T <= b2 && b2 <= a1 + T)
+            {
+                return IntervalRelations.MeetsI;
+            }
+
+            if (a1 < b1 - T && b1 + T < a2 && a2 < b2 - T)
+            {
+                return IntervalRelations.Overlaps;
+            }
+            if (b1 < a1 - T && a1 + T < b2 && b2 < a2 - T)
+            {
+                return IntervalRelations.OverlapsI;
+            }
+
+            if (sameStart && a2 < b2 - T)
+            {
+                return IntervalRelations.Starts;
+            }
+            if (sameStart && b2 < a2 - T)
+            {
+                return IntervalRelations.StartsI;
+            }
+
+            if (a1 > b1 + T && a2 < b2 - T)
+            {
+                return IntervalRelations.During;
+            }
+            if (b1 > a1 + T && b2 < a2 - T)
+            {
+                return IntervalRelations.DuringI;
+            }
+
+            if (a1 > b1 + T && sameEnd)
+            {
+                return IntervalRelations.Finishes;
+            }
+            if (b1 > a1 + T && sameEnd)
+            {
+                return IntervalRelations.FinishesI;
+            }
+
+            if (sameStart && sameEnd)
+            {
+                return IntervalRelations.Equals;
+            }
+
+            return IntervalRelations.Unknown;
+        }
+
+ /// <summary>
+ /// Allen’s thirteen interval relations.
+ /// <para>See https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para>
+ /// </summary>
+ private enum IntervalRelations
+ {
+ /// <summary>
+ /// Unknown interval relations.
+ /// </summary>
+ Unknown,
+
+ /// <summary>
+ /// X takes place before Y.
+ /// <para>|____X____|......................</para>
+ /// <para>......................|____Y____|</para>
+ /// </summary>
+ Precedes,
+
+ /// <summary>
+ /// X meets Y.
+ /// <para>|____X____|.................</para>
+ /// <para>.................|____Y____|</para>
+ /// </summary>
+ Meets,
+
+ /// <summary>
+ /// X overlaps with Y.
+ /// <para>|______X______|.................</para>
+ /// <para>.................|______Y______|</para>
+ /// </summary>
+ Overlaps,
+
+ /// <summary>
+ /// X starts Y.
+ /// <para>|____X____|.................</para>
+ /// <para>|_____Y_____|..............</para>
+ /// </summary>
+ Starts,
+
+ /// <summary>
+ /// X during Y.
+ /// <para>........|____X____|.........</para>
+ /// <para>.....|______Y______|.....</para>
+ /// </summary>
+ During,
+
+ /// <summary>
+ /// X finishes Y.
+ /// <para>.................|____X____|</para>
+ /// <para>..............|_____Y_____|</para>
+ /// </summary>
+ Finishes,
+
+ /// <summary>
+ /// Inverse precedes.
+ /// </summary>
+ PrecedesI,
+
+ /// <summary>
+ /// Inverse meets.
+ /// </summary>
+ MeetsI,
+
+ /// <summary>
+ /// Inverse overlaps.
+ /// </summary>
+ OverlapsI,
+
+ /// <summary>
+ /// Inverse starts.
+ /// </summary>
+ StartsI,
+
+ /// <summary>
+ /// Inverse during.
+ /// </summary>
+ DuringI,
+
+ /// <summary>
+ /// Inverse finishes.
+ /// </summary>
+ FinishesI,
+
+ /// <summary>
+ /// X is equal to Y.
+ /// <para>..........|____X____|............</para>
+ /// <para>..........|____Y____|............</para>
+ /// </summary>
+ Equals
+ }
+
+        // Compares graph nodes (block index plus successor list) by their number of successors.
+        // Not referenced by the code visible in this file.
+        private class NodeComparer : IComparer<KeyValuePair<int, List<int>>>
+        {
+            public int Compare(KeyValuePair<int, List<int>> x, KeyValuePair<int, List<int>> y)
+                => x.Value.Count.CompareTo(y.Value.Count);
+        }
+ }
+}
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs
index 04808b87..ffba3a21 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextBlock.cs
@@ -1,10 +1,10 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
+ using Content;
+ using Core;
using System;
using System.Collections.Generic;
using System.Linq;
- using Content;
- using Core;
///
/// A block of text.
@@ -31,6 +31,11 @@
///
public IReadOnlyList TextLines { get; }
+ /// <summary>
+ /// The reading order index. Starts at 0. A value of -1 means the block is not ordered.
+ /// </summary>
+ public int ReadingOrder { get; private set; }
+
///
/// Create a new .
///
@@ -47,6 +52,8 @@
throw new ArgumentException("Empty lines provided.", nameof(lines));
}
+ ReadingOrder = -1;
+
TextLines = lines;
Text = string.Join(" ", lines.Select(x => x.Text));
@@ -60,6 +67,15 @@
TextDirection = lines[0].TextDirection;
}
+        /// <summary>
+        /// Sets <see cref="ReadingOrder"/>.
+        /// </summary>
+        /// <param name="readingOrder">The reading order index, starting at 0. A value of -1 means the block is not ordered.</param>
+        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="readingOrder"/> is less than -1.</exception>
+        internal void SetReadingOrder(int readingOrder)
+        {
+            if (readingOrder < -1)
+            {
+                // ArgumentOutOfRangeException derives from ArgumentException, so existing catch blocks keep working.
+                throw new ArgumentOutOfRangeException(nameof(readingOrder), readingOrder, "The reading order should be more or equal to -1. A value of -1 means the block is not ordered.");
+            }
+
+            ReadingOrder = readingOrder;
+        }
+
///
public override string ToString()
{
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs
index 7437700c..c0979f26 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs
@@ -1,13 +1,12 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
+ using Content;
+ using Core;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
- using Content;
- using Core;
- using Geometry;
///
/// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line.
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs
index 7a4b4ada..6259fd76 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs
@@ -1,10 +1,10 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
+ using Content;
+ using Core;
using System;
using System.Collections.Generic;
using System.Linq;
- using Content;
- using Core;
///
/// A line of text.
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs
index 62068f33..b08164a0 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs
@@ -1,11 +1,11 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
- using System;
- using System.Collections.Generic;
- using System.Linq;
using Content;
using Core;
using Geometry;
+ using System;
+ using System.Collections.Generic;
+ using System.Linq;
///
/// A top-down algorithm that finds a cover of the background whitespace of a document in terms of maximal empty rectangles.
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
similarity index 98%
rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/NearestNeighbourWordExtractor.cs
rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
index ad250064..6ba4edea 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/NearestNeighbourWordExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
@@ -1,10 +1,10 @@
-namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor
{
+ using Content;
+ using Core;
using System;
using System.Collections.Generic;
using System.Linq;
- using Content;
- using Core;
using Util;
///