diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 2b26f516..865a6240 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -65,6 +65,8 @@
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs
new file mode 100644
index 00000000..ffb821be
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs
@@ -0,0 +1,14 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Empty placeholder type: it declares no members yet.
+    /// NOTE(review): presumably reserved for a future Docstrum layout analysis implementation - confirm, or leave the file out until it is needed.
+    /// </summary>
+    internal class Docstrum
+    {
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
new file mode 100644
index 00000000..83fc7661
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
@@ -0,0 +1,143 @@
+using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line.
+    /// Useful to detect text columns, tables, justified text, lists, etc.
+    /// </summary>
+    public static class TextEdgesExtractor
+    {
+        // NOTE(review): the generic type arguments were not visible in the reviewed patch. Word, PdfRectangle and
+        // PdfLine follow from how the values are used; 'decimal' as the coordinate type is an assumption - confirm
+        // against PdfRectangle before merging.
+
+        /// <summary>
+        /// Functions used to define left, middle and right edges.
+        /// Each one maps a bounding box to the x coordinate, rounded with <c>Math.Round(value, 0)</c>, that words are grouped on.
+        /// </summary>
+        private static readonly Tuple<EdgeType, Func<PdfRectangle, decimal>>[] edgesFuncs =
+        {
+            Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Left, x => Math.Round(x.Left, 0)), // use BoundingBox's left coordinate
+            Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Mid, x => Math.Round(x.Left + x.Width / 2, 0)), // use BoundingBox's mid coordinate
+            Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Right, x => Math.Round(x.Right, 0)) // use BoundingBox's right coordinate
+        };
+
+        /// <summary>
+        /// Get the text edges.
+        /// </summary>
+        /// <param name="pageWords">The words in the page. Words whose text is null, empty or white space are ignored.</param>
+        /// <param name="minimumElements">The minimum number of elements to define a text edge.</param>
+        /// <returns>For each <see cref="EdgeType"/>, the vertical lines found for that kind of alignment.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="pageWords"/> is null.</exception>
+        /// <exception cref="ArgumentException"><paramref name="minimumElements"/> is negative.</exception>
+        public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4)
+        {
+            if (pageWords == null)
+            {
+                throw new ArgumentNullException(nameof(pageWords));
+            }
+
+            if (minimumElements < 0)
+            {
+                throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should not be negative.", nameof(minimumElements));
+            }
+
+            // Materialise the filtered words once: the sequence is enumerated many times below, and from several
+            // threads at once, so a deferred query over the caller's enumerable must not be handed to the workers.
+            var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
+
+            var dictionary = new ConcurrentDictionary<EdgeType, List<PdfLine>>();
+
+            Parallel.ForEach(edgesFuncs, f =>
+            {
+                dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
+            });
+
+            return dictionary.ToDictionary(x => x.Key, x => x.Value);
+        }
+
+        /// <summary>
+        /// Get the vertical edges for one edge definition: words are grouped by the coordinate returned by <paramref name="func"/>,
+        /// then each group is split wherever a word outside the group crosses its vertical line.
+        /// </summary>
+        private static List<PdfLine> GetVerticalEdges(IEnumerable<Word> pageWords, Func<PdfRectangle, decimal> func, int minimumElements)
+        {
+            var edges = pageWords.GroupBy(x => func(x.BoundingBox))
+                .Where(x => x.Count() >= minimumElements)
+                .ToDictionary(g => g.Key, g => g.ToList());
+
+            var lines = new List<PdfLine>();
+
+            foreach (var edge in edges)
+            {
+                var remaining = edge.Value.OrderBy(x => x.BoundingBox.Bottom).ToList();
+
+                // Vertical extent of the whole group, computed once instead of once per candidate word.
+                var edgeBottom = edge.Value.Min(z => z.BoundingBox.Bottom);
+                var edgeTop = edge.Value.Max(z => z.BoundingBox.Top);
+
+                var cuttings = pageWords.Except(edge.Value) // remove selected words
+                    // words that cut the vertical line
+                    .Where(x => x.BoundingBox.Left < edge.Key && x.BoundingBox.Right > edge.Key)
+                    // and that are within the boundaries of the edge
+                    .Where(k => k.BoundingBox.Bottom > edgeBottom && k.BoundingBox.Top < edgeTop)
+                    .OrderBy(x => x.BoundingBox.Bottom)
+                    .ToList();
+
+                var segments = new List<List<Word>>();
+
+                // Walk the cutting words bottom-up; the aligned words lying entirely below a cut form one segment.
+                foreach (var cut in cuttings)
+                {
+                    var below = remaining.Where(x => x.BoundingBox.Top < cut.BoundingBox.Bottom).ToList();
+                    segments.Add(below);
+                    remaining = remaining.Where(x => x.BoundingBox.Top >= cut.BoundingBox.Bottom).ToList();
+                }
+
+                // Whatever is left sits above the last cut, or is the whole group when nothing cuts the line.
+                segments.Add(remaining);
+
+                foreach (var segment in segments)
+                {
+                    // An empty segment has no extent (Min/Max would throw); it can only pass the size test when minimumElements is 0.
+                    if (segment.Count == 0 || segment.Count < minimumElements)
+                    {
+                        continue;
+                    }
+
+                    lines.Add(new PdfLine(edge.Key, segment.Min(w => w.BoundingBox.Bottom), edge.Key, segment.Max(w => w.BoundingBox.Top)));
+                }
+            }
+
+            return lines;
+        }
+    }
+
+ /// <summary>
+ /// The type of text edge.
+ /// </summary>
+ public enum EdgeType
+ {
+ /// <summary>
+ /// Text edges where words have their BoundingBox's left coordinate aligned on the same vertical line.
+ /// </summary>
+ Left = 0,
+
+ /// <summary>
+ /// Text edges where words have their BoundingBox's mid coordinate aligned on the same vertical line.
+ /// </summary>
+ Mid = 1,
+
+ /// <summary>
+ /// Text edges where words have their BoundingBox's right coordinate aligned on the same vertical line.
+ /// </summary>
+ Right = 2
+ }
+}