From 83889cfb52b1780da7e685b87f248eaa20018134 Mon Sep 17 00:00:00 2001 From: BobLd Date: Tue, 6 Aug 2019 15:24:16 +0100 Subject: [PATCH 1/6] Document Layout Analysis - Text edges extractor Text edges are where words have either there BoundingBox's left, right or mid coordinate aligned on the same vertical line. Useful to detect tables, justified text, lists, etc. --- .../PublicApiScannerTests.cs | 1 + .../TextEdgesExtractor.cs | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 2b26f516..54a98e37 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -65,6 +65,7 @@ "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf", + "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Fonts.DescriptorFontFile", diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs new file mode 100644 index 00000000..d07eab70 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -0,0 +1,83 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Text edges extractor. Text edges are where words have either there BoundingBox's left, right or mid coordinate aligned on the same vertical line. 
+ /// Useful to detect tables, justified text, lists, etc. + /// + public class TextEdgesExtractor + { + /// + /// Functions used to define left, middle and right edges. + /// + private static readonly Tuple>[] edgesFuncs = new Tuple>[] + { + Tuple.Create>("left", x => Math.Round(x.Left, 0)), // use BoundingBox's left coordinate + Tuple.Create>("mid", x => Math.Round(x.Left + x.Width / 2, 0)), // use BoundingBox's mid coordinate + Tuple.Create>("right", x => Math.Round(x.Right, 0)) // use BoundingBox's right coordinate + }; + + /// + /// Get the text edges. + /// + /// The words in the page. + /// The minimum number of elements to define a text edge. + public static Dictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4) + { + var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text.Trim())); + + ConcurrentDictionary> dictionary = new ConcurrentDictionary>(); + + Parallel.ForEach(edgesFuncs, f => + { + dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements)); + }); + return dictionary.ToDictionary(x => x.Key, x => x.Value); + } + + private static List GetVerticalEdges(IEnumerable pageWords, Func func, int minimumElements) + { + Dictionary> edges = pageWords.GroupBy(x => func(x.BoundingBox)) + .Where(x => x.Count() >= minimumElements).ToDictionary(gdc => gdc.Key, gdc => gdc.ToList()); + Dictionary>> cleanEdges = new Dictionary>>(); + + foreach (var edge in edges) + { + var sortedEdges = edge.Value.OrderBy(x => x.BoundingBox.Bottom).ToList(); + cleanEdges.Add(edge.Key, new List>()); + + var cuttings = pageWords.Except(edge.Value) // remove selected words + // words that cut the vertical line + .Where(x => x.BoundingBox.Left < edge.Key && x.BoundingBox.Right > edge.Key) + // and that are within the boundaries of the edge + .Where(k => k.BoundingBox.Bottom > edge.Value.Min(z => z.BoundingBox.Bottom) + && k.BoundingBox.Top < edge.Value.Max(z => z.BoundingBox.Top)) + .OrderBy(x => x.BoundingBox.Bottom).ToList(); + + 
if (cuttings.Count > 0) + { + foreach (var cut in cuttings) + { + var group1 = sortedEdges.Where(x => x.BoundingBox.Top < cut.BoundingBox.Bottom).ToList(); + if (group1.Count >= minimumElements) cleanEdges[edge.Key].Add(group1); + sortedEdges = sortedEdges.Except(group1).ToList(); + } + if (sortedEdges.Count >= minimumElements) cleanEdges[edge.Key].Add(sortedEdges); + } + else + { + cleanEdges[edge.Key].Add(sortedEdges); + } + } + + return cleanEdges.SelectMany(x => x.Value.Select(y => new PdfLine(x.Key, y.Min(w => w.BoundingBox.Bottom), x.Key, y.Max(w => w.BoundingBox.Top)))).ToList(); + } + } +} From 9694b1f8e89ea6fce751ef43a020ec0d3edac967 Mon Sep 17 00:00:00 2001 From: BobLd <38405645+BobLd@users.noreply.github.com> Date: Tue, 6 Aug 2019 15:27:16 +0100 Subject: [PATCH 2/6] Update TextEdgesExtractor.cs --- .../DocumentLayoutAnalysis/TextEdgesExtractor.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs index d07eab70..957e6acf 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -9,8 +9,8 @@ using UglyToad.PdfPig.Geometry; namespace UglyToad.PdfPig.DocumentLayoutAnalysis { /// - /// Text edges extractor. Text edges are where words have either there BoundingBox's left, right or mid coordinate aligned on the same vertical line. - /// Useful to detect tables, justified text, lists, etc. + /// Text edges extractor. Text edges are where words have either there BoundingBox's left, right or mid coordinates aligned on the same vertical line. + /// Useful to detect text columns, tables, justified text, lists, etc. 
/// public class TextEdgesExtractor { From 85d5bb7c7e09aa0e932d0f994781780d6755b4ad Mon Sep 17 00:00:00 2001 From: BobLd Date: Wed, 7 Aug 2019 13:45:57 +0100 Subject: [PATCH 3/6] Adding enum EdgeType --- .../TextEdgesExtractor.cs | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs index 957e6acf..20f84ae4 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -80,4 +80,25 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return cleanEdges.SelectMany(x => x.Value.Select(y => new PdfLine(x.Key, y.Min(w => w.BoundingBox.Bottom), x.Key, y.Max(w => w.BoundingBox.Top)))).ToList(); } } + + /// + /// The type of edge. + /// + public enum EdgeType + { + /// + /// Text edges where words have their BoundingBox's left coordinate aligned on the same vertical line. + /// + Left = 0, + + /// + /// Text edges where words have their BoundingBox's mid coordinate aligned on the same vertical line. + /// + Mid = 1, + + /// + /// Text edges where words have their BoundingBox's right coordinate aligned on the same vertical line. 
+ /// + Right = 2 + } } From e19b03035ef56e7e7e506b639af1dc262addcbec Mon Sep 17 00:00:00 2001 From: BobLd Date: Wed, 7 Aug 2019 13:49:05 +0100 Subject: [PATCH 4/6] Updating with comments --- .../TextEdgesExtractor.cs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs index 20f84ae4..c2207184 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -9,7 +9,7 @@ using UglyToad.PdfPig.Geometry; namespace UglyToad.PdfPig.DocumentLayoutAnalysis { /// - /// Text edges extractor. Text edges are where words have either there BoundingBox's left, right or mid coordinates aligned on the same vertical line. + /// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line. /// Useful to detect text columns, tables, justified text, lists, etc. /// public class TextEdgesExtractor @@ -17,11 +17,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// Functions used to define left, middle and right edges. 
/// - private static readonly Tuple>[] edgesFuncs = new Tuple>[] + private static readonly Tuple>[] edgesFuncs = new Tuple>[] { - Tuple.Create>("left", x => Math.Round(x.Left, 0)), // use BoundingBox's left coordinate - Tuple.Create>("mid", x => Math.Round(x.Left + x.Width / 2, 0)), // use BoundingBox's mid coordinate - Tuple.Create>("right", x => Math.Round(x.Right, 0)) // use BoundingBox's right coordinate + Tuple.Create>(EdgeType.Left, x => Math.Round(x.Left, 0)), // use BoundingBox's left coordinate + Tuple.Create>(EdgeType.Mid, x => Math.Round(x.Left + x.Width / 2, 0)), // use BoundingBox's mid coordinate + Tuple.Create>(EdgeType.Right, x => Math.Round(x.Right, 0)) // use BoundingBox's right coordinate }; /// @@ -29,11 +29,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// The words in the page. /// The minimum number of elements to define a text edge. - public static Dictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4) + public static IReadOnlyDictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4) { + if (minimumElements < 0) + { + throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", "minimumElements"); + } + var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text.Trim())); - ConcurrentDictionary> dictionary = new ConcurrentDictionary>(); + ConcurrentDictionary> dictionary = new ConcurrentDictionary>(); Parallel.ForEach(edgesFuncs, f => { From 7de6de3780a231b1cff1736ceed315a2c2721c1b Mon Sep 17 00:00:00 2001 From: BobLd Date: Wed, 7 Aug 2019 13:50:07 +0100 Subject: [PATCH 5/6] Updating with comments --- .../DocumentLayoutAnalysis/TextEdgesExtractor.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs index c2207184..2cd6175f 100644 --- 
a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -12,7 +12,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line. /// Useful to detect text columns, tables, justified text, lists, etc. /// - public class TextEdgesExtractor + public static class TextEdgesExtractor { /// /// Functions used to define left, middle and right edges. From 801ea3ba7f9710e9d63e6ebf74dc799b7335bbbf Mon Sep 17 00:00:00 2001 From: BobLd Date: Wed, 7 Aug 2019 14:22:39 +0100 Subject: [PATCH 6/6] Modified PublicApiScannerTests --- src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs | 1 + src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs | 10 ++++++++++ .../DocumentLayoutAnalysis/TextEdgesExtractor.cs | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 54a98e37..865a6240 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -66,6 +66,7 @@ "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf", "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor", + "UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Fonts.DescriptorFontFile", diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs new file mode 100644 index 00000000..ffb821be --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs @@ -0,0 +1,10 @@ +using System; +using 
System.Collections.Generic; +using System.Text; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + class Docstrum + { + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs index 2cd6175f..83fc7661 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -87,7 +87,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } /// - /// The type of edge. + /// The type of text edge. /// public enum EdgeType {