From 83889cfb52b1780da7e685b87f248eaa20018134 Mon Sep 17 00:00:00 2001 From: BobLd Date: Tue, 6 Aug 2019 15:24:16 +0100 Subject: [PATCH] Document Layout Analysis - Text edges extractor Text edges are where words have either their BoundingBox's left, right or mid coordinate aligned on the same vertical line. Useful to detect tables, justified text, lists, etc. --- .../PublicApiScannerTests.cs | 1 + .../TextEdgesExtractor.cs | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 2b26f516..54a98e37 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -65,6 +65,7 @@ "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf", + "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Fonts.DescriptorFontFile", diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs new file mode 100644 index 00000000..d07eab70 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -0,0 +1,83 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinate aligned on the same vertical line. + /// Useful to detect tables, justified text, lists, etc. 
+    ///
+    public class TextEdgesExtractor
+    {
+        // NOTE(review): the generic type arguments in this class were missing from the text it was
+        // restored from. Coordinates are assumed to be 'decimal' - confirm against PdfRectangle/PdfLine.
+        /// <summary>
+        /// Functions used to define left, middle and right edges.
+        /// Each entry pairs the edge name (used as key in the result of GetEdges) with the function that
+        /// maps a word's bounding box to the x coordinate, rounded to 0 decimals, the word is grouped on.
+        /// </summary>
+        private static readonly Tuple<string, Func<PdfRectangle, decimal>>[] edgesFuncs = new Tuple<string, Func<PdfRectangle, decimal>>[]
+        {
+            Tuple.Create<string, Func<PdfRectangle, decimal>>("left", x => Math.Round(x.Left, 0)),               // use BoundingBox's left coordinate
+            Tuple.Create<string, Func<PdfRectangle, decimal>>("mid", x => Math.Round(x.Left + x.Width / 2, 0)),  // use BoundingBox's mid coordinate
+            Tuple.Create<string, Func<PdfRectangle, decimal>>("right", x => Math.Round(x.Right, 0))              // use BoundingBox's right coordinate
+        };
+
+        /// <summary>
+        /// Get the text edges.
+        /// </summary>
+        /// <param name="pageWords">The words in the page.</param>
+        /// <param name="minimumElements">The minimum number of elements to define a text edge.</param>
+        /// <returns>
+        /// A dictionary keyed by edge name ("left", "mid", "right") whose values are the vertical
+        /// lines found for that edge.
+        /// </returns>
+        /// <exception cref="ArgumentNullException">Thrown when <paramref name="pageWords"/> is null.</exception>
+        public static Dictionary<string, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4)
+        {
+            if (pageWords == null)
+            {
+                throw new ArgumentNullException(nameof(pageWords));
+            }
+
+            // Materialise the filtered words once: the collection is enumerated many times, and
+            // concurrently by the workers below, so a deferred query would be re-evaluated each time.
+            // IsNullOrWhiteSpace already covers whitespace-only (and null) text, no Trim() needed.
+            List<Word> cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
+
+            var dictionary = new ConcurrentDictionary<string, List<PdfLine>>();
+
+            // One independent computation per edge definition (left, mid, right).
+            Parallel.ForEach(edgesFuncs, f =>
+            {
+                dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
+            });
+
+            return dictionary.ToDictionary(x => x.Key, x => x.Value);
+        }
+
+        /// <summary>
+        /// Groups the words on the x coordinate returned by <paramref name="func"/> and turns every group
+        /// that is large enough into one or more vertical lines. A group is split wherever another word
+        /// crosses its vertical line within the group's vertical extent.
+        /// </summary>
+        /// <param name="pageWords">The words to analyse; enumerated several times.</param>
+        /// <param name="func">Maps a word's bounding box to the x coordinate used for grouping.</param>
+        /// <param name="minimumElements">The minimum number of words a group or segment must contain; values below 1 are treated as 1.</param>
+        /// <returns>The vertical lines, one per retained segment.</returns>
+        private static List<PdfLine> GetVerticalEdges(IEnumerable<Word> pageWords, Func<PdfRectangle, decimal> func, int minimumElements)
+        {
+            // A segment needs at least one word, otherwise Min/Max below would throw on an empty list.
+            int required = Math.Max(1, minimumElements);
+
+            var lines = new List<PdfLine>();
+
+            var candidates = pageWords.GroupBy(x => func(x.BoundingBox))
+                                      .Where(g => g.Count() >= required)
+                                      .ToList();
+
+            foreach (var candidate in candidates)
+            {
+                var x = candidate.Key;
+                List<Word> aligned = candidate.ToList();
+                List<Word> remaining = aligned.OrderBy(w => w.BoundingBox.Bottom).ToList();
+
+                // Vertical extent of the whole group, computed once instead of once per tested word.
+                var edgeBottom = aligned.Min(w => w.BoundingBox.Bottom);
+                var edgeTop = aligned.Max(w => w.BoundingBox.Top);
+
+                var cuttings = pageWords.Except(aligned) // remove selected words
+                    // words that cut the vertical line
+                    .Where(w => w.BoundingBox.Left < x && w.BoundingBox.Right > x)
+                    // and that are within the boundaries of the edge
+                    .Where(w => w.BoundingBox.Bottom > edgeBottom && w.BoundingBox.Top < edgeTop)
+                    .OrderBy(w => w.BoundingBox.Bottom)
+                    .ToList();
+
+                if (cuttings.Count == 0)
+                {
+                    // Nothing crosses the line: the whole group is a single edge.
+                    lines.Add(ToVerticalLine(x, remaining));
+                    continue;
+                }
+
+                // Walk the cutting words bottom-up; the aligned words lying entirely below a cut
+                // form one segment and are removed before looking at the next cut.
+                foreach (var cut in cuttings)
+                {
+                    List<Word> below = remaining.Where(w => w.BoundingBox.Top < cut.BoundingBox.Bottom).ToList();
+                    if (below.Count >= required)
+                    {
+                        lines.Add(ToVerticalLine(x, below));
+                    }
+
+                    remaining = remaining.Except(below).ToList();
+                }
+
+                // Whatever is left sits above the last cut.
+                if (remaining.Count >= required)
+                {
+                    lines.Add(ToVerticalLine(x, remaining));
+                }
+            }
+
+            return lines;
+        }
+
+        /// <summary>
+        /// Builds the vertical line at <paramref name="x"/> that runs from the lowest Bottom to the
+        /// highest Top of the words in <paramref name="segment"/>.
+        /// </summary>
+        private static PdfLine ToVerticalLine(decimal x, List<Word> segment)
+        {
+            return new PdfLine(x, segment.Min(w => w.BoundingBox.Bottom), x, segment.Max(w => w.BoundingBox.Top));
+        }
+    }
+}