diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 2b26f516..865a6240 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -65,6 +65,8 @@
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
                 "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
+                "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
+                "UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
                 "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
                 "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
                 "UglyToad.PdfPig.Fonts.DescriptorFontFile",
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs
new file mode 100644
index 00000000..ffb821be
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs
@@ -0,0 +1,10 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    internal class Docstrum
+    {
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
new file mode 100644
index 00000000..83fc7661
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
@@ -0,0 +1,150 @@
+using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line.
+    /// Useful to detect text columns, tables, justified text, lists, etc.
+    /// </summary>
+    public static class TextEdgesExtractor
+    {
+        // NOTE(review): the coordinate type is assumed to be decimal here - confirm against PdfRectangle.
+        /// <summary>
+        /// Functions used to define left, middle and right edges.
+        /// Each maps a word's bounding box to an x coordinate rounded to zero decimal places.
+        /// </summary>
+        private static readonly Tuple<EdgeType, Func<PdfRectangle, decimal>>[] edgesFuncs = new Tuple<EdgeType, Func<PdfRectangle, decimal>>[]
+        {
+            Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Left, x => Math.Round(x.Left, 0)), // use BoundingBox's left coordinate
+            Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Mid, x => Math.Round(x.Left + x.Width / 2, 0)), // use BoundingBox's mid coordinate
+            Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Right, x => Math.Round(x.Right, 0)) // use BoundingBox's right coordinate
+        };
+
+        /// <summary>
+        /// Get the text edges.
+        /// </summary>
+        /// <param name="pageWords">The words in the page.</param>
+        /// <param name="minimumElements">The minimum number of elements to define a text edge. Must not be negative; 0 is treated as 1.</param>
+        /// <returns>The vertical lines found, keyed by the type of edge they were built from.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="pageWords"/> is null.</exception>
+        /// <exception cref="ArgumentException"><paramref name="minimumElements"/> is negative.</exception>
+        public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4)
+        {
+            if (pageWords == null)
+            {
+                throw new ArgumentNullException("pageWords");
+            }
+
+            if (minimumElements < 0)
+            {
+                throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", "minimumElements");
+            }
+
+            // An edge always holds at least one word, so 0 and 1 select the same edges. Clamping to 1
+            // stops empty groups from being kept when an edge is split (Min()/Max() fail on an empty sequence).
+            int minimum = Math.Max(1, minimumElements);
+
+            // Materialise once: the words are enumerated several times, from several threads, and
+            // Except() in GetVerticalEdges has to see the same instances that were grouped into the edge.
+            List<Word> cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
+
+            var dictionary = new ConcurrentDictionary<EdgeType, List<PdfLine>>();
+
+            Parallel.ForEach(edgesFuncs, f =>
+            {
+                dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimum));
+            });
+
+            return dictionary.ToDictionary(x => x.Key, x => x.Value);
+        }
+
+        /// <summary>
+        /// Get the vertical edges for the words grouped by the coordinate <paramref name="func"/> returns.
+        /// An edge is split wherever a word that does not belong to it crosses its vertical line.
+        /// </summary>
+        /// <param name="pageWords">The words to look for edges in.</param>
+        /// <param name="func">Maps a word's bounding box to the x coordinate the words are grouped on.</param>
+        /// <param name="minimumElements">The minimum number of words needed to keep an edge, or a part of a split edge.</param>
+        /// <returns>One vertical line per edge, or per kept part of a split edge.</returns>
+        private static List<PdfLine> GetVerticalEdges(IEnumerable<Word> pageWords, Func<PdfRectangle, decimal> func, int minimumElements)
+        {
+            var edges = pageWords.GroupBy(x => func(x.BoundingBox))
+                .Where(x => x.Count() >= minimumElements).ToDictionary(gdc => gdc.Key, gdc => gdc.ToList());
+            var lines = new List<PdfLine>();
+
+            foreach (var edge in edges)
+            {
+                var edgeX = edge.Key;
+                var sortedEdges = edge.Value.OrderBy(x => x.BoundingBox.Bottom).ToList();
+
+                // vertical extent of the edge, computed once instead of once per candidate word
+                var edgeBottom = sortedEdges[0].BoundingBox.Bottom;
+                var edgeTop = sortedEdges.Max(x => x.BoundingBox.Top);
+
+                var cuttings = pageWords.Except(edge.Value) // remove selected words
+                    // words that cut the vertical line
+                    .Where(x => x.BoundingBox.Left < edgeX && x.BoundingBox.Right > edgeX)
+                    // and that are within the boundaries of the edge
+                    .Where(k => k.BoundingBox.Bottom > edgeBottom && k.BoundingBox.Top < edgeTop)
+                    .OrderBy(x => x.BoundingBox.Bottom).ToList();
+
+                // split the edge at each cutting word, taken in order of increasing Bottom
+                foreach (var cut in cuttings)
+                {
+                    var group1 = sortedEdges.Where(x => x.BoundingBox.Top < cut.BoundingBox.Bottom).ToList();
+                    if (group1.Count >= minimumElements)
+                    {
+                        lines.Add(ToLine(edgeX, group1));
+                    }
+
+                    sortedEdges = sortedEdges.Except(group1).ToList();
+                }
+
+                // whatever was not split off by a cut (the whole edge when nothing cuts it)
+                if (sortedEdges.Count >= minimumElements)
+                {
+                    lines.Add(ToLine(edgeX, sortedEdges));
+                }
+            }
+
+            return lines;
+        }
+
+        /// <summary>
+        /// Create the vertical line at <paramref name="x"/> spanning from the lowest Bottom to the highest Top of <paramref name="words"/>.
+        /// </summary>
+        /// <param name="x">The x coordinate of the edge.</param>
+        /// <param name="words">The words of the edge. Should not be empty.</param>
+        private static PdfLine ToLine(decimal x, List<Word> words)
+        {
+            return new PdfLine(x, words.Min(w => w.BoundingBox.Bottom), x, words.Max(w => w.BoundingBox.Top));
+        }
+    }
+
+    /// <summary>
+    /// The type of text edge.
+    /// </summary>
+    public enum EdgeType
+    {
+        /// <summary>
+        /// Text edges where words have their BoundingBox's left coordinate aligned on the same vertical line.
+        /// </summary>
+        Left = 0,
+
+        /// <summary>
+        /// Text edges where words have their BoundingBox's mid coordinate aligned on the same vertical line.
+        /// </summary>
+        Mid = 1,
+
+        /// <summary>
+        /// Text edges where words have their BoundingBox's right coordinate aligned on the same vertical line.
+        /// </summary>
+        Right = 2
+    }
+}