Document Layout Analysis - Text edges extractor

Text edges are where words have either there BoundingBox's left, right or mid coordinate aligned on the same vertical line.
Useful to detect tables, justified text, lists, etc.
This commit is contained in:
BobLd
2019-08-06 15:24:16 +01:00
parent f07ab7d2c3
commit 83889cfb52
2 changed files with 84 additions and 0 deletions

View File

@@ -65,6 +65,7 @@
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
"UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
"UglyToad.PdfPig.Fonts.DescriptorFontFile",

View File

@@ -0,0 +1,83 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Geometry;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// Text edges extractor. Text edges are where words have either there BoundingBox's left, right or mid coordinate aligned on the same vertical line.
/// <para>Useful to detect tables, justified text, lists, etc.</para>
/// </summary>
public class TextEdgesExtractor
{
/// <summary>
/// Functions used to define left, middle and right edges.
/// </summary>
private static readonly Tuple<string, Func<PdfRectangle, decimal>>[] edgesFuncs = new Tuple<string, Func<PdfRectangle, decimal>>[]
{
Tuple.Create<string, Func<PdfRectangle, decimal>>("left", x => Math.Round(x.Left, 0)), // use BoundingBox's left coordinate
Tuple.Create<string, Func<PdfRectangle, decimal>>("mid", x => Math.Round(x.Left + x.Width / 2, 0)), // use BoundingBox's mid coordinate
Tuple.Create<string, Func<PdfRectangle, decimal>>("right", x => Math.Round(x.Right, 0)) // use BoundingBox's right coordinate
};
/// <summary>
/// Get the text edges.
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="minimumElements">The minimum number of elements to define a text edge.</param>
public static Dictionary<string, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4)
{
var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text.Trim()));
ConcurrentDictionary<string, List<PdfLine>> dictionary = new ConcurrentDictionary<string, List<PdfLine>>();
Parallel.ForEach(edgesFuncs, f =>
{
dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
});
return dictionary.ToDictionary(x => x.Key, x => x.Value);
}
private static List<PdfLine> GetVerticalEdges(IEnumerable<Word> pageWords, Func<PdfRectangle, decimal> func, int minimumElements)
{
Dictionary<decimal, List<Word>> edges = pageWords.GroupBy(x => func(x.BoundingBox))
.Where(x => x.Count() >= minimumElements).ToDictionary(gdc => gdc.Key, gdc => gdc.ToList());
Dictionary<decimal, List<List<Word>>> cleanEdges = new Dictionary<decimal, List<List<Word>>>();
foreach (var edge in edges)
{
var sortedEdges = edge.Value.OrderBy(x => x.BoundingBox.Bottom).ToList();
cleanEdges.Add(edge.Key, new List<List<Word>>());
var cuttings = pageWords.Except(edge.Value) // remove selected words
// words that cut the vertical line
.Where(x => x.BoundingBox.Left < edge.Key && x.BoundingBox.Right > edge.Key)
// and that are within the boundaries of the edge
.Where(k => k.BoundingBox.Bottom > edge.Value.Min(z => z.BoundingBox.Bottom)
&& k.BoundingBox.Top < edge.Value.Max(z => z.BoundingBox.Top))
.OrderBy(x => x.BoundingBox.Bottom).ToList();
if (cuttings.Count > 0)
{
foreach (var cut in cuttings)
{
var group1 = sortedEdges.Where(x => x.BoundingBox.Top < cut.BoundingBox.Bottom).ToList();
if (group1.Count >= minimumElements) cleanEdges[edge.Key].Add(group1);
sortedEdges = sortedEdges.Except(group1).ToList();
}
if (sortedEdges.Count >= minimumElements) cleanEdges[edge.Key].Add(sortedEdges);
}
else
{
cleanEdges[edge.Key].Add(sortedEdges);
}
}
return cleanEdges.SelectMany(x => x.Value.Select(y => new PdfLine(x.Key, y.Min(w => w.BoundingBox.Bottom), x.Key, y.Max(w => w.BoundingBox.Top)))).ToList();
}
}
}