mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
merge pull request #50 from BobLd/master
document layout analysis - text edges extractor
This commit is contained in:
@@ -65,6 +65,8 @@
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
|
||||
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
|
||||
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
|
||||
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
|
||||
|
||||
10
src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs
Normal file
10
src/UglyToad.PdfPig/DocumentLayoutAnalysis/Docstrum.cs
Normal file
@@ -0,0 +1,10 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
/// <summary>
/// Placeholder type: it currently declares no members.
/// </summary>
internal class Docstrum
{
}
|
||||
}
|
||||
109
src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
Normal file
109
src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
Normal file
@@ -0,0 +1,109 @@
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using UglyToad.PdfPig.Content;
|
||||
using UglyToad.PdfPig.Geometry;
|
||||
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
/// <summary>
/// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line.
/// <para>Useful to detect text columns, tables, justified text, lists, etc.</para>
/// </summary>
public static class TextEdgesExtractor
{
    /// <summary>
    /// Functions used to define left, middle and right edges. Each one maps a word's bounding box
    /// to the x coordinate, rounded to zero decimal places, that words are grouped by.
    /// </summary>
    private static readonly Tuple<EdgeType, Func<PdfRectangle, decimal>>[] edgesFuncs = new Tuple<EdgeType, Func<PdfRectangle, decimal>>[]
    {
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Left, x => Math.Round(x.Left, 0)),                 // use BoundingBox's left coordinate
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Mid, x => Math.Round(x.Left + x.Width / 2, 0)),    // use BoundingBox's mid coordinate
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Right, x => Math.Round(x.Right, 0))                // use BoundingBox's right coordinate
    };

    /// <summary>
    /// Get the text edges.
    /// </summary>
    /// <param name="pageWords">The words in the page. Words whose text is null or white space are ignored.</param>
    /// <param name="minimumElements">The minimum number of elements to define a text edge.</param>
    /// <returns>For each <see cref="EdgeType"/>, the vertical lines found for that kind of alignment.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="pageWords"/> is null.</exception>
    /// <exception cref="ArgumentException">Thrown when <paramref name="minimumElements"/> is negative.</exception>
    public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4)
    {
        if (pageWords == null)
        {
            throw new ArgumentNullException(nameof(pageWords));
        }

        if (minimumElements < 0)
        {
            throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", nameof(minimumElements));
        }

        // Materialise once: the words are read concurrently by every worker below and several
        // times per edge, so a deferred query would re-run the caller's sequence each time.
        List<Word> cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();

        ConcurrentDictionary<EdgeType, List<PdfLine>> dictionary = new ConcurrentDictionary<EdgeType, List<PdfLine>>();

        // The three edge types are independent of each other; each worker only reads cleanWords.
        Parallel.ForEach(edgesFuncs, f =>
        {
            dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
        });

        return dictionary.ToDictionary(x => x.Key, x => x.Value);
    }

    /// <summary>
    /// Finds the vertical edges for a single alignment function.
    /// Words are grouped by the coordinate returned by <paramref name="func"/>; each group with enough
    /// words is then split wherever another word crosses the vertical line inside the group's extent.
    /// </summary>
    /// <param name="pageWords">The words to analyse.</param>
    /// <param name="func">Maps a word's bounding box to the x coordinate used for grouping.</param>
    /// <param name="minimumElements">The minimum number of words a segment needs to be kept as an edge.</param>
    private static List<PdfLine> GetVerticalEdges(IReadOnlyList<Word> pageWords, Func<PdfRectangle, decimal> func, int minimumElements)
    {
        var lines = new List<PdfLine>();

        foreach (var candidate in pageWords.GroupBy(w => func(w.BoundingBox)))
        {
            List<Word> aligned = candidate.ToList();
            if (aligned.Count < minimumElements)
            {
                continue;
            }

            decimal edgeX = candidate.Key;

            // Vertical extent of the candidate edge, computed once rather than once per page word.
            var lowest = aligned.Min(w => w.BoundingBox.Bottom);
            var highest = aligned.Max(w => w.BoundingBox.Top);

            var cuttings = pageWords.Except(aligned) // remove selected words
                // words that cut the vertical line
                .Where(w => w.BoundingBox.Left < edgeX && w.BoundingBox.Right > edgeX)
                // and that are within the boundaries of the edge
                .Where(w => w.BoundingBox.Bottom > lowest && w.BoundingBox.Top < highest)
                .OrderBy(w => w.BoundingBox.Bottom)
                .ToList();

            List<Word> remaining = aligned.OrderBy(w => w.BoundingBox.Bottom).ToList();

            // Walk the cutting words from bottom to top; everything entirely below a cut forms
            // its own segment and is removed from the words still to be assigned.
            foreach (var cut in cuttings)
            {
                List<Word> below = remaining.Where(w => w.BoundingBox.Top < cut.BoundingBox.Bottom).ToList();
                AddEdge(lines, edgeX, below, minimumElements);
                remaining = remaining.Except(below).ToList();
            }

            // Whatever is left lies above the last cut (or is the whole group when nothing cuts it).
            AddEdge(lines, edgeX, remaining, minimumElements);
        }

        return lines;
    }

    /// <summary>
    /// Adds the vertical line spanning <paramref name="words"/> at <paramref name="x"/> to <paramref name="lines"/>
    /// when the segment is non-empty and holds at least <paramref name="minimumElements"/> words.
    /// </summary>
    private static void AddEdge(List<PdfLine> lines, decimal x, List<Word> words, int minimumElements)
    {
        // An empty segment is never an edge, even when minimumElements is 0:
        // Min/Max below would throw on an empty sequence.
        if (words.Count == 0 || words.Count < minimumElements)
        {
            return;
        }

        lines.Add(new PdfLine(x, words.Min(w => w.BoundingBox.Bottom), x, words.Max(w => w.BoundingBox.Top)));
    }
}
|
||||
|
||||
/// <summary>
/// Identifies which coordinate of the words' BoundingBox is aligned to form a text edge.
/// </summary>
public enum EdgeType
{
    /// <summary>
    /// Edge made of words whose BoundingBox's left coordinate lies on the same vertical line.
    /// </summary>
    Left = 0,

    /// <summary>
    /// Edge made of words whose BoundingBox's mid coordinate lies on the same vertical line.
    /// </summary>
    Mid = 1,

    /// <summary>
    /// Edge made of words whose BoundingBox's right coordinate lies on the same vertical line.
    /// </summary>
    Right = 2
}
|
||||
}
|
||||
Reference in New Issue
Block a user