Merge pull request #36 from BobLd/master

Document Layout Analysis Tools
This commit is contained in:
Eliot Jones
2019-06-23 11:32:50 +01:00
committed by GitHub
7 changed files with 810 additions and 0 deletions

View File

@@ -51,6 +51,12 @@
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
"UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
"UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
"UglyToad.PdfPig.DocumentLayoutAnalysis.NearestNeighbourWordExtractor",
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
"UglyToad.PdfPig.Fonts.DescriptorFontFile",

View File

@@ -0,0 +1,122 @@
using System;
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Geometry;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// Contains helpful tools for distance measures between <see cref="PdfPoint"/>s
/// and for nearest neighbour look-ups.
/// </summary>
public static class Distances
{
    /// <summary>
    /// The Euclidean distance is the "ordinary" straight-line distance between two points.
    /// </summary>
    /// <param name="point1">The first point.</param>
    /// <param name="point2">The second point.</param>
    /// <returns>The square root of the sum of the squared coordinate differences.</returns>
    public static double Euclidean(PdfPoint point1, PdfPoint point2)
    {
        double dx = (double)(point1.X - point2.X);
        double dy = (double)(point1.Y - point2.Y);
        return Math.Sqrt(dx * dx + dy * dy);
    }

    /// <summary>
    /// The weighted Euclidean distance: each squared coordinate difference is multiplied
    /// by its weight before the sum and square root are taken.
    /// </summary>
    /// <param name="point1">The first point.</param>
    /// <param name="point2">The second point.</param>
    /// <param name="wX">The weight of the X coordinates. Default is 1.</param>
    /// <param name="wY">The weight of the Y coordinates. Default is 1.</param>
    /// <returns>The weighted distance between the two points.</returns>
    public static double WeightedEuclidean(PdfPoint point1, PdfPoint point2, double wX = 1.0, double wY = 1.0)
    {
        double dx = (double)(point1.X - point2.X);
        double dy = (double)(point1.Y - point2.Y);
        return Math.Sqrt(wX * dx * dx + wY * dy * dy);
    }

    /// <summary>
    /// The Manhattan distance between two points is the sum of the absolute differences of their Cartesian coordinates.
    /// <para>Also known as rectilinear distance, L1 distance, L1 norm, snake distance, city block distance, taxicab metric.</para>
    /// </summary>
    /// <param name="point1">The first point.</param>
    /// <param name="point2">The second point.</param>
    /// <returns>The sum of the absolute X and Y differences.</returns>
    public static double Manhattan(PdfPoint point1, PdfPoint point2)
    {
        return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y));
    }

    /// <summary>
    /// Find the nearest point.
    /// </summary>
    /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
    /// <param name="points">The list of neighbours candidates.</param>
    /// <param name="distanceMeasure">The distance measure to use. It is called as
    /// <c>distanceMeasure(candidate, pdfPoint)</c>.</param>
    /// <param name="distance">The distance between reference point, and its nearest neighbour.
    /// Stays at <see cref="double.MaxValue"/> when no candidate yields a smaller value.</param>
    /// <returns>The nearest candidate, or the default <see cref="PdfPoint"/> when no candidate
    /// has a distance below <see cref="double.MaxValue"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="points"/> or
    /// <paramref name="distanceMeasure"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="points"/> is empty.</exception>
    public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
        Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
    {
        CheckArguments(points, distanceMeasure, "Distances.FindNearest()");

        int nearest = ScanForNearest(pdfPoint, points, distanceMeasure, out distance);
        return nearest < 0 ? default(PdfPoint) : points[nearest];
    }

    /// <summary>
    /// Find the index of the nearest point.
    /// </summary>
    /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
    /// <param name="points">The list of neighbours candidates.</param>
    /// <param name="distanceMeasure">The distance measure to use. It is called as
    /// <c>distanceMeasure(candidate, pdfPoint)</c>.</param>
    /// <param name="distance">The distance between reference point, and its nearest neighbour.
    /// Stays at <see cref="double.MaxValue"/> when no candidate yields a smaller value.</param>
    /// <returns>The index of the nearest candidate in <paramref name="points"/>, or -1 when no
    /// candidate has a distance below <see cref="double.MaxValue"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="points"/> or
    /// <paramref name="distanceMeasure"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="points"/> is empty.</exception>
    public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
        Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
    {
        CheckArguments(points, distanceMeasure, "Distances.FindIndexNearest()");

        return ScanForNearest(pdfPoint, points, distanceMeasure, out distance);
    }

    /// <summary>
    /// Shared argument validation for the nearest neighbour look-ups.
    /// <see cref="ArgumentNullException"/> derives from <see cref="ArgumentException"/>, so callers
    /// catching the latter keep working.
    /// </summary>
    private static void CheckArguments(IReadOnlyList<PdfPoint> points,
        Func<PdfPoint, PdfPoint, double> distanceMeasure, string caller)
    {
        if (points == null)
        {
            throw new ArgumentNullException(nameof(points),
                caller + ": The list of neighbours candidates is either null or empty.");
        }

        if (points.Count == 0)
        {
            throw new ArgumentException(
                caller + ": The list of neighbours candidates is either null or empty.", nameof(points));
        }

        if (distanceMeasure == null)
        {
            throw new ArgumentNullException(nameof(distanceMeasure),
                caller + ": The distance measure must not be null.");
        }
    }

    /// <summary>
    /// Linear scan returning the index of the first candidate with the strictly smallest distance,
    /// or -1 when no distance is below <see cref="double.MaxValue"/>.
    /// </summary>
    private static int ScanForNearest(PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
        Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
    {
        distance = double.MaxValue;
        int closestPointIndex = -1;

        for (var i = 0; i < points.Count; i++)
        {
            double currentDistance = distanceMeasure(points[i], pdfPoint);

            // Strict comparison: on ties the earliest candidate wins.
            if (currentDistance < distance)
            {
                distance = currentDistance;
                closestPointIndex = i;
            }
        }

        return closestPointIndex;
    }
}
}

View File

@@ -0,0 +1,30 @@
using System.Collections.Generic;
using System.Linq;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// Useful math extensions.
/// </summary>
public static class MathExtensions
{
    /// <summary>
    /// Computes the mode (most frequent value) of a sequence of float values.
    /// The sequence is enumerated exactly once.
    /// </summary>
    /// <param name="array">The sequence of floats.</param>
    /// <returns>The most frequent value; when several values share the highest count, the one
    /// that appears first in the sequence. <see cref="float.NaN"/> if the sequence is null or empty.</returns>
    public static float Mode(this IEnumerable<float> array)
    {
        if (array == null)
        {
            return float.NaN;
        }

        // OrderByDescending is a stable sort, so ties keep first-occurrence order.
        // FirstOrDefault yields null for an empty sequence, which avoids a separate Count() pass.
        var mostFrequent = array.GroupBy(v => v)
            .OrderByDescending(g => g.Count())
            .FirstOrDefault();

        return mostFrequent == null ? float.NaN : mostFrequent.Key;
    }

    /// <summary>
    /// Computes the mode (most frequent value) of a sequence of decimal values.
    /// </summary>
    /// <param name="array">The sequence of decimals.</param>
    /// <returns>The most frequent value; when several values share the highest count, the one
    /// that appears first in the sequence.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="array"/> is null.</exception>
    /// <exception cref="System.InvalidOperationException"><paramref name="array"/> is empty
    /// (decimal has no NaN equivalent to return).</exception>
    public static decimal Mode(this IEnumerable<decimal> array)
    {
        return array.GroupBy(v => v)
            .OrderByDescending(g => g.Count())
            .First()
            .Key;
    }
}
}

View File

@@ -0,0 +1,215 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Geometry;
using UglyToad.PdfPig.Util;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// Nearest Neighbour Word Extractor, using the <see cref="Distances.Manhattan"/> distance.
/// This implementation leverages bounding boxes.
/// </summary>
public class NearestNeighbourWordExtractor : IWordExtractor
{
/// <summary>
/// Gets the shared <see cref="NearestNeighbourWordExtractor"/> instance, exposed as an <see cref="IWordExtractor"/>.
/// The instance is created once by the property initializer.
/// </summary>
public static IWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
/// <summary>
/// Groups the letters of a page into words, processing each text direction separately.
/// </summary>
/// <param name="letters">The letters in the page.</param>
/// <returns>The words of the horizontal text first, followed by the words rotated by 180, 90 and
/// 270 degrees, and finally the words whose text direction is unknown. Each group is sorted
/// according to its own direction.</returns>
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
{
    var allWords = new List<Word>();

    // Horizontal text: sorted by descending bottom, then ascending left.
    IEnumerable<Letter> horizontalLetters = letters.Where(letter => letter.TextDirection == TextDirection.Horizontal);
    allWords.AddRange(
        GetWords(horizontalLetters, letter => letter.GlyphRectangle.Width, Distances.Manhattan)
            .OrderByDescending(word => word.BoundingBox.Bottom)
            .ThenBy(word => word.BoundingBox.Left));

    // Rotated by 180 degrees: sorted by ascending top, then descending right.
    IEnumerable<Letter> rotated180Letters = letters.Where(letter => letter.TextDirection == TextDirection.Rotate180);
    allWords.AddRange(
        GetWords(rotated180Letters, letter => letter.GlyphRectangle.Width, Distances.Manhattan)
            .OrderBy(word => word.BoundingBox.Top)
            .ThenByDescending(word => word.BoundingBox.Right));

    // Rotated by 90 degrees: the glyph height is the relevant metric; sorted by descending left, then ascending top.
    IEnumerable<Letter> rotated90Letters = letters.Where(letter => letter.TextDirection == TextDirection.Rotate90);
    allWords.AddRange(
        GetWords(rotated90Letters, letter => letter.GlyphRectangle.Height, Distances.Manhattan)
            .OrderByDescending(word => word.BoundingBox.Left)
            .ThenBy(word => word.BoundingBox.Top));

    // Rotated by 270 degrees: the glyph height is the relevant metric; sorted by ascending right, then descending bottom.
    IEnumerable<Letter> rotated270Letters = letters.Where(letter => letter.TextDirection == TextDirection.Rotate270);
    allWords.AddRange(
        GetWords(rotated270Letters, letter => letter.GlyphRectangle.Height, Distances.Manhattan)
            .OrderBy(word => word.BoundingBox.Right)
            .ThenByDescending(word => word.BoundingBox.Bottom));

    // Unknown direction: handled like horizontal text.
    IEnumerable<Letter> unknownLetters = letters.Where(letter => letter.TextDirection == TextDirection.Unknown);
    allWords.AddRange(
        GetWords(unknownLetters, letter => letter.GlyphRectangle.Width, Distances.Manhattan)
            .OrderByDescending(word => word.BoundingBox.Bottom)
            .ThenBy(word => word.BoundingBox.Left));

    return allWords;
}
/// <summary>
/// Private method to get the words. Each non white space letter is linked to the letter whose
/// start base line point is nearest to its own end base line point; linked letters are then
/// grouped into words.
/// </summary>
/// <param name="pageLetters">The letters in the page, they must have
/// the same text directions.</param>
/// <param name="metric">The letter's metric to use in the minimum distance
/// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
/// <param name="distMeasure">The distance measure between two start and end base line points,
/// e.g. the Manhattan distance.</param>
/// <returns>The words built from linked letters, followed by one single-letter word for every
/// letter that was not linked to any other. Empty if there are no letters.</returns>
/// <exception cref="ArgumentException">The letters do not all share the same text direction.</exception>
private static List<Word> GetWords(IEnumerable<Letter> pageLetters,
    Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
{
    if (pageLetters == null) return new List<Word>();

    // Materialise once: callers pass lazy LINQ filters, which would otherwise be re-run
    // for the emptiness check, the direction check and the array conversion.
    Letter[] letters = pageLetters.ToArray();
    int lettersCount = letters.Length;
    if (lettersCount == 0) return new List<Word>();

    TextDirection textDirection = letters[0].TextDirection;
    if (letters.Any(x => textDirection != x.TextDirection))
    {
        throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction.");
    }

    // Reading order of the letters inside a word depends on the text direction.
    Func<IEnumerable<Letter>, IReadOnlyList<Letter>> orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();
    if (textDirection == TextDirection.Rotate180)
    {
        orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Right).ToList();
    }
    else if (textDirection == TextDirection.Rotate90)
    {
        orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Top).ToList();
    }
    else if (textDirection == TextDirection.Rotate270)
    {
        orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Bottom).ToList();
    }

    List<PdfPoint> startBaseLines = letters.Select(x => x.StartBaseLine).ToList();

    // indexes[c] is the index of the letter linked to letter c, or -1 when there is none.
    int[] indexes = Enumerable.Repeat(-1, lettersCount).ToArray();

    // Find nearest neighbours indexes. Every iteration writes only to its own slot of 'indexes'.
    Parallel.For(0, lettersCount, c =>
    {
        var currentLetter = letters[c];

        // only check neighbours if not a white space
        if (string.IsNullOrWhiteSpace(currentLetter.Value)) return;

        int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist);

        // -1 means no usable distance was found; index == c means the letter is nearest to itself,
        // which would otherwise produce a word containing the same letter twice.
        if (index < 0 || index == c) return;

        var pairedLetter = letters[index];
        if (!string.IsNullOrWhiteSpace(pairedLetter.Value) &&
            string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase))
        {
            // Letters are linked only if closer than 60% of the larger of the two metrics.
            decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * 0.60m;
            if ((decimal)dist < minDist)
            {
                indexes[c] = index;
            }
        }
    });

    // Group indexes
    List<List<int>> groupedIndexes = new List<List<int>>();
    HashSet<int> indexDone = new HashSet<int>(); // indexes already present in at least one group
    for (int c = 0; c < lettersCount; c++)
    {
        int i = indexes[c];
        if (i == -1) continue;

        bool isDoneC = indexDone.Contains(c);
        bool isDoneI = indexDone.Contains(i);
        if (isDoneC || isDoneI)
        {
            if (isDoneC && !isDoneI)
            {
                foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
                {
                    pair.Add(i);
                }
                indexDone.Add(i);
            }
            else if (!isDoneC && isDoneI)
            {
                foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
                {
                    pair.Add(c);
                }
                indexDone.Add(c);
            }
            else
            {
                // Both already grouped: make sure their groups know about each other,
                // the merge step below joins them.
                foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
                {
                    if (!pair.Contains(c)) pair.Add(c);
                }
                foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
                {
                    if (!pair.Contains(i)) pair.Add(i);
                }
            }
        }
        else
        {
            List<int> pair = new List<int>() { c, i };
            groupedIndexes.Add(pair);
            indexDone.Add(c);
            indexDone.Add(i);
        }
    }

    // Merge lists with common index
    for (int c = 0; c < lettersCount; c++)
    {
        List<List<int>> candidates = groupedIndexes.Where(x => x.Any(t => t == c)).ToList();
        if (candidates.Count < 2) continue; // only one group with this index

        List<int> merged = candidates.First();
        groupedIndexes.Remove(merged);
        for (int i = 1; i < candidates.Count; i++)
        {
            var current = candidates[i];
            merged = merged.Union(current).ToList();
            groupedIndexes.Remove(current);
        }
        groupedIndexes.Add(merged);
    }

    // One word per group of linked letters.
    List<Word> words = new List<Word>();
    for (int a = 0; a < groupedIndexes.Count; a++)
    {
        List<Letter> groupedLetters = new List<Letter>();
        foreach (int s in groupedIndexes[a])
        {
            groupedLetters.Add(letters[s]);
        }
        words.Add(new Word(orderFunc(groupedLetters)));
    }

    // Letters that were not linked to anything become single-letter words.
    List<int> indexesNotDone = Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)).ToList();
    for (int n = 0; n < indexesNotDone.Count; n++)
    {
        Letter letter = letters[indexesNotDone[n]];
        words.Add(new Word(new Letter[] { letter }));
    }

    return words;
}
}
}

View File

@@ -0,0 +1,231 @@
using System;
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Content;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
/// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M. Haralick and Ihsin T. Phillips</para>
/// </summary>
public class RecursiveXYCut
{
/// <summary>
/// Segments the page into blocks, estimating the dominant font size from the letters themselves.
/// The dominant width is the mode of the letter widths and the dominant height is 1.5 times
/// the mode of the letter heights, both rounded to 3 decimals.
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="minimumWidth">The minimum width for a block.</param>
/// <returns>The root of the resulting X-Y tree.</returns>
public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth = 0)
{
    Func<IEnumerable<decimal>, decimal> widthFromMode = widths => Math.Round(widths.Mode(), 3);
    Func<IEnumerable<decimal>, decimal> heightFromMode = heights => Math.Round(heights.Mode() * 1.5m, 3);

    return GetBlocks(pageWords, minimumWidth, widthFromMode, heightFromMode);
}
/// <summary>
/// Segments the page into blocks using fixed dominant font dimensions.
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="minimumWidth">The minimum width for a block.</param>
/// <param name="dominantFontWidth">The dominant font width, used as is at every cut.</param>
/// <param name="dominantFontHeight">The dominant font height, used as is at every cut.</param>
/// <returns>The root of the resulting X-Y tree.</returns>
public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
    decimal dominantFontWidth, decimal dominantFontHeight)
{
    // Wrap the constants so they fit the function based overload; the letter sizes are ignored.
    Func<IEnumerable<decimal>, decimal> constantWidth = ignoredWidths => dominantFontWidth;
    Func<IEnumerable<decimal>, decimal> constantHeight = ignoredHeights => dominantFontHeight;

    return GetBlocks(pageWords, minimumWidth, constantWidth, constantHeight);
}
/// <summary>
/// Segments the page into blocks. The cutting starts with a vertical cut on a root leaf
/// holding every word of the page.
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="minimumWidth">The minimum width for a block.</param>
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
/// <returns>The root of the resulting X-Y tree; a node without children when the page has no words.</returns>
/// <exception cref="ArgumentException"><paramref name="pageWords"/> is null.</exception>
public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
    Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
    Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
{
    // Materialise once so a lazy sequence is not re-evaluated by the emptiness check and the leaf.
    List<Word> words = pageWords?.ToList();

    if (words != null && words.Count == 0)
    {
        // A page without words has no bounding box to compute: return an empty tree
        // instead of failing inside the leaf constructor.
        return new XYNode();
    }

    // A null sequence is still rejected by the leaf constructor, as before.
    var root = new XYLeaf(words); // Create a root node.
    return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
}
/// <summary>
/// Cuts a leaf vertically: the non white space words are projected on the X axis and the leaf
/// is split wherever the horizontal gap between two projections is wider than the dominant
/// font width. Each resulting part is then cut horizontally.
/// </summary>
/// <param name="leaf">The leaf to cut.</param>
/// <param name="minimumWidth">The minimum width for a block; narrower projections are not cut off.</param>
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height;
/// only forwarded to the horizontal cut.</param>
/// <param name="level">Counter forwarded to the horizontal cut, which uses it to stop the recursion.</param>
/// <returns>The leaf itself when it cannot be cut, otherwise a node holding the cut parts.</returns>
private static XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth,
    Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
    Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
{
    if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
    {
        // we stop cutting if
        // - only one word remains
        // - width is too small
        return leaf;
    }

    // order words left to right
    var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray();
    int wordsCount = words.Length;

    if (wordsCount == 0)
    {
        // The leaf only contains white space words: there is nothing to project.
        return leaf;
    }

    // determine dominantFontWidth (the dominant height is not needed for a vertical cut)
    decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
        .Select(x => Math.Abs(x.GlyphRectangle.Width)));

    // Each entry is a [left, right] interval on the X axis.
    List<decimal[]> projectionProfile = new List<decimal[]>();
    decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Left, words[0].BoundingBox.Right };

    for (int i = 1; i < wordsCount; i++)
    {
        if ((words[i].BoundingBox.Left >= currentProj[0] && words[i].BoundingBox.Left <= currentProj[1])
            || (words[i].BoundingBox.Right >= currentProj[0] && words[i].BoundingBox.Right <= currentProj[1]))
        {
            // it is overlapping
            if (words[i].BoundingBox.Left >= currentProj[0]
                && words[i].BoundingBox.Left <= currentProj[1]
                && words[i].BoundingBox.Right > currentProj[1])
            {
                // |____|
                //    |____|
                // |_______|    <- updated
                currentProj[1] = words[i].BoundingBox.Right;
            }

            // we ignore the following cases:
            //    |____|
            // |____|          (not possible because of OrderBy)
            //
            //    |____|
            //|___________|    (not possible because of OrderBy)
            //
            //  |____|
            //   |_|
        }
        else
        {
            // no overlap
            if (words[i].BoundingBox.Left - currentProj[1] <= domFontWidth)
            {
                // if gap too small -> don't cut
                // |____| |____|
                currentProj[1] = words[i].BoundingBox.Right;
            }
            else if (currentProj[1] - currentProj[0] < minimumWidth)
            {
                // still too small
                currentProj[1] = words[i].BoundingBox.Right;
            }
            else
            {
                // if gap big enough -> cut!
                // |____|   |   |____|
                if (i != wordsCount - 1) // will always add the last one after
                {
                    projectionProfile.Add(currentProj);
                    currentProj = new decimal[2] { words[i].BoundingBox.Left, words[i].BoundingBox.Right };
                }
            }
        }

        if (i == wordsCount - 1) projectionProfile.Add(currentProj);
    }

    // Words fully contained in each projection; materialised so the filters run only once.
    List<List<Word>> wordsPerProjection = projectionProfile
        .Select(p => leaf.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1]).ToList())
        .ToList();

    List<XYNode> newNodes = wordsPerProjection
        .Where(group => group.Count > 0)
        .Select(group => HorizontalCut(new XYLeaf(group), minimumWidth,
            dominantFontWidthFunc, dominantFontHeightFunc, level))
        .ToList();

    // Non white space words that ended up in no projection each get their own leaf.
    var lost = leaf.Words.Except(wordsPerProjection.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
    if (lost.Count > 0)
    {
        newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
    }

    return new XYNode(newNodes);
}
/// <summary>
/// Cuts a leaf horizontally: the non white space words are projected on the Y axis and the leaf
/// is split wherever the vertical gap between two projections is taller than the dominant
/// font height. Each resulting part is then cut vertically.
/// </summary>
/// <param name="leaf">The leaf to cut.</param>
/// <param name="minimumWidth">The minimum width for a block; only forwarded to the vertical cut.</param>
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width;
/// only forwarded to the vertical cut.</param>
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
/// <param name="level">Number of times a horizontal cut already produced a single projection;
/// at 1 or more, another single projection stops the recursion.</param>
/// <returns>The leaf itself when it cannot be cut, otherwise a node holding the cut parts.</returns>
private static XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth,
    Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
    Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
{
    if (leaf.CountWords() <= 1)
    {
        // we stop cutting if
        // - only one word remains
        return leaf;
    }

    var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top
    int wordsCount = words.Length;

    if (wordsCount == 0)
    {
        // The leaf only contains white space words: there is nothing to project.
        return leaf;
    }

    // determine dominantFontHeight (the dominant width is not needed for a horizontal cut)
    decimal domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters)
        .Select(x => Math.Abs(x.GlyphRectangle.Height)));

    // Each entry is a [bottom, top] interval on the Y axis.
    List<decimal[]> projectionProfile = new List<decimal[]>();
    decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Bottom, words[0].BoundingBox.Top };

    for (int i = 1; i < wordsCount; i++)
    {
        if ((words[i].BoundingBox.Bottom >= currentProj[0] && words[i].BoundingBox.Bottom <= currentProj[1])
            || (words[i].BoundingBox.Top >= currentProj[0] && words[i].BoundingBox.Top <= currentProj[1]))
        {
            // it is overlapping
            if (words[i].BoundingBox.Bottom >= currentProj[0]
                && words[i].BoundingBox.Bottom <= currentProj[1]
                && words[i].BoundingBox.Top > currentProj[1])
            {
                // extend the current projection upwards
                currentProj[1] = words[i].BoundingBox.Top;
            }
        }
        else
        {
            // no overlap
            if (words[i].BoundingBox.Bottom - currentProj[1] <= domFontHeight)
            {
                // if gap too small -> don't cut
                // |____| |____|
                currentProj[1] = words[i].BoundingBox.Top;
            }
            else
            {
                // if gap big enough -> cut!
                // |____|   |   |____|
                if (i != wordsCount - 1) // will always add the last one after
                {
                    projectionProfile.Add(currentProj);
                    currentProj = new decimal[2] { words[i].BoundingBox.Bottom, words[i].BoundingBox.Top };
                }
            }
        }

        if (i == wordsCount - 1) projectionProfile.Add(currentProj);
    }

    if (projectionProfile.Count == 1)
    {
        // Nothing was cut. Allow one more vertical attempt, then give up to end the recursion.
        if (level >= 1)
        {
            return leaf;
        }
        else
        {
            level++;
        }
    }

    // Words fully contained in each projection; materialised so the filters run only once.
    List<List<Word>> wordsPerProjection = projectionProfile
        .Select(p => leaf.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1]).ToList())
        .ToList();

    List<XYNode> newNodes = wordsPerProjection
        .Where(group => group.Count > 0)
        .Select(group => VerticalCut(new XYLeaf(group), minimumWidth,
            dominantFontWidthFunc, dominantFontHeightFunc, level))
        .ToList();

    // Non white space words that ended up in no projection each get their own leaf.
    var lost = leaf.Words.Except(wordsPerProjection.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
    if (lost.Count > 0)
    {
        newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
    }

    return new XYNode(newNodes);
}
}
}

View File

@@ -0,0 +1,76 @@
using System;
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Geometry;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// A Leaf node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
/// </summary>
public class XYLeaf : XYNode
{
    /// <summary>
    /// Returns true if this node is a leaf, false otherwise.
    /// </summary>
    public override bool IsLeaf => true;

    /// <summary>
    /// The words in the leaf.
    /// </summary>
    public IReadOnlyList<Word> Words { get; }

    /// <summary>
    /// The number of words in the leaf.
    /// </summary>
    public override int CountWords() => Words == null ? 0 : Words.Count;

    /// <summary>
    /// Returns null as a leaf doesn't have leafs.
    /// </summary>
    public override List<XYLeaf> GetLeafs()
    {
        return null;
    }

    /// <summary>
    /// Gets the lines of the leaf: words sharing exactly the same bounding box bottom form
    /// one line, and lines are returned from the highest bottom to the lowest.
    /// </summary>
    public IReadOnlyList<TextLine> GetLines()
    {
        return Words.GroupBy(x => x.BoundingBox.Bottom)
            .OrderByDescending(x => x.Key)
            .Select(x => new TextLine(x.ToList()))
            .ToArray();
    }

    /// <summary>
    /// Create a new <see cref="XYLeaf"/>.
    /// </summary>
    /// <param name="words">The words contained in the leaf.</param>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
    /// <exception cref="InvalidOperationException"><paramref name="words"/> is empty.</exception>
    public XYLeaf(params Word[] words) : this((IEnumerable<Word>)words)
    {
    }

    /// <summary>
    /// Create a new <see cref="XYLeaf"/>. The bounding box is the smallest rectangle containing
    /// the bounding boxes of all the words.
    /// </summary>
    /// <param name="words">The words contained in the leaf.</param>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.
    /// This type derives from <see cref="ArgumentException"/>.</exception>
    /// <exception cref="InvalidOperationException"><paramref name="words"/> is empty, so no
    /// bounding box can be computed.</exception>
    public XYLeaf(IEnumerable<Word> words) : base(null)
    {
        if (words == null)
        {
            throw new ArgumentNullException(nameof(words), "XYLeaf(): The words contained in the leaf cannot be null.");
        }

        // Materialise once: the sequence is often a lazy LINQ query, and it is needed
        // for the four extrema as well as for the stored list.
        Word[] leafWords = words.ToArray();

        decimal left = leafWords.Min(b => b.BoundingBox.Left);
        decimal right = leafWords.Max(b => b.BoundingBox.Right);
        decimal bottom = leafWords.Min(b => b.BoundingBox.Bottom);
        decimal top = leafWords.Max(b => b.BoundingBox.Top);

        BoundingBox = new PdfRectangle(left, bottom, right, top);
        Words = leafWords;
    }
}
}

View File

@@ -0,0 +1,130 @@
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Geometry;
using UglyToad.PdfPig.Util;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
/// </summary>
public class XYNode
{
    /// <summary>
    /// Returns true if this node is a leaf, false otherwise.
    /// </summary>
    public virtual bool IsLeaf => false;

    /// <summary>
    /// The rectangle completely containing the node.
    /// </summary>
    public PdfRectangle BoundingBox { get; set; }

    /// <summary>
    /// The children of the node.
    /// </summary>
    public XYNode[] Children { get; set; }

    /// <summary>
    /// Recursively counts the words included in this node.
    /// </summary>
    public virtual int CountWords()
    {
        return CountWordsIn(Children);
    }

    /// <summary>
    /// Recursively gets the leafs (last nodes) of this node.
    /// </summary>
    /// <returns>The leafs found below this node; empty when the node has no children.</returns>
    public virtual List<XYLeaf> GetLeafs()
    {
        List<XYLeaf> leafs = new List<XYLeaf>();
        if (Children == null || Children.Length == 0) return leafs;

        CollectLeafs(Children, leafs, 0);
        return leafs;
    }

    /// <summary>
    /// Create a new <see cref="XYNode"/>.
    /// </summary>
    /// <param name="children">The node's children.</param>
    public XYNode(params XYNode[] children)
        : this((IEnumerable<XYNode>)children)
    {
    }

    /// <summary>
    /// Create a new <see cref="XYNode"/>. When there is at least one child, the bounding box is
    /// the smallest rectangle containing the bounding boxes of all the children.
    /// </summary>
    /// <param name="children">The node's children. Null or empty creates a node without children.</param>
    public XYNode(IEnumerable<XYNode> children)
    {
        // Materialise once: the sequence may be lazy and is needed for the extrema and for storage.
        XYNode[] nodes = children?.ToArray();

        if (nodes != null && nodes.Length != 0)
        {
            Children = nodes;

            decimal left = nodes.Min(b => b.BoundingBox.Left);
            decimal right = nodes.Max(b => b.BoundingBox.Right);
            decimal bottom = nodes.Min(b => b.BoundingBox.Bottom);
            decimal top = nodes.Max(b => b.BoundingBox.Top);
            BoundingBox = new PdfRectangle(left, bottom, right, top);
        }
        else
        {
            Children = EmptyArray<XYNode>.Instance;
        }
    }

    /// <summary>
    /// Sums the words of the leafs found in <paramref name="children"/> and below.
    /// </summary>
    private static int CountWordsIn(IEnumerable<XYNode> children)
    {
        // Children has a public setter, so it may have been set to null.
        if (children == null) return 0;

        int count = 0;
        foreach (XYNode node in children)
        {
            count += node.IsLeaf ? node.CountWords() : CountWordsIn(node.Children);
        }

        return count;
    }

    /// <summary>
    /// Appends to <paramref name="leafs"/> the leafs found in <paramref name="children"/>, then
    /// descends into the other nodes. Nodes are visited left to right on even levels and top to
    /// bottom on odd levels.
    /// </summary>
    private static void CollectLeafs(IEnumerable<XYNode> children, List<XYLeaf> leafs, int level)
    {
        if (children == null) return;

        // Traverse a snapshot so a lazy sequence is only evaluated once.
        List<XYNode> nodes = children.ToList();
        if (nodes.Count == 0) return;

        bool isVerticalCut = level % 2 == 0;

        // The leafs of this level come first, in their stored order.
        leafs.AddRange(nodes.Where(x => x.IsLeaf).Cast<XYLeaf>());

        IEnumerable<XYNode> notLeafs = nodes.Where(x => !x.IsLeaf);
        notLeafs = isVerticalCut
            ? notLeafs.OrderBy(x => x.BoundingBox.Left).ToList()
            : notLeafs.OrderByDescending(x => x.BoundingBox.Top).ToList();

        foreach (XYNode node in notLeafs)
        {
            CollectLeafs(node.Children, leafs, level + 1);
        }
    }

    /// <summary>
    /// Returns "Leaf" for a leaf and "Node" otherwise.
    /// </summary>
    public override string ToString()
    {
        return (IsLeaf ? "Leaf" : "Node");
    }
}
}