mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 19:54:52 +08:00
Adding Document Layout Analysis:
- Nearest Neighbour Word Extractor - Recursive X-Y Cut algorithm, useful for multi-column pdf documents
This commit is contained in:
@@ -51,6 +51,10 @@
|
|||||||
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
|
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
|
||||||
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
|
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
|
||||||
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
|
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
|
||||||
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
|
||||||
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
|
||||||
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.NNWordExtractor",
|
||||||
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
|
||||||
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
|
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
|
||||||
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
|
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
|
||||||
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
|
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
|
||||||
|
86
src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
Normal file
86
src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
using System;
|
||||||
|
using System.Linq;
|
||||||
|
using UglyToad.PdfPig.Geometry;
|
||||||
|
|
||||||
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||||
|
{
|
||||||
|
/// <summary>
/// Contains helpful tools for distance measures.
/// </summary>
public static class Distances
{
    /// <summary>
    /// The Euclidean distance is the "ordinary" straight-line distance between two points.
    /// </summary>
    /// <param name="point1">The first point.</param>
    /// <param name="point2">The second point.</param>
    /// <returns>The square root of the sum of the squared coordinate differences.</returns>
    public static double Euclidean(PdfPoint point1, PdfPoint point2)
    {
        double dx = (double)(point1.X - point2.X);
        double dy = (double)(point1.Y - point2.Y);
        return Math.Sqrt(dx * dx + dy * dy);
    }

    /// <summary>
    /// The weighted Euclidean distance.
    /// </summary>
    /// <param name="point1">The first point.</param>
    /// <param name="point2">The second point.</param>
    /// <param name="wX">The weight of the X coordinates. Default is 1.</param>
    /// <param name="wY">The weight of the Y coordinates. Default is 1.</param>
    /// <returns>The square root of the weighted sum of the squared coordinate differences.</returns>
    public static double WghtdEuclidean(PdfPoint point1, PdfPoint point2, double wX = 1.0, double wY = 1.0)
    {
        double dx = (double)(point1.X - point2.X);
        double dy = (double)(point1.Y - point2.Y);
        return Math.Sqrt(wX * dx * dx + wY * dy * dy);
    }

    /// <summary>
    /// The Manhattan distance between two points is the sum of the absolute differences of their Cartesian coordinates.
    /// <para>Also known as rectilinear distance, L1 distance, L1 norm, snake distance, city block distance, taxicab metric.</para>
    /// </summary>
    /// <param name="point1">The first point.</param>
    /// <param name="point2">The second point.</param>
    /// <returns>The sum of the absolute X and Y differences.</returns>
    public static double Manhattan(PdfPoint point1, PdfPoint point2)
    {
        return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y));
    }

    /// <summary>
    /// Find the nearest point.
    /// </summary>
    /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
    /// <param name="points">The list of neighbours candidates.</param>
    /// <param name="measure">The distance measure to use. It is called as <c>measure(candidate, pdfPoint)</c>.</param>
    /// <param name="dist">The distance between reference point, and its nearest neighbour.</param>
    /// <returns>The first candidate that has the smallest distance to the reference point.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="points"/> or <paramref name="measure"/> is null.</exception>
    /// <exception cref="InvalidOperationException"><paramref name="points"/> is empty, or no candidate has a comparable (non-NaN) distance.</exception>
    public static PdfPoint FindNearest(this PdfPoint pdfPoint, PdfPoint[] points,
        Func<PdfPoint, PdfPoint, double> measure, out double dist)
    {
        int index = pdfPoint.FindIndexNearest(points, measure, out dist);
        if (index < 0)
        {
            throw new InvalidOperationException("No candidate point has a comparable distance to the reference point.");
        }

        return points[index];
    }

    /// <summary>
    /// Find the index of the nearest point.
    /// </summary>
    /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
    /// <param name="points">The list of neighbours candidates.</param>
    /// <param name="measure">The distance measure to use. It is called as <c>measure(candidate, pdfPoint)</c>.</param>
    /// <param name="dist">The distance between reference point, and its nearest neighbour;
    /// <see cref="double.NaN"/> when -1 is returned.</param>
    /// <returns>The index of the first candidate that has the smallest distance, or -1 when
    /// every candidate distance is NaN.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="points"/> or <paramref name="measure"/> is null.</exception>
    /// <exception cref="InvalidOperationException"><paramref name="points"/> is empty.</exception>
    public static int FindIndexNearest(this PdfPoint pdfPoint, PdfPoint[] points,
        Func<PdfPoint, PdfPoint, double> measure, out double dist)
    {
        if (points == null)
        {
            throw new ArgumentNullException(nameof(points));
        }

        if (measure == null)
        {
            throw new ArgumentNullException(nameof(measure));
        }

        if (points.Length == 0)
        {
            throw new InvalidOperationException("Cannot find the nearest point in an empty list of candidates.");
        }

        // Single pass: each distance is computed exactly once and no floating-point
        // equality test is needed to locate the minimum again.
        int nearest = -1;
        dist = double.NaN;
        for (int i = 0; i < points.Length; i++)
        {
            double current = measure(points[i], pdfPoint);
            if (double.IsNaN(current))
            {
                // NaN cannot be ordered; such candidates are ignored.
                continue;
            }

            // Strict comparison keeps the first candidate in case of ties.
            if (nearest < 0 || current < dist)
            {
                nearest = i;
                dist = current;
            }
        }

        return nearest;
    }
}
|
||||||
|
}
|
32
src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs
Normal file
32
src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
|
||||||
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||||
|
{
|
||||||
|
/// <summary>
/// Useful math extensions.
/// </summary>
public static class MathExtensions
{
    /// <summary>
    /// Computes the mode (most frequent value) of a sequence of float values.
    /// The sequence is enumerated exactly once.
    /// </summary>
    /// <param name="array">The values; may be null or empty.</param>
    /// <returns>The most frequent value. When several values share the highest count, the one
    /// that appears first in the sequence is returned. <see cref="float.NaN"/> is returned
    /// when the sequence is null or empty.</returns>
    public static float Mode(this IEnumerable<float> array)
    {
        if (array == null)
        {
            return float.NaN;
        }

        // OrderByDescending is a stable sort, so ties keep first-seen order.
        var mostFrequent = array.GroupBy(v => v).OrderByDescending(g => g.Count()).FirstOrDefault();
        return mostFrequent == null ? float.NaN : mostFrequent.Key;
    }

    /// <summary>
    /// Computes the mode (most frequent value) of a sequence of decimal values.
    /// The sequence is enumerated exactly once.
    /// </summary>
    /// <param name="array">The values; must contain at least one element.</param>
    /// <returns>The most frequent value. When several values share the highest count, the one
    /// that appears first in the sequence is returned.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="array"/> is null.</exception>
    /// <exception cref="System.InvalidOperationException"><paramref name="array"/> is empty
    /// (decimal has no NaN value to signal a missing result).</exception>
    public static decimal Mode(this IEnumerable<decimal> array)
    {
        if (array == null)
        {
            throw new System.ArgumentNullException(nameof(array));
        }

        var mostFrequent = array.GroupBy(v => v).OrderByDescending(g => g.Count()).FirstOrDefault();
        if (mostFrequent == null)
        {
            throw new System.InvalidOperationException("Cannot compute the mode of an empty sequence.");
        }

        return mostFrequent.Key;
    }
}
|
||||||
|
}
|
200
src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs
Normal file
200
src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs
Normal file
@@ -0,0 +1,200 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using UglyToad.PdfPig.Content;
|
||||||
|
using UglyToad.PdfPig.Geometry;
|
||||||
|
using UglyToad.PdfPig.Util;
|
||||||
|
|
||||||
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||||
|
{
|
||||||
|
/// <summary>
/// Nearest Neighbour Word Extractor, using the <see cref="Distances.Manhattan"/> distance.
/// This implementation leverages bounding boxes.
/// </summary>
public class NNWordExtractor : IWordExtractor
{
    /// <summary>
    /// Create an instance of Nearest Neighbour Word Extractor, <see cref="NNWordExtractor"/>.
    /// </summary>
    public static IWordExtractor Instance { get; } = new NNWordExtractor();

    /// <summary>
    /// Gets the words. Letters are processed separately for each text direction and the
    /// results are concatenated in this order: horizontal, rotated 180, rotated 90,
    /// rotated 270, unknown.
    /// </summary>
    /// <param name="letters">The letters of the page.</param>
    /// <returns>The words built from the letters, sorted within each text direction.</returns>
    public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
    {
        List<Word> words = new List<Word>();

        // Horizontal: top to bottom, then left to right.
        words.AddRange(GetWords(
                letters.Where(l => l.TextDirection == TextDirection.Horizontal),
                l => l.GlyphRectangle.Width, Distances.Manhattan)
            .OrderByDescending(x => x.BoundingBox.Bottom)
            .ThenBy(x => x.BoundingBox.Left));

        // Rotated 180: ascending top, then right to left.
        words.AddRange(GetWords(
                letters.Where(l => l.TextDirection == TextDirection.Rotate180),
                l => l.GlyphRectangle.Width, Distances.Manhattan)
            .OrderBy(x => x.BoundingBox.Top)
            .ThenByDescending(x => x.BoundingBox.Right));

        // Rotated 90: descending left, then ascending top.
        words.AddRange(GetWords(
                letters.Where(l => l.TextDirection == TextDirection.Rotate90),
                l => l.GlyphRectangle.Height, Distances.Manhattan)
            .OrderByDescending(x => x.BoundingBox.Left)
            .ThenBy(x => x.BoundingBox.Top));

        // Rotated 270: ascending right, then descending bottom.
        words.AddRange(GetWords(
                letters.Where(l => l.TextDirection == TextDirection.Rotate270),
                l => l.GlyphRectangle.Height, Distances.Manhattan)
            .OrderBy(x => x.BoundingBox.Right)
            .ThenByDescending(x => x.BoundingBox.Bottom));

        // Unknown direction: handled like horizontal text.
        words.AddRange(GetWords(
                letters.Where(l => l.TextDirection == TextDirection.Unknown),
                l => l.GlyphRectangle.Width, Distances.Manhattan)
            .OrderByDescending(x => x.BoundingBox.Bottom)
            .ThenBy(x => x.BoundingBox.Left));

        return words;
    }

    /// <summary>
    /// Groups letters into words by linking each non-whitespace letter to the letter whose
    /// start base line point is nearest to its own end base line point. Two letters are linked
    /// only when they share the same font name (case-insensitive) and the distance is below
    /// 60% of the larger of their two metrics.
    /// </summary>
    /// <param name="pageLetters">The letters in the page, they must have
    /// the same text directions.</param>
    /// <param name="metric">The letter's metric to use in the minimum distance
    /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
    /// <param name="distMeasure">The distance measure between two start and end base line points,
    /// e.g. the Manhattan distance.</param>
    /// <returns>One word per group of linked letters, followed by one single-letter word for
    /// every letter that was not linked. Empty when there are no letters.</returns>
    /// <exception cref="ArgumentException">The letters do not all have the same text direction.</exception>
    private static List<Word> GetWords(IEnumerable<Letter> pageLetters,
        Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
    {
        if (pageLetters == null)
        {
            return new List<Word>();
        }

        // Materialise once: callers pass lazy LINQ queries that would otherwise be
        // re-evaluated by every Count()/ElementAt()/Any() call.
        Letter[] letters = pageLetters.ToArray();
        int lettersCount = letters.Length;
        if (lettersCount == 0)
        {
            return new List<Word>();
        }

        var direction = letters[0].TextDirection;
        if (letters.Any(x => x.TextDirection != direction))
        {
            throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction.");
        }

        PdfPoint[] startBaseLines = letters.Select(x => x.StartBaseLine).ToArray();

        // indexes[c] is the index of the letter linked to letter c, or -1 when there is none.
        int[] indexes = Enumerable.Repeat(-1, lettersCount).ToArray();

        // Find nearest neighbours indexes. Each iteration writes only its own slot of 'indexes'.
        Parallel.For(0, lettersCount, c =>
        {
            var currentLetter = letters[c];

            // only check neighbours if not a white space
            if (string.IsNullOrWhiteSpace(currentLetter.Value))
            {
                return;
            }

            int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist);

            // No usable neighbour, or the letter's nearest start point is its own:
            // linking a letter to itself would put it twice in the same word.
            if (index < 0 || index == c)
            {
                return;
            }

            var pairedLetter = letters[index];

            if (!string.IsNullOrWhiteSpace(pairedLetter.Value) &&
                string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase))
            {
                decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * 0.60m;
                if ((decimal)dist < minDist)
                {
                    indexes[c] = index;
                }
            }
        });

        // Group indexes
        List<List<int>> groupedIndexes = new List<List<int>>();
        HashSet<int> indexDone = new HashSet<int>(); // O(1) membership tests
        for (int c = 0; c < lettersCount; c++)
        {
            int i = indexes[c];
            if (i == -1)
            {
                continue;
            }

            bool isDoneC = indexDone.Contains(c);
            bool isDoneI = indexDone.Contains(i);

            if (!isDoneC && !isDoneI)
            {
                // Neither letter belongs to a group yet: start a new one.
                groupedIndexes.Add(new List<int>() { c, i });
                indexDone.Add(c);
                indexDone.Add(i);
            }
            else if (isDoneC && !isDoneI)
            {
                foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
                {
                    pair.Add(i);
                }

                indexDone.Add(i);
            }
            else if (!isDoneC && isDoneI)
            {
                foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
                {
                    pair.Add(c);
                }

                indexDone.Add(c);
            }
            else
            {
                // Both letters are already grouped: cross-register them so that the
                // merge step below can join their groups.
                foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
                {
                    if (!pair.Contains(c))
                    {
                        pair.Add(c);
                    }
                }

                foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
                {
                    if (!pair.Contains(i))
                    {
                        pair.Add(i);
                    }
                }
            }
        }

        // Merge lists with common index
        for (int c = 0; c < lettersCount; c++)
        {
            List<List<int>> candidates = groupedIndexes.Where(x => x.Any(t => t == c)).ToList();
            if (candidates.Count < 2)
            {
                continue; // only one group with this index
            }

            List<int> merged = candidates.First();
            groupedIndexes.Remove(merged);
            for (int i = 1; i < candidates.Count; i++)
            {
                var current = candidates[i];
                merged = merged.Union(current).ToList();
                groupedIndexes.Remove(current);
            }

            groupedIndexes.Add(merged);
        }

        List<Word> words = new List<Word>();
        foreach (List<int> group in groupedIndexes)
        {
            List<Letter> groupedLetters = new List<Letter>();
            foreach (int s in group)
            {
                groupedLetters.Add(letters[s]);
            }

            words.Add(new Word(groupedLetters));
        }

        // Every letter that was never linked becomes a word of its own.
        List<int> indexesNotDone = Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)).ToList();
        foreach (int n in indexesNotDone)
        {
            words.Add(new Word(new Letter[] { letters[n] }));
        }

        return words;
    }
}
|
||||||
|
}
|
411
src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
Normal file
411
src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
Normal file
@@ -0,0 +1,411 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using UglyToad.PdfPig.Content;
|
||||||
|
using UglyToad.PdfPig.Geometry;
|
||||||
|
|
||||||
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||||
|
{
|
||||||
|
/// <summary>
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
/// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha and Robert M.Haralick Ihsin T. Phillips</para>
/// </summary>
public class RecursiveXYCut
{
    /// <summary>
    /// Get the blocks, using fixed dominant font dimensions.
    /// </summary>
    /// <param name="pageWords">The words in a page.</param>
    /// <param name="minimumWidht">The minimum width for a block.</param>
    /// <param name="dominantFontWidth">The dominant font width.</param>
    /// <param name="dominantFontHeight">The dominant font height.</param>
    /// <returns>The root of the tree of blocks.</returns>
    public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidht,
        decimal dominantFontWidth, decimal dominantFontHeight)
    {
        return GetBlocks(pageWords, minimumWidht, k => dominantFontWidth, k => dominantFontHeight);
    }

    /// <summary>
    /// Get the blocks. The page is first cut vertically, then each part is cut
    /// horizontally, and so on recursively.
    /// </summary>
    /// <param name="pageWords">The words in a page.</param>
    /// <param name="minimumWidht">The minimum width for a block.</param>
    /// <param name="dominantFontWidthFunc">The function that determines the dominant font width
    /// from the absolute glyph widths of the letters being cut.</param>
    /// <param name="dominantFontHeightFunc">The function that determines the dominant font height
    /// from the absolute glyph heights of the letters being cut.</param>
    /// <returns>The root of the tree of blocks.</returns>
    public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidht,
        Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
        Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
    {
        var root = new XYLeef(pageWords);
        return VerticalCut(root, minimumWidht, dominantFontWidthFunc, dominantFontHeightFunc);
    }

    /// <summary>
    /// Cuts a leaf into side-by-side parts wherever the horizontal gap between words is wider
    /// than the dominant font width, then cuts each part horizontally.
    /// Only the width function is evaluated here; the height function is passed on.
    /// </summary>
    private static XYNode VerticalCut(XYLeef leef, decimal minimumWidht,
        Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
        Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
    {
        if (leef.CountWords() <= 1 || leef.BoundingBox.Width <= minimumWidht)
        {
            // we stop cutting if
            // - only one word remains
            // - width is too small
            return leef;
        }

        // order words left to right
        Word[] words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text))
            .OrderBy(w => w.BoundingBox.Left).ToArray();

        if (words.Length == 0)
        {
            // The leaf only holds white space words: nothing to cut
            // (and words[0] below would be out of range).
            return leef;
        }

        decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
            .Select(x => Math.Abs(x.GlyphRectangle.Width)));

        List<decimal[]> projectionProfile = GetProjectionProfile(words,
            w => w.BoundingBox.Left, w => w.BoundingBox.Right, domFontWidth, minimumWidht);

        return SplitLeef(leef, projectionProfile,
            w => w.BoundingBox.Left, w => w.BoundingBox.Right,
            l => HorizontalCut(l, minimumWidht, dominantFontWidthFunc, dominantFontHeightFunc, level));
    }

    /// <summary>
    /// Cuts a leaf into stacked parts wherever the vertical gap between words is taller than
    /// the dominant font height, then cuts each part vertically.
    /// Only the height function is evaluated here; the width function is passed on.
    /// </summary>
    private static XYNode HorizontalCut(XYLeef leef, decimal minimumWidht,
        Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
        Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
    {
        if (leef.CountWords() <= 1)
        {
            // we stop cutting if
            // - only one word remains
            return leef;
        }

        // order bottom to top
        Word[] words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text))
            .OrderBy(w => w.BoundingBox.Bottom).ToArray();

        if (words.Length == 0)
        {
            // The leaf only holds white space words: nothing to cut
            // (and words[0] below would be out of range).
            return leef;
        }

        decimal domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters)
            .Select(x => Math.Abs(x.GlyphRectangle.Height)));

        List<decimal[]> projectionProfile = GetProjectionProfile(words,
            w => w.BoundingBox.Bottom, w => w.BoundingBox.Top, domFontHeight, null);

        if (projectionProfile.Count == 1)
        {
            // No horizontal cut was possible. Allow one more vertical attempt, then stop;
            // this is what ends the recursion.
            if (level >= 1)
            {
                return leef;
            }

            level++;
        }

        int nextLevel = level;
        return SplitLeef(leef, projectionProfile,
            w => w.BoundingBox.Bottom, w => w.BoundingBox.Top,
            l => VerticalCut(l, minimumWidht, dominantFontWidthFunc, dominantFontHeightFunc, nextLevel));
    }

    /// <summary>
    /// Builds the projection profile of the words along one axis: a list of [start, end]
    /// intervals separated by gaps wider than <paramref name="maxGap"/>.
    /// </summary>
    /// <param name="words">The words, not empty and sorted by ascending <paramref name="start"/>.</param>
    /// <param name="start">Gets the lower coordinate of a word on the axis (left or bottom).</param>
    /// <param name="end">Gets the upper coordinate of a word on the axis (right or top).</param>
    /// <param name="maxGap">Gaps up to this size do not produce a cut.</param>
    /// <param name="minimumSize">When set, an interval smaller than this keeps growing instead
    /// of being cut.</param>
    /// <returns>The intervals. Empty when there is a single word, because an interval is only
    /// recorded while moving on to a following word.</returns>
    private static List<decimal[]> GetProjectionProfile(Word[] words,
        Func<Word, decimal> start, Func<Word, decimal> end, decimal maxGap, decimal? minimumSize)
    {
        List<decimal[]> projectionProfile = new List<decimal[]>();
        decimal[] currentProj = new decimal[2] { start(words[0]), end(words[0]) };
        int wordsCount = words.Length;

        for (int i = 1; i < wordsCount; i++)
        {
            decimal wordStart = start(words[i]);
            decimal wordEnd = end(words[i]);

            bool startInside = wordStart >= currentProj[0] && wordStart <= currentProj[1];
            bool endInside = wordEnd >= currentProj[0] && wordEnd <= currentProj[1];

            if (startInside || endInside)
            {
                // it is overlapping
                if (startInside && wordEnd > currentProj[1])
                {
                    // |____|
                    //    |____|
                    // |_______|    <- updated
                    currentProj[1] = wordEnd;
                }

                // Other overlapping cases are ignored: a word starting before the current
                // interval is not possible because of the ordering, and a word entirely
                // inside the interval changes nothing.
            }
            else
            {
                // no overlap
                if (wordStart - currentProj[1] <= maxGap)
                {
                    // if gap too small -> don't cut
                    // |____| |____|
                    currentProj[1] = wordEnd;
                }
                else if (minimumSize.HasValue && currentProj[1] - currentProj[0] < minimumSize.Value)
                {
                    // still too small
                    currentProj[1] = wordEnd;
                }
                else if (i != wordsCount - 1)
                {
                    // if gap big enough -> cut!
                    // |____|   |   |____|
                    projectionProfile.Add(currentProj);
                    currentProj = new decimal[2] { wordStart, wordEnd };
                }

                // When the last word sits after a big gap, no interval is opened for it:
                // the caller picks it up as a "lost" word and gives it a leaf of its own.
            }

            if (i == wordsCount - 1)
            {
                projectionProfile.Add(currentProj);
            }
        }

        return projectionProfile;
    }

    /// <summary>
    /// Distributes the words of a leaf over the intervals of a projection profile, cuts each
    /// non-empty group further with <paramref name="cutChild"/>, and wraps the result in a node.
    /// Non white space words that fit in no interval each get a leaf of their own.
    /// </summary>
    private static XYNode SplitLeef(XYLeef leef, List<decimal[]> projectionProfile,
        Func<Word, decimal> start, Func<Word, decimal> end, Func<XYLeef, XYNode> cutChild)
    {
        // Materialised once; the groups are needed both to build the children and to
        // detect the lost words.
        List<List<Word>> groups = projectionProfile
            .Select(p => leef.Words.Where(w => start(w) >= p[0] && end(w) <= p[1]).ToList())
            .ToList();

        List<XYNode> newNodes = groups
            .Where(g => g.Count > 0)
            .Select(g => cutChild(new XYLeef(g)))
            .ToList();

        List<Word> lost = leef.Words.Except(groups.SelectMany(g => g))
            .Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();

        if (lost.Count > 0)
        {
            newNodes.AddRange(lost.Select(w => new XYLeef(w)));
        }

        return new XYNode(newNodes);
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
/// </summary>
public class XYNode
{
    /// <summary>
    /// Returns true if this node is a leef, false otherwise.
    /// </summary>
    public virtual bool IsLeef => false;

    /// <summary>
    /// The rectangle completely containing the node.
    /// It is only computed by the constructor when the node is given at least one child.
    /// </summary>
    public PdfRectangle BoundingBox { get; set; }

    /// <summary>
    /// The children of the node. Null when the node was created without children.
    /// </summary>
    public XYNode[] Children { get; set; }

    /// <summary>
    /// Recursively counts the words included in this node.
    /// </summary>
    /// <returns>The sum of the word counts of all the leefs below this node; 0 when there are no children.</returns>
    public virtual int CountWords()
    {
        return CountWordsOf(Children);
    }

    /// <summary>
    /// Recursively gets the leefs (last nodes) of this node.
    /// </summary>
    /// <returns>The leefs below this node; an empty list when there are no children.</returns>
    public virtual List<XYLeef> GetLeefs()
    {
        List<XYLeef> leefs = new List<XYLeef>();
        CollectLeefs(Children, leefs, 0);
        return leefs;
    }

    /// <summary>
    /// Create a new <see cref="XYNode"/>.
    /// </summary>
    /// <param name="children">The children of the node; may be null or empty.</param>
    public XYNode(params XYNode[] children)
        : this(children?.ToList())
    {
    }

    /// <summary>
    /// Create a new <see cref="XYNode"/>. The bounding box is the smallest rectangle
    /// containing the bounding boxes of all the children.
    /// </summary>
    /// <param name="children">The children of the node; may be null or empty, in which case
    /// <see cref="Children"/> and <see cref="BoundingBox"/> are left unset.</param>
    public XYNode(IEnumerable<XYNode> children)
    {
        // Materialise once so that a lazy sequence is not evaluated for every aggregate below.
        XYNode[] nodes = children?.ToArray();
        if (nodes == null || nodes.Length == 0)
        {
            return;
        }

        Children = nodes;
        decimal left = nodes.Min(b => b.BoundingBox.Left);
        decimal right = nodes.Max(b => b.BoundingBox.Right);
        decimal bottom = nodes.Min(b => b.BoundingBox.Bottom);
        decimal top = nodes.Max(b => b.BoundingBox.Top);
        BoundingBox = new PdfRectangle(left, bottom, right, top);
    }

    /// <summary>
    /// Sums the word counts of the leefs found below the given nodes. Leefs are asked through
    /// <see cref="CountWords"/>; other nodes are descended into through their children.
    /// </summary>
    private static int CountWordsOf(XYNode[] children)
    {
        // A node created without children has Children == null: it simply holds no word.
        if (children == null || children.Length == 0)
        {
            return 0;
        }

        int count = 0;
        foreach (XYNode node in children)
        {
            count += node.IsLeef ? node.CountWords() : CountWordsOf(node.Children);
        }

        return count;
    }

    /// <summary>
    /// Appends the leefs found below the given nodes. At each level the direct leefs are added
    /// first, in their stored order. The other nodes are then visited left to right on even
    /// levels and top to bottom on odd levels.
    /// </summary>
    private static void CollectLeefs(XYNode[] children, List<XYLeef> leefs, int level)
    {
        // A node created without children has Children == null: it contributes no leef.
        if (children == null || children.Length == 0)
        {
            return;
        }

        bool isVerticalCut = level % 2 == 0;

        foreach (XYLeef leef in children.Where(x => x.IsLeef))
        {
            leefs.Add(leef);
        }

        IEnumerable<XYNode> notLeefs = children.Where(x => !x.IsLeef);
        notLeefs = isVerticalCut
            ? notLeefs.OrderBy(x => x.BoundingBox.Left).ToList()
            : notLeefs.OrderByDescending(x => x.BoundingBox.Top).ToList();

        foreach (XYNode node in notLeefs)
        {
            CollectLeefs(node.Children, leefs, level + 1);
        }
    }

    /// <summary>
    /// Returns "Leef" when this node is a leef, "Node" otherwise.
    /// </summary>
    /// <returns>The kind of the node.</returns>
    public override string ToString()
    {
        return IsLeef ? "Leef" : "Node";
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// A Leef node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
/// </summary>
public class XYLeef : XYNode
{
    /// <summary>
    /// Returns true if this node is a leef, false otherwise.
    /// </summary>
    public override bool IsLeef => true;

    /// <summary>
    /// The words in the leef.
    /// </summary>
    public Word[] Words { get; set; }

    /// <summary>
    /// The number of words in the leef.
    /// </summary>
    /// <returns>The length of <see cref="Words"/>, or 0 when it is null.</returns>
    public override int CountWords() => Words == null ? 0 : Words.Length;

    /// <summary>
    /// Returns null as a leef doesn't have leefs.
    /// </summary>
    /// <returns>Always null.</returns>
    public override List<XYLeef> GetLeefs()
    {
        return null;
    }

    /// <summary>
    /// Gets the lines of the leef. Words sharing exactly the same bounding box bottom form
    /// a line; lines are returned from the highest bottom to the lowest.
    /// </summary>
    /// <returns>The lines; an empty array when <see cref="Words"/> is null.</returns>
    public TextLine[] GetLines()
    {
        if (Words == null)
        {
            // Consistent with CountWords(), which also treats a null Words as "no words".
            return new TextLine[0];
        }

        return Words.GroupBy(x => x.BoundingBox.Bottom)
            .OrderByDescending(g => g.Key)
            .Select(g => new TextLine(g.ToList()))
            .ToArray();
    }

    /// <summary>
    /// Create a new <see cref="XYLeef"/>.
    /// </summary>
    /// <param name="words">The words contained in the leef.</param>
    public XYLeef(params Word[] words) : this(words?.ToList())
    {
    }

    /// <summary>
    /// Create a new <see cref="XYLeef"/>. The bounding box is the smallest rectangle
    /// containing the bounding boxes of all the words.
    /// </summary>
    /// <param name="words">The words contained in the leef; at least one is required.</param>
    /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
    /// <exception cref="InvalidOperationException"><paramref name="words"/> is empty.</exception>
    public XYLeef(IEnumerable<Word> words) : base(null)
    {
        if (words == null)
        {
            throw new ArgumentNullException(nameof(words));
        }

        // Materialise once: callers pass lazy queries that would otherwise be evaluated
        // again for each of the aggregates below.
        Word[] all = words.ToArray();

        // Min/Max throw InvalidOperationException on an empty sequence.
        decimal left = all.Min(b => b.BoundingBox.Left);
        decimal right = all.Max(b => b.BoundingBox.Right);
        decimal bottom = all.Min(b => b.BoundingBox.Bottom);
        decimal top = all.Max(b => b.BoundingBox.Top);

        BoundingBox = new PdfRectangle(left, bottom, right, top);
        Words = all;
    }
}
|
||||||
|
}
|
Reference in New Issue
Block a user