mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-20 19:03:20 +08:00
Update with corrections
This commit is contained in:
@@ -56,7 +56,7 @@
|
|||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.NNWordExtractor",
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.NNWordExtractor",
|
||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
|
||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
|
||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeef",
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
|
||||||
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
|
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
|
||||||
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
|
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
|
||||||
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
|
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
|
||||||
|
@@ -1,4 +1,5 @@
|
|||||||
using System;
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
using UglyToad.PdfPig.Geometry;
|
using UglyToad.PdfPig.Geometry;
|
||||||
|
|
||||||
@@ -14,7 +15,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="point1">The first point.</param>
|
/// <param name="point1">The first point.</param>
|
||||||
/// <param name="point2">The second point.</param>
|
/// <param name="point2">The second point.</param>
|
||||||
/// <returns></returns>
|
|
||||||
public static double Euclidean(PdfPoint point1, PdfPoint point2)
|
public static double Euclidean(PdfPoint point1, PdfPoint point2)
|
||||||
{
|
{
|
||||||
double dx = (double)(point1.X - point2.X);
|
double dx = (double)(point1.X - point2.X);
|
||||||
@@ -29,8 +29,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// <param name="point2">The second point.</param>
|
/// <param name="point2">The second point.</param>
|
||||||
/// <param name="wX">The weight of the X coordinates. Default is 1.</param>
|
/// <param name="wX">The weight of the X coordinates. Default is 1.</param>
|
||||||
/// <param name="wY">The weight of the Y coordinates. Default is 1.</param>
|
/// <param name="wY">The weight of the Y coordinates. Default is 1.</param>
|
||||||
/// <returns></returns>
|
public static double WeightedEuclidean(PdfPoint point1, PdfPoint point2, double wX = 1.0, double wY = 1.0)
|
||||||
public static double WghtdEuclidean(PdfPoint point1, PdfPoint point2, double wX = 1.0, double wY = 1.0)
|
|
||||||
{
|
{
|
||||||
double dx = (double)(point1.X - point2.X);
|
double dx = (double)(point1.X - point2.X);
|
||||||
double dy = (double)(point1.Y - point2.Y);
|
double dy = (double)(point1.Y - point2.Y);
|
||||||
@@ -43,7 +42,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="point1">The first point.</param>
|
/// <param name="point1">The first point.</param>
|
||||||
/// <param name="point2">The second point.</param>
|
/// <param name="point2">The second point.</param>
|
||||||
/// <returns></returns>
|
|
||||||
public static double Manhattan(PdfPoint point1, PdfPoint point2)
|
public static double Manhattan(PdfPoint point1, PdfPoint point2)
|
||||||
{
|
{
|
||||||
return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y));
|
return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y));
|
||||||
@@ -54,16 +52,35 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
|
/// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
|
||||||
/// <param name="points">The list of neighbours candidates.</param>
|
/// <param name="points">The list of neighbours candidates.</param>
|
||||||
/// <param name="measure">The distance measure to use.</param>
|
/// <param name="distanceMeasure">The distance measure to use.</param>
|
||||||
/// <param name="dist">The distance between reference point, and its nearest neighbour</param>
|
/// <param name="distance">The distance between reference point, and its nearest neighbour</param>
|
||||||
/// <returns></returns>
|
public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
|
||||||
public static PdfPoint FindNearest(this PdfPoint pdfPoint, PdfPoint[] points,
|
Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
|
||||||
Func<PdfPoint, PdfPoint, double> measure, out double dist)
|
|
||||||
{
|
{
|
||||||
double d = points.Min(k => measure(k, pdfPoint));
|
if (points == null || points.Count == 0)
|
||||||
PdfPoint point = points.First(x => measure(x, pdfPoint) == d);
|
{
|
||||||
dist = d;
|
throw new ArgumentException("Distances.FindNearest(): The list of neighbours candidates is either null or empty.", "points");
|
||||||
return point;
|
}
|
||||||
|
|
||||||
|
if (distanceMeasure == null)
|
||||||
|
{
|
||||||
|
throw new ArgumentException("Distances.FindNearest(): The distance measure must not be null.", "distanceMeasure");
|
||||||
|
}
|
||||||
|
|
||||||
|
distance = double.MaxValue;
|
||||||
|
PdfPoint closestPoint = default;
|
||||||
|
|
||||||
|
for (var i = 0; i < points.Count; i++)
|
||||||
|
{
|
||||||
|
double currentDistance = distanceMeasure(points[i], pdfPoint);
|
||||||
|
if (currentDistance < distance)
|
||||||
|
{
|
||||||
|
distance = currentDistance;
|
||||||
|
closestPoint = points[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return closestPoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -71,16 +88,35 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
|
/// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
|
||||||
/// <param name="points">The list of neighbours candidates.</param>
|
/// <param name="points">The list of neighbours candidates.</param>
|
||||||
/// <param name="measure">The distance measure to use.</param>
|
/// <param name="distanceMeasure">The distance measure to use.</param>
|
||||||
/// <param name="dist">The distance between reference point, and its nearest neighbour</param>
|
/// <param name="distance">The distance between reference point, and its nearest neighbour</param>
|
||||||
/// <returns></returns>
|
public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
|
||||||
public static int FindIndexNearest(this PdfPoint pdfPoint, PdfPoint[] points,
|
Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
|
||||||
Func<PdfPoint, PdfPoint, double> measure, out double dist)
|
|
||||||
{
|
{
|
||||||
double d = points.Min(k => measure(k, pdfPoint));
|
if (points == null || points.Count == 0)
|
||||||
int index = Array.FindIndex(points, x => measure(x, pdfPoint) == d);
|
{
|
||||||
dist = d;
|
throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", "points");
|
||||||
return index;
|
}
|
||||||
|
|
||||||
|
if (distanceMeasure == null)
|
||||||
|
{
|
||||||
|
throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", "distanceMeasure");
|
||||||
|
}
|
||||||
|
|
||||||
|
distance = double.MaxValue;
|
||||||
|
int closestPointIndex = -1;
|
||||||
|
|
||||||
|
for (var i = 0; i < points.Count; i++)
|
||||||
|
{
|
||||||
|
double currentDistance = distanceMeasure(points[i], pdfPoint);
|
||||||
|
if (currentDistance < distance)
|
||||||
|
{
|
||||||
|
distance = currentDistance;
|
||||||
|
closestPointIndex = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return closestPointIndex;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -11,8 +11,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// <summary>
|
/// <summary>
|
||||||
/// Computes the mode of a sequence of float values.
|
/// Computes the mode of a sequence of float values.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="array"></param>
|
/// <param name="array">The array of floats.</param>
|
||||||
/// <returns></returns>
|
|
||||||
public static float Mode(this IEnumerable<float> array)
|
public static float Mode(this IEnumerable<float> array)
|
||||||
{
|
{
|
||||||
if (array == null || array.Count() == 0) return float.NaN;
|
if (array == null || array.Count() == 0) return float.NaN;
|
||||||
@@ -22,8 +21,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// <summary>
|
/// <summary>
|
||||||
/// Computes the mode of a sequence of decimal values.
|
/// Computes the mode of a sequence of decimal values.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="array"></param>
|
/// <param name="array">The array of decimal.</param>
|
||||||
/// <returns></returns>
|
|
||||||
public static decimal Mode(this IEnumerable<decimal> array)
|
public static decimal Mode(this IEnumerable<decimal> array)
|
||||||
{
|
{
|
||||||
return array.GroupBy(v => v).OrderByDescending(g => g.Count()).First().Key;
|
return array.GroupBy(v => v).OrderByDescending(g => g.Count()).First().Key;
|
||||||
|
@@ -12,18 +12,17 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// Nearest Neighbour Word Extractor, using the <see cref="Distances.Manhattan"/> distance.
|
/// Nearest Neighbour Word Extractor, using the <see cref="Distances.Manhattan"/> distance.
|
||||||
/// This implementation leverages bounding boxes.
|
/// This implementation leverages bounding boxes.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public class NNWordExtractor : IWordExtractor
|
public class NearestNeighbourWordExtractor : IWordExtractor
|
||||||
{
|
{
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Create an instance of Nearest Neighbour Word Extractor, <see cref="NNWordExtractor"/>.
|
/// Create an instance of Nearest Neighbour Word Extractor, <see cref="NearestNeighbourWordExtractor"/>.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public static IWordExtractor Instance { get; } = new NNWordExtractor();
|
public static IWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Gets the words.
|
/// Gets the words.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="letters"></param>
|
/// <param name="letters">The letters in the page.</param>
|
||||||
/// <returns></returns>
|
|
||||||
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
|
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
|
||||||
{
|
{
|
||||||
List<Word> wordsH = GetWords(
|
List<Word> wordsH = GetWords(
|
||||||
@@ -64,7 +63,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
///
|
/// Private method to get the words.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="pageLetters">The letters in the page, they must have
|
/// <param name="pageLetters">The letters in the page, they must have
|
||||||
/// the same text directions.</param>
|
/// the same text directions.</param>
|
||||||
@@ -72,7 +71,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
|
/// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
|
||||||
/// <param name="distMeasure">The distance measure between two start and end base line points,
|
/// <param name="distMeasure">The distance measure between two start and end base line points,
|
||||||
/// e.g. the Manhattan distance.</param>
|
/// e.g. the Manhattan distance.</param>
|
||||||
/// <returns></returns>
|
|
||||||
private static List<Word> GetWords(IEnumerable<Letter> pageLetters,
|
private static List<Word> GetWords(IEnumerable<Letter> pageLetters,
|
||||||
Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
|
Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
|
||||||
{
|
{
|
||||||
@@ -85,7 +83,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
|
|
||||||
Letter[] letters = pageLetters.ToArray();
|
Letter[] letters = pageLetters.ToArray();
|
||||||
int lettersCount = letters.Length;
|
int lettersCount = letters.Length;
|
||||||
PdfPoint[] startBaseLines = letters.Select(x => x.StartBaseLine).ToArray();
|
List<PdfPoint> startBaseLines = letters.Select(x => x.StartBaseLine).ToList();
|
||||||
|
|
||||||
int[] indexes = Enumerable.Repeat((int)-1, lettersCount).ToArray();
|
int[] indexes = Enumerable.Repeat((int)-1, lettersCount).ToArray();
|
||||||
|
|
||||||
// Find nearest neighbours indexes
|
// Find nearest neighbours indexes
|
@@ -10,14 +10,14 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
|
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
|
||||||
/// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
|
/// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
|
||||||
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
|
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
|
||||||
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha and Robert M.Haralick Ihsin T. Phillips</para>
|
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public class RecursiveXYCut
|
public class RecursiveXYCut
|
||||||
{
|
{
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the blocks.
|
/// Get the blocks.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="pageWords">The words in a page.</param>
|
/// <param name="pageWords">The words in the page.</param>
|
||||||
/// <param name="minimumWidth">The minimum width for a block.</param>
|
/// <param name="minimumWidth">The minimum width for a block.</param>
|
||||||
/// <param name="dominantFontWidth">The dominant font width.</param>
|
/// <param name="dominantFontWidth">The dominant font width.</param>
|
||||||
/// <param name="dominantFontHeight">The dominant font height.</param>
|
/// <param name="dominantFontHeight">The dominant font height.</param>
|
||||||
@@ -31,7 +31,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the blocks.
|
/// Get the blocks.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="pageWords">The words in a page.</param>
|
/// <param name="pageWords">The words in the page.</param>
|
||||||
/// <param name="minimumWidth">The minimum width for a block.</param>
|
/// <param name="minimumWidth">The minimum width for a block.</param>
|
||||||
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
|
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
|
||||||
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
|
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
|
||||||
@@ -40,24 +40,24 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
|
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
|
||||||
{
|
{
|
||||||
var root = new XYLeef(pageWords);
|
var root = new XYLeaf(pageWords);
|
||||||
return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
|
return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static XYNode VerticalCut(XYLeef leef, decimal minimumWidth,
|
private static XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
||||||
{
|
{
|
||||||
if (leef.CountWords() <= 1 || leef.BoundingBox.Width <= minimumWidth)
|
if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
|
||||||
{
|
{
|
||||||
// we stop cutting if
|
// we stop cutting if
|
||||||
// - only one word remains
|
// - only one word remains
|
||||||
// - width is too small
|
// - width is too small
|
||||||
return leef;
|
return leaf;
|
||||||
}
|
}
|
||||||
|
|
||||||
// order words left to right
|
// order words left to right
|
||||||
var words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray();
|
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray();
|
||||||
|
|
||||||
// determine dominantFontWidth and dominantFontHeight
|
// determine dominantFontWidth and dominantFontHeight
|
||||||
decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
|
decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
|
||||||
@@ -122,33 +122,33 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
if (i == wordsCount - 1) projectionProfile.Add(currentProj);
|
if (i == wordsCount - 1) projectionProfile.Add(currentProj);
|
||||||
}
|
}
|
||||||
|
|
||||||
var newLeefsEnums = projectionProfile.Select(p => leef.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1]));
|
var newLeafsEnums = projectionProfile.Select(p => leaf.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1]));
|
||||||
var newLeefs = newLeefsEnums.Where(e => e.Count() > 0).Select(e => new XYLeef(e));
|
var newLeafs = newLeafsEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e));
|
||||||
|
|
||||||
var newNodes = newLeefs.Select(l => HorizontalCut(l, minimumWidth,
|
var newNodes = newLeafs.Select(l => HorizontalCut(l, minimumWidth,
|
||||||
dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
|
dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
|
||||||
|
|
||||||
var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
|
var lost = leaf.Words.Except(newLeafsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
|
||||||
if (lost.Count > 0)
|
if (lost.Count > 0)
|
||||||
{
|
{
|
||||||
newNodes.AddRange(lost.Select(w => new XYLeef(w)));
|
newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
|
||||||
}
|
}
|
||||||
|
|
||||||
return new XYNode(newNodes);
|
return new XYNode(newNodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static XYNode HorizontalCut(XYLeef leef, decimal minimumWidth,
|
private static XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
||||||
{
|
{
|
||||||
if (leef.CountWords() <= 1)
|
if (leaf.CountWords() <= 1)
|
||||||
{
|
{
|
||||||
// we stop cutting if
|
// we stop cutting if
|
||||||
// - only one word remains
|
// - only one word remains
|
||||||
return leef;
|
return leaf;
|
||||||
}
|
}
|
||||||
|
|
||||||
var words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top
|
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top
|
||||||
|
|
||||||
// determine dominantFontWidth and dominantFontHeight
|
// determine dominantFontWidth and dominantFontHeight
|
||||||
decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
|
decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
|
||||||
@@ -199,7 +199,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
{
|
{
|
||||||
if (level >= 1)
|
if (level >= 1)
|
||||||
{
|
{
|
||||||
return leef;
|
return leaf;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -207,16 +207,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var newLeefsEnums = projectionProfile.Select(p =>
|
var newLeafsEnums = projectionProfile.Select(p =>
|
||||||
leef.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1]));
|
leaf.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1]));
|
||||||
var newLeefs = newLeefsEnums.Where(e => e.Count() > 0).Select(e => new XYLeef(e));
|
var newLeafs = newLeafsEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e));
|
||||||
var newNodes = newLeefs.Select(l => VerticalCut(l, minimumWidth,
|
var newNodes = newLeafs.Select(l => VerticalCut(l, minimumWidth,
|
||||||
dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
|
dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
|
||||||
|
|
||||||
var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
|
var lost = leaf.Words.Except(newLeafsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
|
||||||
if (lost.Count > 0)
|
if (lost.Count > 0)
|
||||||
{
|
{
|
||||||
newNodes.AddRange(lost.Select(w => new XYLeef(w)));
|
newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
|
||||||
}
|
}
|
||||||
return new XYNode(newNodes);
|
return new XYNode(newNodes);
|
||||||
}
|
}
|
||||||
@@ -228,9 +228,9 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
public class XYNode
|
public class XYNode
|
||||||
{
|
{
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Returns true if this node is a leef, false otherwise.
|
/// Returns true if this node is a leaf, false otherwise.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public virtual bool IsLeef => false;
|
public virtual bool IsLeaf => false;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The rectangle completely containing the node.
|
/// The rectangle completely containing the node.
|
||||||
@@ -255,16 +255,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Recursively gets the leefs (last nodes) of this node.
|
/// Recursively gets the leafs (last nodes) of this node.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <returns></returns>
|
/// <returns></returns>
|
||||||
public virtual List<XYLeef> GetLeefs()
|
public virtual List<XYLeaf> GetLeafs()
|
||||||
{
|
{
|
||||||
List<XYLeef> leefs = new List<XYLeef>();
|
List<XYLeaf> leafs = new List<XYLeaf>();
|
||||||
if (Children == null || Children.Count() == 0) return leefs;
|
if (Children == null || Children.Count() == 0) return leafs;
|
||||||
int level = 0;
|
int level = 0;
|
||||||
RecursiveGetLeefs(Children, ref leefs, level);
|
RecursiveGetLeafs(Children, ref leafs, level);
|
||||||
return leefs;
|
return leafs;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -297,86 +297,83 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
private void RecursiveCount(IEnumerable<XYNode> children, ref int count)
|
private void RecursiveCount(IEnumerable<XYNode> children, ref int count)
|
||||||
{
|
{
|
||||||
if (children.Count() == 0) return;
|
if (children.Count() == 0) return;
|
||||||
foreach (XYNode node in children.Where(x => x.IsLeef))
|
foreach (XYNode node in children.Where(x => x.IsLeaf))
|
||||||
{
|
{
|
||||||
count += node.CountWords();
|
count += node.CountWords();
|
||||||
}
|
}
|
||||||
|
|
||||||
foreach (XYNode node in children.Where(x => !x.IsLeef))
|
foreach (XYNode node in children.Where(x => !x.IsLeaf))
|
||||||
{
|
{
|
||||||
RecursiveCount(node.Children, ref count);
|
RecursiveCount(node.Children, ref count);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void RecursiveGetLeefs(IEnumerable<XYNode> children, ref List<XYLeef> leefs, int level)
|
private void RecursiveGetLeafs(IEnumerable<XYNode> children, ref List<XYLeaf> leafs, int level)
|
||||||
{
|
{
|
||||||
if (children.Count() == 0) return;
|
if (children.Count() == 0) return;
|
||||||
bool isVerticalCut = level % 2 == 0;
|
bool isVerticalCut = level % 2 == 0;
|
||||||
|
|
||||||
foreach (XYLeef node in children.Where(x => x.IsLeef))
|
foreach (XYLeaf node in children.Where(x => x.IsLeaf))
|
||||||
{
|
{
|
||||||
leefs.Add(node);
|
leafs.Add(node);
|
||||||
}
|
}
|
||||||
|
|
||||||
level++;
|
level++;
|
||||||
|
|
||||||
IEnumerable<XYNode> notLeefs = children.Where(x => !x.IsLeef);
|
IEnumerable<XYNode> notLeafs = children.Where(x => !x.IsLeaf);
|
||||||
|
|
||||||
if (isVerticalCut)
|
if (isVerticalCut)
|
||||||
{
|
{
|
||||||
notLeefs = notLeefs.OrderBy(x => x.BoundingBox.Left).ToList();
|
notLeafs = notLeafs.OrderBy(x => x.BoundingBox.Left).ToList();
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
notLeefs = notLeefs.OrderByDescending(x => x.BoundingBox.Top).ToList();
|
notLeafs = notLeafs.OrderByDescending(x => x.BoundingBox.Top).ToList();
|
||||||
}
|
}
|
||||||
|
|
||||||
foreach (XYNode node in notLeefs)
|
foreach (XYNode node in notLeafs)
|
||||||
{
|
{
|
||||||
RecursiveGetLeefs(node.Children, ref leefs, level);
|
RecursiveGetLeafs(node.Children, ref leafs, level);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public override string ToString()
|
public override string ToString()
|
||||||
{
|
{
|
||||||
return (IsLeef ? "Leef" : "Node");
|
return (IsLeaf ? "Leaf" : "Node");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// A Leef node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
|
/// A Leaf node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public class XYLeef : XYNode
|
public class XYLeaf : XYNode
|
||||||
{
|
{
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Returns true if this node is a leef, false otherwise.
|
/// Returns true if this node is a leaf, false otherwise.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public override bool IsLeef => true;
|
public override bool IsLeaf => true;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The words in the leef.
|
/// The words in the leaf.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public Word[] Words { get; set; }
|
public Word[] Words { get; set; }
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The number of words in the leef.
|
/// The number of words in the leaf.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <returns></returns>
|
|
||||||
public override int CountWords() => Words == null ? 0 : Words.Length;
|
public override int CountWords() => Words == null ? 0 : Words.Length;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Returns null as a leef doesn't have leefs.
|
/// Returns null as a leaf doesn't have leafs.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <returns></returns>
|
public override List<XYLeaf> GetLeafs()
|
||||||
public override List<XYLeef> GetLeefs()
|
|
||||||
{
|
{
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Gets the lines of the leef.
|
/// Gets the lines of the leaf.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <returns></returns>
|
|
||||||
public TextLine[] GetLines()
|
public TextLine[] GetLines()
|
||||||
{
|
{
|
||||||
var groupedWords = Words.GroupBy(x => x.BoundingBox.Bottom).ToDictionary(x => x.Key, x => x.ToList());
|
var groupedWords = Words.GroupBy(x => x.BoundingBox.Bottom).ToDictionary(x => x.Key, x => x.ToList());
|
||||||
@@ -384,19 +381,19 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Create a new <see cref="XYLeef"/>.
|
/// Create a new <see cref="XYLeaf"/>.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="words">The words contained in the leef.</param>
|
/// <param name="words">The words contained in the leaf.</param>
|
||||||
public XYLeef(params Word[] words) : this(words == null ? null : words.ToList())
|
public XYLeaf(params Word[] words) : this(words == null ? null : words.ToList())
|
||||||
{
|
{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Create a new <see cref="XYLeef"/>.
|
/// Create a new <see cref="XYLeaf"/>.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="words">The words contained in the leef.</param>
|
/// <param name="words">The words contained in the leaf.</param>
|
||||||
public XYLeef(IEnumerable<Word> words) : base(null)
|
public XYLeaf(IEnumerable<Word> words) : base(null)
|
||||||
{
|
{
|
||||||
decimal left = words.Min(b => b.BoundingBox.Left);
|
decimal left = words.Min(b => b.BoundingBox.Left);
|
||||||
decimal right = words.Max(b => b.BoundingBox.Right);
|
decimal right = words.Max(b => b.BoundingBox.Right);
|
||||||
|
Reference in New Issue
Block a user