mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
merge pull request #105 from BobLd/master
whitespace covering algorithm and #104
This commit is contained in:
@@ -91,6 +91,7 @@
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.WhitespaceCoverExtractor",
|
||||
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
|
||||
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
|
||||
"UglyToad.PdfPig.Export.AltoXmlTextExporter",
|
||||
|
||||
@@ -22,11 +22,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(List<T> elements,
|
||||
Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
Func<T, T, double> maxDistanceFunction,
|
||||
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
|
||||
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
|
||||
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
|
||||
int maxDegreeOfParallelism)
|
||||
{
|
||||
/*************************************************************************************
|
||||
* Algorithm steps
|
||||
@@ -48,8 +52,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
int[] indexes = Enumerable.Repeat((int)-1, elements.Count).ToArray();
|
||||
var candidatesPoints = elements.Select(candidatesPoint).ToList();
|
||||
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
// 1. Find nearest neighbours indexes
|
||||
Parallel.For(0, elements.Count, e =>
|
||||
Parallel.For(0, elements.Count, parallelOptions, e =>
|
||||
{
|
||||
var pivot = elements[e];
|
||||
|
||||
@@ -85,11 +91,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(T[] elements,
|
||||
Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
Func<T, T, double> maxDistanceFunction,
|
||||
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
|
||||
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
|
||||
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
|
||||
int maxDegreeOfParallelism)
|
||||
{
|
||||
/*************************************************************************************
|
||||
* Algorithm steps
|
||||
@@ -111,8 +121,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
|
||||
var candidatesPoints = elements.Select(candidatesPoint).ToList();
|
||||
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
// 1. Find nearest neighbours indexes
|
||||
Parallel.For(0, elements.Length, e =>
|
||||
Parallel.For(0, elements.Length, parallelOptions, e =>
|
||||
{
|
||||
var pivot = elements[e];
|
||||
|
||||
@@ -148,11 +160,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
internal static IEnumerable<HashSet<int>> ClusterNearestNeighbours<T>(T[] elements,
|
||||
Func<PdfLine, PdfLine, double> distMeasure,
|
||||
Func<T, T, double> maxDistanceFunction,
|
||||
Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
|
||||
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
|
||||
Func<T, bool> filterPivot, Func<T, T, bool> filterFinal,
|
||||
int maxDegreeOfParallelism)
|
||||
{
|
||||
/*************************************************************************************
|
||||
* Algorithm steps
|
||||
@@ -174,8 +190,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
|
||||
var candidatesLines = elements.Select(x => candidatesLine(x)).ToList();
|
||||
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
// 1. Find nearest neighbours indexes
|
||||
Parallel.For(0, elements.Length, e =>
|
||||
Parallel.For(0, elements.Length, parallelOptions, e =>
|
||||
{
|
||||
var pivot = elements[e];
|
||||
|
||||
|
||||
@@ -28,11 +28,25 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// Get the blocks.
|
||||
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
|
||||
/// </summary>
|
||||
/// <param name="pageWords"></param>
|
||||
/// <returns></returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
|
||||
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
|
||||
{
|
||||
return GetBlocks(pageWords, new AngleBounds(-30, 30), new AngleBounds(-135, -45), 1.3);
|
||||
return GetBlocks(words, -1);
|
||||
}
|
||||
|
||||
/// <summary>
/// Get the blocks, limiting the number of concurrent tasks used by the analysis.
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, int maxDegreeOfParallelism)
{
    // Settings used when the caller does not supply their own:
    // within-line bounds [-30, 30], between-line bounds [-135, -45], multiplier 1.3.
    var withinLineBounds = new AngleBounds(-30, 30);
    var betweenLineBounds = new AngleBounds(-135, -45);
    const double betweenLineMultiplier = 1.3;

    return GetBlocks(words, withinLineBounds, betweenLineBounds, betweenLineMultiplier, maxDegreeOfParallelism);
}
|
||||
|
||||
/// <summary>
|
||||
@@ -46,8 +60,26 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// distance found by the analysis.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
|
||||
AngleBounds betweenLine,
|
||||
double betweenLineMultiplier)
|
||||
AngleBounds betweenLine, double betweenLineMultiplier)
|
||||
{
|
||||
return GetBlocks(words, withinLine, betweenLine, betweenLineMultiplier, -1);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks. See original paper for more information.
|
||||
/// </summary>
|
||||
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
|
||||
/// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
|
||||
/// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
|
||||
/// text lines for blocking. Maximum distance will be this number times the between-line
|
||||
/// distance found by the analysis.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
|
||||
AngleBounds betweenLine, double betweenLineMultiplier, int maxDegreeOfParallelism)
|
||||
{
|
||||
if (words == null)
|
||||
{
|
||||
@@ -74,8 +106,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
var withinLineDistList = new ConcurrentBag<double>();
|
||||
var betweenLineDistList = new ConcurrentBag<double>();
|
||||
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
// 1. Estimate in line and between line spacing
|
||||
Parallel.For(0, wordsList.Count, i =>
|
||||
Parallel.For(0, wordsList.Count, parallelOptions, i =>
|
||||
{
|
||||
var word = wordsList[i];
|
||||
|
||||
@@ -107,16 +141,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
if (withinLineDistance == null || betweenLineDistance == null)
|
||||
{
|
||||
return new[] {new TextBlock(new[] {new TextLine(wordsList)})};
|
||||
return new[] { new TextBlock(new[] { new TextLine(wordsList) }) };
|
||||
}
|
||||
|
||||
// 2. Find lines of text
|
||||
double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
|
||||
var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine).ToArray();
|
||||
var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine, maxDegreeOfParallelism).ToArray();
|
||||
|
||||
// 3. Find blocks of text
|
||||
double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
|
||||
var blocks = GetLinesGroups(lines, maxDistanceBetweenLine).ToList();
|
||||
var blocks = GetLinesGroups(lines, maxDistanceBetweenLine, maxDegreeOfParallelism).ToList();
|
||||
|
||||
// 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
|
||||
for (var b = 0; b < blocks.Count; b++)
|
||||
@@ -128,7 +162,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
// Merge all lines (words)
|
||||
blocks[b] = new TextBlock(GetLines(blocks[b].TextLines.SelectMany(l => l.Words).ToList(),
|
||||
double.MaxValue, withinLine).ToList());
|
||||
double.MaxValue, withinLine, maxDegreeOfParallelism).ToList());
|
||||
|
||||
for (var c = 0; c < blocks.Count; c++)
|
||||
{
|
||||
@@ -147,7 +181,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
// 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
|
||||
// same block. Filtering will still be done based on angle.
|
||||
// Merge all lines (words) sharing same bottom (baseline)
|
||||
var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine).ToList();
|
||||
var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList();
|
||||
blocks[b] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList());
|
||||
|
||||
// Remove
|
||||
@@ -207,7 +241,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
return finalDistanceMeasure(pointR, wordsWithinAngleBoundDistancePoints[closestWordIndex]);
|
||||
}
|
||||
|
||||
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
|
||||
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism)
|
||||
{
|
||||
TextDirection textDirection = words[0].TextDirection;
|
||||
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, Distances.Euclidean,
|
||||
@@ -220,7 +254,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
|
||||
|
||||
return (withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper);
|
||||
}).ToList();
|
||||
},
|
||||
maxDegreeOfParallelism).ToList();
|
||||
|
||||
Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
|
||||
if (textDirection == TextDirection.Rotate180)
|
||||
@@ -242,7 +277,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
}
|
||||
}
|
||||
|
||||
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
|
||||
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism)
|
||||
{
|
||||
/**************************************************************************************************
|
||||
* We want to measure the distance between two lines using the following method:
|
||||
@@ -269,7 +304,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
(pivot, candidate) => maxDist,
|
||||
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
|
||||
candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
|
||||
pivot => true, (pivot, candidate) => true).ToList();
|
||||
pivot => true, (pivot, candidate) => true,
|
||||
maxDegreeOfParallelism).ToList();
|
||||
|
||||
for (int a = 0; a < groupedIndexes.Count(); a++)
|
||||
{
|
||||
|
||||
@@ -16,7 +16,14 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <summary>
|
||||
/// Create an instance of Nearest Neighbour Word Extractor, <see cref="NearestNeighbourWordExtractor"/>.
|
||||
/// </summary>
|
||||
public static IWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
|
||||
public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the maximum number of concurrent tasks enabled. Default value is -1.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para>
|
||||
/// </summary>
|
||||
public int MaxDegreeOfParallelism { get; set; } = -1;
|
||||
|
||||
/// <summary>
|
||||
/// Gets the words.
|
||||
@@ -27,14 +34,14 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
List<Word> wordsH = GetWords(
|
||||
letters.Where(l => l.TextDirection == TextDirection.Horizontal),
|
||||
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2,
|
||||
Distances.Manhattan)
|
||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||
.OrderByDescending(x => x.BoundingBox.Bottom)
|
||||
.ThenBy(x => x.BoundingBox.Left).ToList();
|
||||
|
||||
List<Word> words180 = GetWords(
|
||||
letters.Where(l => l.TextDirection == TextDirection.Rotate180),
|
||||
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2,
|
||||
Distances.Manhattan)
|
||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||
.OrderBy(x => x.BoundingBox.Top)
|
||||
.ThenByDescending(x => x.BoundingBox.Right).ToList();
|
||||
wordsH.AddRange(words180);
|
||||
@@ -42,7 +49,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
List<Word> words90 = GetWords(
|
||||
letters.Where(l => l.TextDirection == TextDirection.Rotate90),
|
||||
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Height), (double)(l2.GlyphRectangle.Height)) * 0.2,
|
||||
Distances.Manhattan)
|
||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||
.OrderByDescending(x => x.BoundingBox.Left)
|
||||
.ThenBy(x => x.BoundingBox.Top).ToList();
|
||||
wordsH.AddRange(words90);
|
||||
@@ -50,7 +57,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
List<Word> words270 = GetWords(
|
||||
letters.Where(l => l.TextDirection == TextDirection.Rotate270),
|
||||
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Height), (double)(l2.GlyphRectangle.Height)) * 0.2,
|
||||
Distances.Manhattan)
|
||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||
.OrderBy(x => x.BoundingBox.Right)
|
||||
.ThenByDescending(x => x.BoundingBox.Bottom).ToList();
|
||||
wordsH.AddRange(words270);
|
||||
@@ -58,7 +65,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
List<Word> wordsU = GetWords(
|
||||
letters.Where(l => l.TextDirection == TextDirection.Unknown),
|
||||
(l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2,
|
||||
Distances.Manhattan)
|
||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||
.OrderByDescending(x => x.BoundingBox.Bottom)
|
||||
.ThenBy(x => x.BoundingBox.Left).ToList();
|
||||
wordsH.AddRange(wordsU);
|
||||
@@ -75,15 +82,19 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// e.g. Max(GlyphRectangle.Width) x 20%.</param>
|
||||
/// <param name="distMeasure">The distance measure between two start and end base line points,
|
||||
/// e.g. the Manhattan distance.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
private List<Word> GetWords(IEnumerable<Letter> pageLetters,
|
||||
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure)
|
||||
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
int maxDegreeOfParallelism)
|
||||
{
|
||||
if (pageLetters == null || pageLetters.Count() == 0) return new List<Word>();
|
||||
TextDirection textDirection = pageLetters.ElementAt(0).TextDirection;
|
||||
|
||||
if (pageLetters.Any(x => textDirection != x.TextDirection))
|
||||
{
|
||||
throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction.");
|
||||
throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction.");
|
||||
}
|
||||
|
||||
Func<IEnumerable<Letter>, IReadOnlyList<Letter>> orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();
|
||||
@@ -106,7 +117,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
distMeasure, maxDistanceFunction,
|
||||
l => l.EndBaseLine, l => l.StartBaseLine,
|
||||
l => !string.IsNullOrWhiteSpace(l.Value),
|
||||
(l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value)).ToList();
|
||||
(l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value),
|
||||
maxDegreeOfParallelism).ToList();
|
||||
|
||||
List<Word> words = new List<Word>();
|
||||
for (int a = 0; a < groupedIndexes.Count(); a++)
|
||||
|
||||
@@ -29,7 +29,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// </summary>
|
||||
/// <param name="pageWords">The words in the page.</param>
|
||||
/// <param name="minimumElements">The minimum number of elements to define a text edge.</param>
|
||||
public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4)
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4,
|
||||
int maxDegreeOfParallelism = -1)
|
||||
{
|
||||
if (minimumElements < 0)
|
||||
{
|
||||
@@ -40,7 +44,9 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
ConcurrentDictionary<EdgeType, List<PdfLine>> dictionary = new ConcurrentDictionary<EdgeType, List<PdfLine>>();
|
||||
|
||||
Parallel.ForEach(edgesFuncs, f =>
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
Parallel.ForEach(edgesFuncs, parallelOptions, f =>
|
||||
{
|
||||
dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
|
||||
});
|
||||
|
||||
@@ -0,0 +1,400 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.Content;
|
||||
using UglyToad.PdfPig.Geometry;
|
||||
using UglyToad.PdfPig.Util;
|
||||
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
/// <summary>
|
||||
/// A top-down algorithm that finds a cover of the background whitespace of a document in terms of maximal empty rectangles.
|
||||
/// <para>See Section 3.2 of 'High precision text extraction from PDF documents' by Øyvind Raddum Berg and Section 2 of 'Two geometric algorithms for layout analysis' by Thomas M. Breuel.</para>
|
||||
/// </summary>
|
||||
public static class WhitespaceCoverExtractor
|
||||
{
|
||||
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// <para>The minimum rectangle width and height are derived from the letters of the words:
/// 1.25 times the mode of the glyph rectangle widths and heights respectively.</para>
/// </summary>
/// <param name="words">The words in the page. The sequence is enumerated once.</param>
/// <param name="images">The images in the page.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="words"/> is null.</exception>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<Word> words, IEnumerable<IPdfImage> images = null, int maxRectangleCount = 40, int maxBoundQueueSize = 0)
{
    if (words == null)
    {
        throw new ArgumentNullException(nameof(words));
    }

    // Materialise once: the words are needed for the glyph statistics below and again
    // for the obstacles, and a lazy source would otherwise be re-evaluated each time.
    var wordList = words.ToList();
    var glyphRectangles = wordList.SelectMany(w => w.Letters).Select(l => l.GlyphRectangle).ToList();

    return GetWhitespaces(wordList,
        images,
        glyphRectangles.Select(r => r.Width).Mode() * 1.25m,
        glyphRectangles.Select(r => r.Height).Mode() * 1.25m,
        maxRectangleCount: maxRectangleCount,
        maxBoundQueueSize: maxBoundQueueSize);
}
|
||||
|
||||
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// <para>The bounding boxes of the words and, when supplied, the bounds of the images are used as obstacles.
/// Boxes whose width or height is not strictly positive are ignored.</para>
/// </summary>
/// <param name="words">The words in the page.</param>
/// <param name="images">The images in the page. May be null.</param>
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="words"/> is null.</exception>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<Word> words, IEnumerable<IPdfImage> images,
    decimal minWidth, decimal minHeight, int maxRectangleCount = 40, decimal whitespaceFuzziness = 0.15m, int maxBoundQueueSize = 0)
{
    if (words == null)
    {
        throw new ArgumentNullException(nameof(words));
    }

    var bboxes = words.Select(w => w.BoundingBox)
                      .Where(box => box.Width > 0 && box.Height > 0)
                      .ToList();

    // No separate emptiness check: AddRange over an empty sequence is a no-op, and
    // counting first would enumerate the images twice.
    if (images != null)
    {
        bboxes.AddRange(images.Select(i => i.Bounds)
                              .Where(box => box.Width > 0 && box.Height > 0));
    }

    return GetWhitespaces(bboxes,
        minWidth: minWidth,
        minHeight: minHeight,
        maxRectangleCount: maxRectangleCount,
        whitespaceFuzziness: whitespaceFuzziness,
        maxBoundQueueSize: maxBoundQueueSize);
}
|
||||
|
||||
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// <para>The search area is the smallest rectangle enclosing all the supplied bounding boxes.</para>
/// </summary>
/// <param name="boundingboxes">The list of obstacles' bounding boxes in the page. Enumerated once.</param>
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles; an empty list when there are no bounding boxes.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="boundingboxes"/> is null.</exception>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<PdfRectangle> boundingboxes,
    decimal minWidth, decimal minHeight, int maxRectangleCount = 40, decimal whitespaceFuzziness = 0.15m, int maxBoundQueueSize = 0)
{
    if (boundingboxes == null)
    {
        throw new ArgumentNullException(nameof(boundingboxes));
    }

    // Build the set first and test its Count, so the source is only enumerated once.
    var obstacles = new HashSet<PdfRectangle>(boundingboxes);
    if (obstacles.Count == 0)
    {
        return EmptyArray<PdfRectangle>.Instance;
    }

    var pageBound = GetBound(obstacles);

    return GetMaximalRectangles(pageBound,
        obstacles,
        minWidth: minWidth,
        minHeight: minHeight,
        maxRectangleCount: maxRectangleCount,
        whitespaceFuzziness: whitespaceFuzziness,
        maxBoundQueueSize: maxBoundQueueSize);
}
|
||||
|
||||
/// <summary>
/// Searches <paramref name="bound"/> for whitespace rectangles. Candidate bounds are taken from a sorted
/// queue (highest-ranked entry first); a candidate that is empty enough is accepted, otherwise it is split
/// into up to four sub-rectangles around a pivot obstacle and those are queued in turn.
/// </summary>
/// <param name="bound">The area to search, also used as the page border for the adjacency check.</param>
/// <param name="obstacles">The obstacles in the page. Accepted whitespace rectangles are added to this set.</param>
/// <param name="minWidth">Sub-rectangles must be strictly wider than this to be queued.</param>
/// <param name="minHeight">Sub-rectangles must be strictly taller than this to be queued.</param>
/// <param name="maxRectangleCount">The search stops as soon as this many rectangles have been accepted.</param>
/// <param name="whitespaceFuzziness">Passed to every <see cref="QueueEntry"/> created.</param>
/// <param name="maxBoundQueueSize">Passed to the <see cref="QueueEntries"/> queue as its maximum bound.</param>
/// <returns>The accepted whitespace rectangles.</returns>
private static IReadOnlyList<PdfRectangle> GetMaximalRectangles(PdfRectangle bound,
    HashSet<PdfRectangle> obstacles, decimal minWidth, decimal minHeight, int maxRectangleCount,
    decimal whitespaceFuzziness, int maxBoundQueueSize)
{
    QueueEntries queueEntries = new QueueEntries(maxBoundQueueSize);
    queueEntries.Enqueue(new QueueEntry(bound, obstacles, whitespaceFuzziness));

    HashSet<PdfRectangle> selected = new HashSet<PdfRectangle>();
    HashSet<QueueEntry> holdList = new HashSet<QueueEntry>();

    while (queueEntries.Count > 0)
    {
        var current = queueEntries.Dequeue();

        if (current.IsEmptyEnough(obstacles))
        {
            // Skip candidates that lie inside an already accepted rectangle.
            if (selected.Any(c => Inside(c, current.Bound)))
            {
                continue;
            }

            // A check was added which impeded the algorithm from accepting
            // rectangles which were not adjacent to an already accepted
            // rectangle, or to the border of the page.
            if (!IsAdjacentToPageBounds(bound, current.Bound) &&        // NOT in contact to border page AND
                !selected.Any(q => IsAdjacentTo(q, current.Bound)))     // NOT in contact to any already accepted rectangle
            {
                // In order to maintain the correctness of the algorithm,
                // rejected rectangles are put in a hold list.
                holdList.Add(current);
                continue;
            }

            selected.Add(current.Bound);

            if (selected.Count >= maxRectangleCount)
            {
                return selected.ToList();
            }

            obstacles.Add(current.Bound);

            // Each time a new rectangle is identified and accepted, this hold list
            // will be added back to the queue in case any of them will have become valid.
            foreach (var hold in holdList)
            {
                queueEntries.Enqueue(hold);
            }

            // After a maximal rectangle has been found, it is added back to the list
            // of obstacles. Whenever a QueueEntry is dequeued, its list of obstacles
            // can be recomputed to include newly identified whitespace rectangles.
            // NOTE(review): entries are modified while stored in the sorted set; if AddWhitespace
            // changes the value QueueEntry is ordered by, the set's ordering may become stale - confirm.
            foreach (var overlapping in queueEntries)
            {
                if (OverlapsHard(current.Bound, overlapping.Bound))
                {
                    overlapping.AddWhitespace(current.Bound);
                }
            }

            continue;
        }

        // Not empty enough: split the bound around the pivot obstacle. Each strip is only
        // queued when it is non-degenerate and strictly larger than the minimum dimensions,
        // and it inherits the obstacles of the current entry that overlap it.
        var pivot = current.GetPivot();
        var b = current.Bound;

        // Strip between the pivot's right edge and the bound's right edge.
        var rRight = new PdfRectangle(pivot.Right, b.Bottom, b.Right, b.Top);
        if (b.Right > pivot.Right && rRight.Height > minHeight && rRight.Width > minWidth)
        {
            queueEntries.Enqueue(new QueueEntry(rRight,
                new HashSet<PdfRectangle>(current.Obstacles.Where(o => OverlapsHard(rRight, o))),
                whitespaceFuzziness));
        }

        // Strip between the bound's left edge and the pivot's left edge.
        var rLeft = new PdfRectangle(b.Left, b.Bottom, pivot.Left, b.Top);
        if (b.Left < pivot.Left && rLeft.Height > minHeight && rLeft.Width > minWidth)
        {
            queueEntries.Enqueue(new QueueEntry(rLeft,
                new HashSet<PdfRectangle>(current.Obstacles.Where(o => OverlapsHard(rLeft, o))),
                whitespaceFuzziness));
        }

        // Strip between the bound's bottom edge and the pivot's bottom edge.
        var rBottom = new PdfRectangle(b.Left, b.Bottom, b.Right, pivot.Bottom);
        if (b.Bottom < pivot.Bottom && rBottom.Height > minHeight && rBottom.Width > minWidth)
        {
            queueEntries.Enqueue(new QueueEntry(rBottom,
                new HashSet<PdfRectangle>(current.Obstacles.Where(o => OverlapsHard(rBottom, o))),
                whitespaceFuzziness));
        }

        // Strip between the pivot's top edge and the bound's top edge.
        var rTop = new PdfRectangle(b.Left, pivot.Top, b.Right, b.Top);
        if (b.Top > pivot.Top && rTop.Height > minHeight && rTop.Width > minWidth)
        {
            queueEntries.Enqueue(new QueueEntry(rTop,
                new HashSet<PdfRectangle>(current.Obstacles.Where(o => OverlapsHard(rTop, o))),
                whitespaceFuzziness));
        }
    }

    return selected.ToList();
}
|
||||
|
||||
/// <summary>
/// Checks whether two rectangles are in contact along an edge: they must not be separated
/// horizontally or vertically, and one edge coordinate of the first must equal the opposite
/// edge coordinate of the second.
/// </summary>
private static bool IsAdjacentTo(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    // A gap on either axis means the rectangles cannot touch.
    bool separated = rectangle1.Left > rectangle2.Right
                  || rectangle2.Left > rectangle1.Right
                  || rectangle1.Top < rectangle2.Bottom
                  || rectangle2.Top < rectangle1.Bottom;

    if (separated)
    {
        return false;
    }

    // Opposite edges share the same coordinate.
    return rectangle1.Left == rectangle2.Right
        || rectangle1.Right == rectangle2.Left
        || rectangle1.Bottom == rectangle2.Top
        || rectangle1.Top == rectangle2.Bottom;
}
|
||||
|
||||
/// <summary>
/// Checks whether at least one edge of <paramref name="rectangle"/> has the same coordinate
/// as the corresponding edge of <paramref name="pageBound"/>.
/// </summary>
private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle)
{
    return rectangle.Bottom == pageBound.Bottom
        || rectangle.Top == pageBound.Top
        || rectangle.Left == pageBound.Left
        || rectangle.Right == pageBound.Right;
}
|
||||
|
||||
/// <summary>
/// Tells whether two rectangles overlap with a non-empty interior. Rectangles that
/// merely share an edge or a corner are not considered overlapping.
/// </summary>
/// <param name="rectangle1">The first rectangle.</param>
/// <param name="rectangle2">The second rectangle.</param>
/// <returns><c>true</c> if the rectangles strictly overlap on both axes; otherwise <c>false</c>.</returns>
private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    // Touching edges count as disjoint here, hence the inclusive comparisons.
    bool disjoint = rectangle1.Left >= rectangle2.Right
                    || rectangle2.Left >= rectangle1.Right
                    || rectangle1.Top <= rectangle2.Bottom
                    || rectangle2.Top <= rectangle1.Bottom;

    return !disjoint;
}
|
||||
|
||||
/// <summary>
/// Tells whether <paramref name="rectangle2"/> lies within <paramref name="rectangle1"/>.
/// Coinciding edges are allowed.
/// </summary>
/// <param name="rectangle1">The containing rectangle.</param>
/// <param name="rectangle2">The rectangle tested for containment.</param>
/// <returns><c>true</c> if <paramref name="rectangle2"/> does not extend past any edge of <paramref name="rectangle1"/>.</returns>
private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    bool withinHorizontally = rectangle2.Right <= rectangle1.Right
                              && rectangle2.Left >= rectangle1.Left;

    bool withinVertically = rectangle2.Top <= rectangle1.Top
                            && rectangle2.Bottom >= rectangle1.Bottom;

    return withinHorizontally && withinVertically;
}
|
||||
|
||||
/// <summary>
/// Computes the smallest rectangle that encloses every rectangle in <paramref name="obstacles"/>.
/// The sequence is enumerated exactly once.
/// </summary>
/// <param name="obstacles">The rectangles to enclose. Must contain at least one element.</param>
/// <returns>The rectangle spanning the minimum left/bottom and maximum right/top of all obstacles.</returns>
/// <exception cref="ArgumentNullException"><paramref name="obstacles"/> is null.</exception>
/// <exception cref="InvalidOperationException"><paramref name="obstacles"/> is empty.</exception>
private static PdfRectangle GetBound(IEnumerable<PdfRectangle> obstacles)
{
    if (obstacles == null)
    {
        throw new ArgumentNullException(nameof(obstacles));
    }

    using (var enumerator = obstacles.GetEnumerator())
    {
        if (!enumerator.MoveNext())
        {
            // Same exception type the LINQ Min/Max aggregates raise on an empty sequence.
            throw new InvalidOperationException("Sequence contains no elements");
        }

        var first = enumerator.Current;
        var left = first.Left;
        var bottom = first.Bottom;
        var right = first.Right;
        var top = first.Top;

        while (enumerator.MoveNext())
        {
            var obstacle = enumerator.Current;

            if (obstacle.Left < left)
            {
                left = obstacle.Left;
            }

            if (obstacle.Bottom < bottom)
            {
                bottom = obstacle.Bottom;
            }

            if (obstacle.Right > right)
            {
                right = obstacle.Right;
            }

            if (obstacle.Top > top)
            {
                top = obstacle.Top;
            }
        }

        return new PdfRectangle(left, bottom, right, top);
    }
}
|
||||
|
||||
#region Sorted Queue
|
||||
/// <summary>
/// A priority queue of <see cref="QueueEntry"/> built on a sorted set: the entry that
/// compares greatest is dequeued first, and the size can optionally be capped.
/// </summary>
private class QueueEntries : SortedSet<QueueEntry>
{
    // Maximum number of entries to keep; zero or a negative value means unbounded.
    private readonly int bound;

    /// <summary>
    /// Creates an empty queue.
    /// </summary>
    /// <param name="maximumBound">Maximum number of entries kept in the queue. A value of zero or less disables the limit.</param>
    public QueueEntries(int maximumBound)
    {
        bound = maximumBound;
    }

    /// <summary>
    /// Removes and returns the entry that compares greatest.
    /// </summary>
    /// <returns>The greatest entry, or <c>null</c> when the queue is empty.</returns>
    public QueueEntry Dequeue()
    {
        var current = Max;
        Remove(current);
        return current;
    }

    /// <summary>
    /// Adds an entry, then evicts the smallest entries until the queue respects its bound.
    /// </summary>
    /// <param name="queueEntry">The entry to add.</param>
    public void Enqueue(QueueEntry queueEntry)
    {
        Add(queueEntry);

        // Trim after adding so the queue never exceeds the bound and the incoming
        // entry competes with the existing ones for a place.
        while (bound > 0 && Count > bound)
        {
            Remove(Min);
        }
    }
}
|
||||
|
||||
/// <summary>
/// A candidate whitespace rectangle, the obstacles associated with it, and the quality
/// score used to order candidates in the queue.
/// </summary>
private class QueueEntry : IComparable<QueueEntry>
{
    // Fraction of an area that obstacles may cover before the bound stops being "empty enough".
    private readonly decimal whitespaceFuzziness;

    /// <summary>
    /// The candidate rectangle.
    /// </summary>
    public PdfRectangle Bound { get; private set; }

    /// <summary>
    /// The score of <see cref="Bound"/>, computed once at construction.
    /// </summary>
    public decimal Quality { get; private set; }

    /// <summary>
    /// The obstacles associated with this candidate.
    /// </summary>
    public HashSet<PdfRectangle> Obstacles { get; private set; }

    /// <summary>
    /// Creates a queue entry and computes its quality from <paramref name="bound"/>.
    /// </summary>
    /// <param name="bound">The candidate rectangle.</param>
    /// <param name="obstacles">The obstacles associated with the candidate.</param>
    /// <param name="whitespaceFuzziness">Fraction of area that obstacles may overlap while the candidate is still treated as empty.</param>
    public QueueEntry(PdfRectangle bound, HashSet<PdfRectangle> obstacles, decimal whitespaceFuzziness)
    {
        Bound = bound;
        Quality = ScoringFunction(Bound);
        Obstacles = obstacles;
        this.whitespaceFuzziness = whitespaceFuzziness;
    }

    /// <summary>
    /// Picks the obstacle whose centroid is nearest to the centroid of <see cref="Bound"/>.
    /// </summary>
    /// <returns>The nearest obstacle, or the first obstacle when the search reports no match (-1).</returns>
    public PdfRectangle GetPivot()
    {
        // Materialise once so the index returned by the search refers to a stable ordering.
        var candidates = Obstacles.ToList();

        int indexMiddle = Distances.FindIndexNearest(Bound.Centroid,
            candidates.Select(o => o.Centroid).ToList(),
            p => p, p => p, Distances.Euclidean, out double _);

        return indexMiddle == -1 ? candidates.First() : candidates[indexMiddle];
    }

    /// <summary>
    /// Tells whether this entry has no obstacles at all.
    /// </summary>
    public bool IsEmptyEnough()
    {
        return !Obstacles.Any();
    }

    /// <summary>
    /// Tells whether <see cref="Bound"/> is free of obstacles, or only slightly overlapped by them.
    /// </summary>
    /// <param name="pageObstacles">The obstacles to test against the bound.</param>
    /// <returns>
    /// <c>true</c> if the entry has no obstacles, or if every intersecting obstacle overlaps
    /// the bound by no more than the fuzziness share of the smaller of the two areas and the
    /// total overlap stays below the fuzziness share of the bound's area.
    /// </returns>
    public bool IsEmptyEnough(IEnumerable<PdfRectangle> pageObstacles)
    {
        if (IsEmptyEnough())
        {
            return true;
        }

        decimal sum = 0;
        foreach (var obstacle in pageObstacles)
        {
            var intersect = Bound.Intersect(obstacle);
            if (!intersect.HasValue)
            {
                // An obstacle that does not intersect the bound cannot make it less empty.
                continue;
            }

            decimal overlap = intersect.Value.Area;
            decimal minimumArea = MinimumOverlappingArea(obstacle, Bound, whitespaceFuzziness);

            if (overlap > minimumArea)
            {
                return false;
            }

            sum += overlap;
        }

        return sum < Bound.Area * whitespaceFuzziness;
    }

    /// <summary>
    /// Returns a short description with the quality, the obstacle count and the bound.
    /// </summary>
    public override string ToString()
    {
        return "Q=" + Quality.ToString("#0.0") + ", O=" + Obstacles.Count + ", " + Bound.ToString();
    }

    /// <summary>
    /// Adds a rectangle to the obstacles of this entry.
    /// </summary>
    /// <param name="rectangle">The rectangle to add.</param>
    public void AddWhitespace(PdfRectangle rectangle)
    {
        Obstacles.Add(rectangle);
    }

    /// <summary>
    /// Orders entries by quality, breaking ties on the bound's coordinates.
    /// </summary>
    /// <param name="entry">The entry to compare with.</param>
    /// <returns>
    /// A negative value, zero or a positive value when this entry sorts before, equal to or
    /// after <paramref name="entry"/>. Zero is returned only when quality and all four bound
    /// coordinates match. Any entry sorts after <c>null</c>.
    /// </returns>
    public int CompareTo(QueueEntry entry)
    {
        if (entry == null)
        {
            return 1;
        }

        int result = Quality.CompareTo(entry.Quality);
        if (result != 0)
        {
            return result;
        }

        // A sorted set treats a comparison result of zero as a duplicate and refuses to add it.
        // Tie-break on the bound so distinct rectangles with the same score are both kept.
        result = Bound.Left.CompareTo(entry.Bound.Left);
        if (result != 0)
        {
            return result;
        }

        result = Bound.Bottom.CompareTo(entry.Bound.Bottom);
        if (result != 0)
        {
            return result;
        }

        result = Bound.Right.CompareTo(entry.Bound.Right);
        if (result != 0)
        {
            return result;
        }

        return Bound.Top.CompareTo(entry.Bound.Top);
    }

    /// <summary>
    /// Two entries are equal when their bounds have the same coordinates and they hold the
    /// same obstacle set instance.
    /// </summary>
    public override bool Equals(object obj)
    {
        if (obj is QueueEntry entry)
        {
            return Bound.Left == entry.Bound.Left &&
                   Bound.Right == entry.Bound.Right &&
                   Bound.Top == entry.Bound.Top &&
                   Bound.Bottom == entry.Bound.Bottom &&
                   ReferenceEquals(Obstacles, entry.Obstacles);
        }

        return false;
    }

    /// <summary>
    /// Hash code built from the same members that <see cref="Equals(object)"/> compares.
    /// </summary>
    public override int GetHashCode()
    {
        return (Bound.Left, Bound.Right,
                Bound.Top, Bound.Bottom,
                Obstacles).GetHashCode();
    }

    /// <summary>
    /// The largest overlap tolerated between two rectangles: the fuzziness share of the
    /// smaller of the two areas.
    /// </summary>
    private static decimal MinimumOverlappingArea(PdfRectangle r1, PdfRectangle r2, decimal whitespaceFuzziness)
    {
        return Math.Min(r1.Area, r2.Area) * whitespaceFuzziness;
    }

    /// <summary>
    /// The scoring function Q(r) which is subsequently used to sort a priority queue.
    /// </summary>
    /// <param name="rectangle">The rectangle to score.</param>
    /// <returns>The rectangle's area multiplied by a quarter of its height.</returns>
    private static decimal ScoringFunction(PdfRectangle rectangle)
    {
        // As can be seen, tall rectangles are preferred. The trick while choosing this Q(r) was
        // to keep that preference while still allowing wide rectangles to be chosen. After having
        // experimented with quite a few variations, this simple function was considered a good
        // solution.
        return rectangle.Area * (rectangle.Height / 4m);
    }

    /// <summary>
    /// Area of the intersection of two rectangles, or zero when they do not intersect.
    /// </summary>
    private static decimal OverlappingArea(PdfRectangle rectangle1, PdfRectangle rectangle2)
    {
        var intersect = rectangle1.Intersect(rectangle2);
        if (intersect.HasValue)
        {
            return intersect.Value.Area;
        }

        return 0;
    }
}
|
||||
#endregion
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user