mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-18 18:27:55 +08:00
update NearestNeighbourWordExtractor to use DlaOptions, stop ordering words
This commit is contained in:
@@ -19,109 +19,171 @@
|
||||
public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the maximum number of concurrent tasks enabled. Default value is -1.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para>
|
||||
/// Gets the words using the default option values.
|
||||
/// </summary>
|
||||
public int MaxDegreeOfParallelism { get; set; } = -1;
|
||||
|
||||
/// <summary>
|
||||
/// Gets the words, using the <see cref="Distances.Manhattan"/> distance.
|
||||
/// </summary>
|
||||
/// <param name="letters">The letters in the page.</param>
|
||||
/// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param>
|
||||
/// <returns>The <see cref="Word"/>s generated by the nearest neighbour method.</returns>
|
||||
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
|
||||
{
|
||||
double maxDistFunc(Letter l1, Letter l2)
|
||||
return GetWords(letters, new NearestNeighbourWordExtractorOptions());
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the words using the provided option values.
|
||||
/// </summary>
|
||||
/// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param>
|
||||
/// <param name="options">The <see cref="NearestNeighbourWordExtractorOptions"/> to use.</param>
|
||||
/// <returns>The <see cref="Word"/>s generated by the nearest neighbour method.</returns>
|
||||
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters, DlaOptions options)
|
||||
{
|
||||
if (options is NearestNeighbourWordExtractorOptions nnOptions)
|
||||
{
|
||||
return Math.Max(Math.Max(Math.Max(Math.Max(Math.Max(
|
||||
Math.Abs(l1.GlyphRectangle.Width),
|
||||
Math.Abs(l2.GlyphRectangle.Width)),
|
||||
Math.Abs(l1.Width)),
|
||||
Math.Abs(l2.Width)),
|
||||
l1.PointSize), l2.PointSize) * 0.2;
|
||||
}
|
||||
if (letters == null || letters.Count == 0)
|
||||
{
|
||||
return EmptyArray<Word>.Instance;
|
||||
}
|
||||
|
||||
bool filterFunc(Letter l1, Letter l2)
|
||||
if (nnOptions.GroupByOrientation)
|
||||
{
|
||||
// axis aligned
|
||||
List<Word> words = GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism);
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
|
||||
|
||||
// not axis aligned
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
|
||||
|
||||
return words;
|
||||
}
|
||||
else
|
||||
{
|
||||
return GetWords(letters,
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return !string.IsNullOrWhiteSpace(l2.Value);
|
||||
throw new ArgumentException("Options provided must be of type " + nameof(NearestNeighbourWordExtractorOptions) + ".", nameof(options));
|
||||
}
|
||||
|
||||
List<Word> wordsH = GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
|
||||
(l1, l2) => maxDistFunc(l1, l2),
|
||||
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
|
||||
.OrderByDescending(x => x.BoundingBox.Bottom)
|
||||
.ThenBy(x => x.BoundingBox.Left).ToList();
|
||||
|
||||
var words270 = GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
|
||||
(l1, l2) => maxDistFunc(l1, l2),
|
||||
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
|
||||
.OrderBy(x => x.BoundingBox.Right)
|
||||
.ThenByDescending(x => x.BoundingBox.Bottom);
|
||||
wordsH.AddRange(words270);
|
||||
|
||||
var words180 = GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
|
||||
(l1, l2) => maxDistFunc(l1, l2),
|
||||
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
|
||||
.OrderBy(x => x.BoundingBox.Top)
|
||||
.ThenByDescending(x => x.BoundingBox.Right);
|
||||
wordsH.AddRange(words180);
|
||||
|
||||
var words90 = GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
|
||||
(l1, l2) => maxDistFunc(l1, l2),
|
||||
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
|
||||
.OrderByDescending(x => x.BoundingBox.Left)
|
||||
.ThenBy(x => x.BoundingBox.Top);
|
||||
wordsH.AddRange(words90);
|
||||
|
||||
var wordsU = GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
|
||||
(l1, l2) => maxDistFunc(l1, l2) * 2.0, // allow twice the distance for oriented text
|
||||
Distances.Euclidean, filterFunc, MaxDegreeOfParallelism)
|
||||
.OrderByDescending(x => x.BoundingBox.Bottom)
|
||||
.ThenBy(x => x.BoundingBox.Left);
|
||||
wordsH.AddRange(wordsU);
|
||||
|
||||
return wordsH;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the words.
|
||||
/// </summary>
|
||||
/// <param name="pageLetters">The letters in the page.</param>
|
||||
/// <param name="letters">The letters in the page.</param>
|
||||
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two letters (start and end base line points),
|
||||
/// e.g. Max(GlyphRectangle.Width) x 20%.
|
||||
/// e.g. Max(GlyphRectangle.Width) x 20%.
|
||||
/// <para>If the distance between the two letters is greater, a new word will be created.</para></param>
|
||||
/// <param name="distMeasure">The distance measure between two letters (start and end base line points),
|
||||
/// e.g. the Manhattan distance.</param>
|
||||
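/// <param name="filterPivotFunction">Function used prior to searching for a letter's nearest neighbour. If it returns false, no search is done for that letter.</param>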
|
||||
/// <param name="filterFunction">Function used to filter out connections between letters, e.g. check if the letters have the same color.
|
||||
/// <para>If the function returns false, a new word will be created.</para></param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
public List<Word> GetWords(IReadOnlyList<Letter> pageLetters,
|
||||
private List<Word> GetWords(IReadOnlyList<Letter> letters,
|
||||
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
Func<Letter, bool> filterPivotFunction,
|
||||
Func<Letter, Letter, bool> filterFunction, int maxDegreeOfParallelism)
|
||||
{
|
||||
if (pageLetters == null || pageLetters.Count == 0) return new List<Word>();
|
||||
if (letters == null || letters.Count == 0) return new List<Word>();
|
||||
|
||||
var groupedIndexes = Clustering.NearestNeighbours(pageLetters,
|
||||
var groupedLetters = Clustering.NearestNeighbours(letters,
|
||||
distMeasure, maxDistanceFunction,
|
||||
l => l.EndBaseLine, l => l.StartBaseLine,
|
||||
l => !string.IsNullOrWhiteSpace(l.Value),
|
||||
filterPivotFunction,
|
||||
filterFunction,
|
||||
maxDegreeOfParallelism).ToList();
|
||||
|
||||
List<Word> words = new List<Word>();
|
||||
for (int a = 0; a < groupedIndexes.Count; a++)
|
||||
foreach (var g in groupedLetters)
|
||||
{
|
||||
words.Add(new Word(groupedIndexes[a]));
|
||||
words.Add(new Word(g));
|
||||
}
|
||||
|
||||
return words;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Nearest neighbour word extractor options.
|
||||
/// </summary>
|
||||
public class NearestNeighbourWordExtractorOptions : DlaOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// The maximum distance between two letters (start and end base line points) within the same word, as a function of the two letters.
|
||||
/// If the distance between the two letters is greater than this maximum, they will belong to different words.
|
||||
/// <para>Default value is 20% of the largest of the absolute glyph rectangle width, the absolute width and the point size of both letters. If the <see cref="TextOrientation"/> of either letter is Other, this distance is doubled.</para>
|
||||
/// </summary>
|
||||
public Func<Letter, Letter, double> MaximumDistance { get; set; } = (l1, l2) =>
|
||||
{
|
||||
double maxDist = Math.Max(Math.Max(Math.Max(Math.Max(Math.Max(
|
||||
Math.Abs(l1.GlyphRectangle.Width),
|
||||
Math.Abs(l2.GlyphRectangle.Width)),
|
||||
Math.Abs(l1.Width)),
|
||||
Math.Abs(l2.Width)),
|
||||
l1.PointSize), l2.PointSize) * 0.2;
|
||||
|
||||
if (l1.TextOrientation == TextOrientation.Other || l2.TextOrientation == TextOrientation.Other)
|
||||
{
|
||||
return 2.0 * maxDist;
|
||||
}
|
||||
return maxDist;
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// The default distance measure used between two letters (start and end base line points).
|
||||
/// <para>Default value is the Euclidean distance.</para>
|
||||
/// </summary>
|
||||
public Func<PdfPoint, PdfPoint, double> DistanceMeasure { get; set; } = Distances.Euclidean;
|
||||
|
||||
/// <summary>
|
||||
/// The distance measure used between two letters (start and end base line points) with axis aligned <see cref="TextOrientation"/>.
|
||||
/// Only used if GroupByOrientation is set to true.
|
||||
/// <para>Default value is the Manhattan distance.</para>
|
||||
/// </summary>
|
||||
public Func<PdfPoint, PdfPoint, double> DistanceMeasureAA { get; set; } = Distances.Manhattan;
|
||||
|
||||
/// <summary>
|
||||
/// Function used to filter out connections between letters, e.g. check if the letters have the same color.
|
||||
/// If the function returns false, letters will belong to different words.
|
||||
/// <para>The default function checks whether the neighbour is white space; if it is, the function returns false.</para>
|
||||
/// </summary>
|
||||
public Func<Letter, Letter, bool> Filter { get; set; } = (_, l2) => !string.IsNullOrWhiteSpace(l2.Value);
|
||||
|
||||
/// <summary>
|
||||
/// Function used prior to searching for the nearest neighbour. If it returns false, no search will be done.
|
||||
/// <para>The default function checks whether the current letter is white space; if it is, the function returns false and no search is done.</para>
|
||||
/// </summary>
|
||||
public Func<Letter, bool> FilterPivot { get; set; } = l => !string.IsNullOrWhiteSpace(l.Value);
|
||||
|
||||
/// <summary>
|
||||
/// If true, letters will be grouped by <see cref="TextOrientation"/> before processing.
|
||||
/// The DistanceMeasureAA will be used on axis aligned letters, and the DistanceMeasure on others.
|
||||
/// If false, DistanceMeasure will be used for all letters and DistanceMeasureAA won't be used.
|
||||
/// <para>Default value is true.</para>
|
||||
/// </summary>
|
||||
public bool GroupByOrientation { get; set; } = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user