NearestNeighbourWordExtractor:

- Improve results by using PointSize
- Expose 'filterFunction' as a parameter of the public GetWords() overload for ad hoc use
- Allow letters with different text directions (remove the mixed text direction check)

Make Letter.PointSize public and add a warning that it is experimental (needed for NearestNeighbourWordExtractor)
Remove Page.GetPointSize(Letter letter)
This commit is contained in:
BobLd
2020-04-20 13:09:35 +01:00
parent 8eb50517dd
commit 0a6ec3946b
4 changed files with 36 additions and 42 deletions

View File

@@ -107,8 +107,8 @@
return false; return false;
} }
var ptSizePrevious = (int)Math.Round(page.ExperimentalAccess.GetPointSize(previous)); var ptSizePrevious = (int)Math.Round(previous.PointSize);
var ptSize = (int)Math.Round(page.ExperimentalAccess.GetPointSize(letter)); var ptSize = (int)Math.Round(letter.PointSize);
var minPtSize = ptSize < ptSizePrevious ? ptSize : ptSizePrevious; var minPtSize = ptSize < ptSizePrevious ? ptSize : ptSizePrevious;
var gap = Math.Abs(previous.StartBaseLine.Y - letter.StartBaseLine.Y); var gap = Math.Abs(previous.StartBaseLine.Y - letter.StartBaseLine.Y);

View File

@@ -8,7 +8,7 @@
using Util; using Util;
/// <summary> /// <summary>
/// Nearest Neighbour Word Extractor, using the <see cref="Distances.Manhattan"/> distance. /// Nearest Neighbour Word Extractor.
/// This implementation leverages bounding boxes. /// This implementation leverages bounding boxes.
/// </summary> /// </summary>
public class NearestNeighbourWordExtractor : IWordExtractor public class NearestNeighbourWordExtractor : IWordExtractor
@@ -26,55 +26,61 @@
public int MaxDegreeOfParallelism { get; set; } = -1; public int MaxDegreeOfParallelism { get; set; } = -1;
/// <summary> /// <summary>
/// Gets the words. /// Gets the words, using the <see cref="Distances.Manhattan"/> distance.
/// </summary> /// </summary>
/// <param name="letters">The letters in the page.</param> /// <param name="letters">The letters in the page.</param>
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters) public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
{ {
double baseMaxFunc(Letter l1, Letter l2) double maxDistFunc(Letter l1, Letter l2)
{ {
return Math.Max(Math.Max(Math.Max( return Math.Max(Math.Max(Math.Max(Math.Max(Math.Max(
Math.Abs(l1.GlyphRectangle.Width), Math.Abs(l1.GlyphRectangle.Width),
Math.Abs(l2.GlyphRectangle.Width)), Math.Abs(l2.GlyphRectangle.Width)),
Math.Abs(l1.Width)), Math.Abs(l1.Width)),
Math.Abs(l2.Width)); Math.Abs(l2.Width)),
l1.PointSize), l2.PointSize) * 0.2;
}
bool filterFunc(Letter l1, Letter l2)
{
return !string.IsNullOrWhiteSpace(l2.Value);
} }
List<Word> wordsH = GetWords( List<Word> wordsH = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Horizontal).ToList(), letters.Where(l => l.TextDirection == TextDirection.Horizontal).ToList(),
(l1, l2) => baseMaxFunc(l1, l2) * 0.2, (l1, l2) => maxDistFunc(l1, l2),
Distances.Manhattan, MaxDegreeOfParallelism) Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Bottom) .OrderByDescending(x => x.BoundingBox.Bottom)
.ThenBy(x => x.BoundingBox.Left).ToList(); .ThenBy(x => x.BoundingBox.Left).ToList();
var words270 = GetWords( var words270 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate270).ToList(), letters.Where(l => l.TextDirection == TextDirection.Rotate270).ToList(),
(l1, l2) => baseMaxFunc(l1, l2) * 0.2, (l1, l2) => maxDistFunc(l1, l2),
Distances.Manhattan, MaxDegreeOfParallelism) Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderBy(x => x.BoundingBox.Right) .OrderBy(x => x.BoundingBox.Right)
.ThenByDescending(x => x.BoundingBox.Bottom); .ThenByDescending(x => x.BoundingBox.Bottom);
wordsH.AddRange(words270); wordsH.AddRange(words270);
var words180 = GetWords( var words180 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate180).ToList(), letters.Where(l => l.TextDirection == TextDirection.Rotate180).ToList(),
(l1, l2) => baseMaxFunc(l1, l2) * 0.2, (l1, l2) => maxDistFunc(l1, l2),
Distances.Manhattan, MaxDegreeOfParallelism) Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderBy(x => x.BoundingBox.Top) .OrderBy(x => x.BoundingBox.Top)
.ThenByDescending(x => x.BoundingBox.Right); .ThenByDescending(x => x.BoundingBox.Right);
wordsH.AddRange(words180); wordsH.AddRange(words180);
var words90 = GetWords( var words90 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate90).ToList(), letters.Where(l => l.TextDirection == TextDirection.Rotate90).ToList(),
(l1, l2) => baseMaxFunc(l1, l2) * 0.2, (l1, l2) => maxDistFunc(l1, l2),
Distances.Manhattan, MaxDegreeOfParallelism) Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Left) .OrderByDescending(x => x.BoundingBox.Left)
.ThenBy(x => x.BoundingBox.Top); .ThenBy(x => x.BoundingBox.Top);
wordsH.AddRange(words90); wordsH.AddRange(words90);
var wordsU = GetWords( var wordsU = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Other).ToList(), letters.Where(l => l.TextDirection == TextDirection.Other).ToList(),
(l1, l2) => baseMaxFunc(l1, l2) * 0.5, (l1, l2) => maxDistFunc(l1, l2) * 2.0, // allow twice the distance for oriented text
Distances.Euclidean, MaxDegreeOfParallelism) Distances.Euclidean, filterFunc, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Bottom) .OrderByDescending(x => x.BoundingBox.Bottom)
.ThenBy(x => x.BoundingBox.Left); .ThenBy(x => x.BoundingBox.Left);
wordsH.AddRange(wordsU); wordsH.AddRange(wordsU);
@@ -83,34 +89,30 @@
} }
/// <summary> /// <summary>
/// Private method to get the words. /// Gets the words.
/// </summary> /// </summary>
/// <param name="pageLetters">The letters in the page, they must have /// <param name="pageLetters">The letters in the page.</param>
/// the same text directions.</param> /// <param name="maxDistanceFunction">The function that determines the maximum distance between two letters (start and end base line points),
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two Letters, /// e.g. Max(GlyphRectangle.Width) x 20%.
/// e.g. Max(GlyphRectangle.Width) x 20%.</param> /// <para>If the distance between the two letters is greater, a new word will be created.</para></param>
/// <param name="distMeasure">The distance measure between two start and end base line points, /// <param name="distMeasure">The distance measure between two letters (start and end base line points),
/// e.g. the Manhattan distance.</param> /// e.g. the Manhattan distance.</param>
/// <param name="filterFunction">Function used to filter out connection between letters, e.g. check if the letters have the same color.
/// <para>If the function returns false, a new word will be created.</para></param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled. /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value. /// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param> /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public List<Word> GetWords(IReadOnlyList<Letter> pageLetters, public List<Word> GetWords(IReadOnlyList<Letter> pageLetters,
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure, Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
int maxDegreeOfParallelism) Func<Letter, Letter, bool> filterFunction, int maxDegreeOfParallelism)
{ {
if (pageLetters == null || pageLetters.Count == 0) return new List<Word>(); if (pageLetters == null || pageLetters.Count == 0) return new List<Word>();
TextDirection textDirection = pageLetters[0].TextDirection;
if (pageLetters.Any(x => textDirection != x.TextDirection))
{
throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction.");
}
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(pageLetters, var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(pageLetters,
distMeasure, maxDistanceFunction, distMeasure, maxDistanceFunction,
l => l.EndBaseLine, l => l.StartBaseLine, l => l.EndBaseLine, l => l.StartBaseLine,
l => !string.IsNullOrWhiteSpace(l.Value), l => !string.IsNullOrWhiteSpace(l.Value),
(l1, l2) => !string.IsNullOrWhiteSpace(l2.Value), filterFunction,
maxDegreeOfParallelism).ToList(); maxDegreeOfParallelism).ToList();
List<Word> words = new List<Word>(); List<Word> words = new List<Word>();

View File

@@ -61,9 +61,10 @@
public IColor Color { get; } public IColor Color { get; }
/// <summary> /// <summary>
/// The size of the font in points. This is not ready for public consumption as the calculation is incorrect. /// The size of the font in points.
/// <para>This is considered experimental because the calculated value is incorrect for some documents at present.</para>
/// </summary> /// </summary>
internal double PointSize { get; } public double PointSize { get; }
/// <summary> /// <summary>
/// Sequence number of the ShowText operation that printed this letter. /// Sequence number of the ShowText operation that printed this letter.

View File

@@ -191,15 +191,6 @@
{ {
return annotationProvider.GetAnnotations(); return annotationProvider.GetAnnotations();
} }
/// <summary>
/// Gets the calculated letter size in points.
/// This is considered experimental because the calculated value is incorrect for some documents at present.
/// </summary>
public double GetPointSize(Letter letter)
{
return letter.PointSize;
}
} }
} }
} }