mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 19:54:52 +08:00
NearestNeighbourWordExtractor:
- Improve results by using PointSize
- Make 'filterFunction' public for ad hoc GetWords()
- Allow text in different directions

Make Letter.PointSize public and add warning (needed for NNWordExtractor)

Remove Page.GetPointSize(Letter letter)
This commit is contained in:
@@ -107,8 +107,8 @@
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
var ptSizePrevious = (int)Math.Round(page.ExperimentalAccess.GetPointSize(previous));
|
var ptSizePrevious = (int)Math.Round(previous.PointSize);
|
||||||
var ptSize = (int)Math.Round(page.ExperimentalAccess.GetPointSize(letter));
|
var ptSize = (int)Math.Round(letter.PointSize);
|
||||||
var minPtSize = ptSize < ptSizePrevious ? ptSize : ptSizePrevious;
|
var minPtSize = ptSize < ptSizePrevious ? ptSize : ptSizePrevious;
|
||||||
|
|
||||||
var gap = Math.Abs(previous.StartBaseLine.Y - letter.StartBaseLine.Y);
|
var gap = Math.Abs(previous.StartBaseLine.Y - letter.StartBaseLine.Y);
|
||||||
|
@@ -8,7 +8,7 @@
|
|||||||
using Util;
|
using Util;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Nearest Neighbour Word Extractor, using the <see cref="Distances.Manhattan"/> distance.
|
/// Nearest Neighbour Word Extractor.
|
||||||
/// This implementation leverages bounding boxes.
|
/// This implementation leverages bounding boxes.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public class NearestNeighbourWordExtractor : IWordExtractor
|
public class NearestNeighbourWordExtractor : IWordExtractor
|
||||||
@@ -26,55 +26,61 @@
|
|||||||
public int MaxDegreeOfParallelism { get; set; } = -1;
|
public int MaxDegreeOfParallelism { get; set; } = -1;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Gets the words.
|
/// Gets the words, using the <see cref="Distances.Manhattan"/> distance.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="letters">The letters in the page.</param>
|
/// <param name="letters">The letters in the page.</param>
|
||||||
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
|
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
|
||||||
{
|
{
|
||||||
double baseMaxFunc(Letter l1, Letter l2)
|
double maxDistFunc(Letter l1, Letter l2)
|
||||||
{
|
{
|
||||||
return Math.Max(Math.Max(Math.Max(
|
return Math.Max(Math.Max(Math.Max(Math.Max(Math.Max(
|
||||||
Math.Abs(l1.GlyphRectangle.Width),
|
Math.Abs(l1.GlyphRectangle.Width),
|
||||||
Math.Abs(l2.GlyphRectangle.Width)),
|
Math.Abs(l2.GlyphRectangle.Width)),
|
||||||
Math.Abs(l1.Width)),
|
Math.Abs(l1.Width)),
|
||||||
Math.Abs(l2.Width));
|
Math.Abs(l2.Width)),
|
||||||
|
l1.PointSize), l2.PointSize) * 0.2;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool filterFunc(Letter l1, Letter l2)
|
||||||
|
{
|
||||||
|
return !string.IsNullOrWhiteSpace(l2.Value);
|
||||||
}
|
}
|
||||||
|
|
||||||
List<Word> wordsH = GetWords(
|
List<Word> wordsH = GetWords(
|
||||||
letters.Where(l => l.TextDirection == TextDirection.Horizontal).ToList(),
|
letters.Where(l => l.TextDirection == TextDirection.Horizontal).ToList(),
|
||||||
(l1, l2) => baseMaxFunc(l1, l2) * 0.2,
|
(l1, l2) => maxDistFunc(l1, l2),
|
||||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
|
||||||
.OrderByDescending(x => x.BoundingBox.Bottom)
|
.OrderByDescending(x => x.BoundingBox.Bottom)
|
||||||
.ThenBy(x => x.BoundingBox.Left).ToList();
|
.ThenBy(x => x.BoundingBox.Left).ToList();
|
||||||
|
|
||||||
var words270 = GetWords(
|
var words270 = GetWords(
|
||||||
letters.Where(l => l.TextDirection == TextDirection.Rotate270).ToList(),
|
letters.Where(l => l.TextDirection == TextDirection.Rotate270).ToList(),
|
||||||
(l1, l2) => baseMaxFunc(l1, l2) * 0.2,
|
(l1, l2) => maxDistFunc(l1, l2),
|
||||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
|
||||||
.OrderBy(x => x.BoundingBox.Right)
|
.OrderBy(x => x.BoundingBox.Right)
|
||||||
.ThenByDescending(x => x.BoundingBox.Bottom);
|
.ThenByDescending(x => x.BoundingBox.Bottom);
|
||||||
wordsH.AddRange(words270);
|
wordsH.AddRange(words270);
|
||||||
|
|
||||||
var words180 = GetWords(
|
var words180 = GetWords(
|
||||||
letters.Where(l => l.TextDirection == TextDirection.Rotate180).ToList(),
|
letters.Where(l => l.TextDirection == TextDirection.Rotate180).ToList(),
|
||||||
(l1, l2) => baseMaxFunc(l1, l2) * 0.2,
|
(l1, l2) => maxDistFunc(l1, l2),
|
||||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
|
||||||
.OrderBy(x => x.BoundingBox.Top)
|
.OrderBy(x => x.BoundingBox.Top)
|
||||||
.ThenByDescending(x => x.BoundingBox.Right);
|
.ThenByDescending(x => x.BoundingBox.Right);
|
||||||
wordsH.AddRange(words180);
|
wordsH.AddRange(words180);
|
||||||
|
|
||||||
var words90 = GetWords(
|
var words90 = GetWords(
|
||||||
letters.Where(l => l.TextDirection == TextDirection.Rotate90).ToList(),
|
letters.Where(l => l.TextDirection == TextDirection.Rotate90).ToList(),
|
||||||
(l1, l2) => baseMaxFunc(l1, l2) * 0.2,
|
(l1, l2) => maxDistFunc(l1, l2),
|
||||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
|
||||||
.OrderByDescending(x => x.BoundingBox.Left)
|
.OrderByDescending(x => x.BoundingBox.Left)
|
||||||
.ThenBy(x => x.BoundingBox.Top);
|
.ThenBy(x => x.BoundingBox.Top);
|
||||||
wordsH.AddRange(words90);
|
wordsH.AddRange(words90);
|
||||||
|
|
||||||
var wordsU = GetWords(
|
var wordsU = GetWords(
|
||||||
letters.Where(l => l.TextDirection == TextDirection.Other).ToList(),
|
letters.Where(l => l.TextDirection == TextDirection.Other).ToList(),
|
||||||
(l1, l2) => baseMaxFunc(l1, l2) * 0.5,
|
(l1, l2) => maxDistFunc(l1, l2) * 2.0, // allow twice the distance for oriented text
|
||||||
Distances.Euclidean, MaxDegreeOfParallelism)
|
Distances.Euclidean, filterFunc, MaxDegreeOfParallelism)
|
||||||
.OrderByDescending(x => x.BoundingBox.Bottom)
|
.OrderByDescending(x => x.BoundingBox.Bottom)
|
||||||
.ThenBy(x => x.BoundingBox.Left);
|
.ThenBy(x => x.BoundingBox.Left);
|
||||||
wordsH.AddRange(wordsU);
|
wordsH.AddRange(wordsU);
|
||||||
@@ -83,34 +89,30 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Private method to get the words.
|
/// Gets the words.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="pageLetters">The letters in the page, they must have
|
/// <param name="pageLetters">The letters in the page.</param>
|
||||||
/// the same text directions.</param>
|
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two letters (start and end base line points),
|
||||||
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two Letters,
|
/// e.g. Max(GlyphRectangle.Width) x 20%.
|
||||||
/// e.g. Max(GlyphRectangle.Width) x 20%.</param>
|
/// <para>If the distance between the two letters is greater, a new word will be created.</para></param>
|
||||||
/// <param name="distMeasure">The distance measure between two start and end base line points,
|
/// <param name="distMeasure">The distance measure between two letters (start and end base line points),
|
||||||
/// e.g. the Manhattan distance.</param>
|
/// e.g. the Manhattan distance.</param>
|
||||||
|
/// <param name="filterFunction">Function used to filter out connection between letters, e.g. check if the letters have the same color.
|
||||||
|
/// <para>If the function returns false, a new word will be created.</para></param>
|
||||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||||
public List<Word> GetWords(IReadOnlyList<Letter> pageLetters,
|
public List<Word> GetWords(IReadOnlyList<Letter> pageLetters,
|
||||||
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
|
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||||
int maxDegreeOfParallelism)
|
Func<Letter, Letter, bool> filterFunction, int maxDegreeOfParallelism)
|
||||||
{
|
{
|
||||||
if (pageLetters == null || pageLetters.Count == 0) return new List<Word>();
|
if (pageLetters == null || pageLetters.Count == 0) return new List<Word>();
|
||||||
TextDirection textDirection = pageLetters[0].TextDirection;
|
|
||||||
|
|
||||||
if (pageLetters.Any(x => textDirection != x.TextDirection))
|
|
||||||
{
|
|
||||||
throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction.");
|
|
||||||
}
|
|
||||||
|
|
||||||
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(pageLetters,
|
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(pageLetters,
|
||||||
distMeasure, maxDistanceFunction,
|
distMeasure, maxDistanceFunction,
|
||||||
l => l.EndBaseLine, l => l.StartBaseLine,
|
l => l.EndBaseLine, l => l.StartBaseLine,
|
||||||
l => !string.IsNullOrWhiteSpace(l.Value),
|
l => !string.IsNullOrWhiteSpace(l.Value),
|
||||||
(l1, l2) => !string.IsNullOrWhiteSpace(l2.Value),
|
filterFunction,
|
||||||
maxDegreeOfParallelism).ToList();
|
maxDegreeOfParallelism).ToList();
|
||||||
|
|
||||||
List<Word> words = new List<Word>();
|
List<Word> words = new List<Word>();
|
||||||
|
@@ -61,9 +61,10 @@
|
|||||||
public IColor Color { get; }
|
public IColor Color { get; }
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The size of the font in points. This is not ready for public consumption as the calculation is incorrect.
|
/// The size of the font in points.
|
||||||
|
/// <para>This is considered experimental because the calculated value is incorrect for some documents at present.</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
internal double PointSize { get; }
|
public double PointSize { get; }
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Sequence number of the ShowText operation that printed this letter.
|
/// Sequence number of the ShowText operation that printed this letter.
|
||||||
|
@@ -191,15 +191,6 @@
|
|||||||
{
|
{
|
||||||
return annotationProvider.GetAnnotations();
|
return annotationProvider.GetAnnotations();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
/// Gets the calculated letter size in points.
|
|
||||||
/// This is considered experimental because the calculated value is incorrect for some documents at present.
|
|
||||||
/// </summary>
|
|
||||||
public double GetPointSize(Letter letter)
|
|
||||||
{
|
|
||||||
return letter.PointSize;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user