mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-20 20:07:57 +08:00
Improve default max function
This commit is contained in:
@@ -31,43 +31,52 @@
|
|||||||
/// <param name="letters">The letters in the page.</param>
|
/// <param name="letters">The letters in the page.</param>
|
||||||
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
|
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
|
||||||
{
|
{
|
||||||
|
Func<Letter, Letter, double> baseMaxFunc = (l1, l2) =>
|
||||||
|
{
|
||||||
|
return Math.Max(Math.Max(Math.Max(
|
||||||
|
Math.Abs(l1.GlyphRectangle.Width),
|
||||||
|
Math.Abs(l2.GlyphRectangle.Width)),
|
||||||
|
Math.Abs(l1.Width)),
|
||||||
|
Math.Abs(l2.Width));
|
||||||
|
};
|
||||||
|
|
||||||
List<Word> wordsH = GetWords(
|
List<Word> wordsH = GetWords(
|
||||||
letters.Where(l => l.TextDirection == TextDirection.Horizontal),
|
letters.Where(l => l.TextDirection == TextDirection.Horizontal).ToList(),
|
||||||
(l1, l2) => Math.Max(l1.GlyphRectangle.Width, l2.GlyphRectangle.Width) * 0.2,
|
(l1, l2) => baseMaxFunc(l1, l2) * 0.2,
|
||||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||||
.OrderByDescending(x => x.BoundingBox.Bottom)
|
.OrderByDescending(x => x.BoundingBox.Bottom)
|
||||||
.ThenBy(x => x.BoundingBox.Left).ToList();
|
.ThenBy(x => x.BoundingBox.Left).ToList();
|
||||||
|
|
||||||
List<Word> words270 = GetWords(
|
var words270 = GetWords(
|
||||||
letters.Where(l => l.TextDirection == TextDirection.Rotate270),
|
letters.Where(l => l.TextDirection == TextDirection.Rotate270).ToList(),
|
||||||
(l1, l2) => Math.Max(l1.GlyphRectangle.Width, l2.GlyphRectangle.Width) * 0.2,
|
(l1, l2) => baseMaxFunc(l1, l2) * 0.2,
|
||||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||||
.OrderBy(x => x.BoundingBox.Right)
|
.OrderBy(x => x.BoundingBox.Right)
|
||||||
.ThenByDescending(x => x.BoundingBox.Bottom).ToList();
|
.ThenByDescending(x => x.BoundingBox.Bottom);
|
||||||
wordsH.AddRange(words270);
|
wordsH.AddRange(words270);
|
||||||
|
|
||||||
List<Word> words180 = GetWords(
|
var words180 = GetWords(
|
||||||
letters.Where(l => l.TextDirection == TextDirection.Rotate180),
|
letters.Where(l => l.TextDirection == TextDirection.Rotate180).ToList(),
|
||||||
(l1, l2) => Math.Max(Math.Abs(l1.GlyphRectangle.Width), Math.Abs(l2.GlyphRectangle.Width)) * 0.2,
|
(l1, l2) => baseMaxFunc(l1, l2) * 0.2,
|
||||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||||
.OrderBy(x => x.BoundingBox.Top)
|
.OrderBy(x => x.BoundingBox.Top)
|
||||||
.ThenByDescending(x => x.BoundingBox.Right).ToList();
|
.ThenByDescending(x => x.BoundingBox.Right);
|
||||||
wordsH.AddRange(words180);
|
wordsH.AddRange(words180);
|
||||||
|
|
||||||
List<Word> words90 = GetWords(
|
var words90 = GetWords(
|
||||||
letters.Where(l => l.TextDirection == TextDirection.Rotate90),
|
letters.Where(l => l.TextDirection == TextDirection.Rotate90).ToList(),
|
||||||
(l1, l2) => Math.Max(Math.Abs(l1.GlyphRectangle.Width), Math.Abs(l2.GlyphRectangle.Width)) * 0.2,
|
(l1, l2) => baseMaxFunc(l1, l2) * 0.2,
|
||||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||||
.OrderByDescending(x => x.BoundingBox.Left)
|
.OrderByDescending(x => x.BoundingBox.Left)
|
||||||
.ThenBy(x => x.BoundingBox.Top).ToList();
|
.ThenBy(x => x.BoundingBox.Top);
|
||||||
wordsH.AddRange(words90);
|
wordsH.AddRange(words90);
|
||||||
|
|
||||||
List<Word> wordsU = GetWords(
|
var wordsU = GetWords(
|
||||||
letters.Where(l => l.TextDirection == TextDirection.Other),
|
letters.Where(l => l.TextDirection == TextDirection.Other).ToList(),
|
||||||
(l1, l2) => Math.Max(Math.Abs(l1.GlyphRectangle.Width), Math.Abs(l2.GlyphRectangle.Width)) * 0.5,
|
(l1, l2) => baseMaxFunc(l1, l2) * 0.5,
|
||||||
Distances.Euclidean, MaxDegreeOfParallelism)
|
Distances.Euclidean, MaxDegreeOfParallelism)
|
||||||
.OrderByDescending(x => x.BoundingBox.Bottom)
|
.OrderByDescending(x => x.BoundingBox.Bottom)
|
||||||
.ThenBy(x => x.BoundingBox.Left).ToList();
|
.ThenBy(x => x.BoundingBox.Left);
|
||||||
wordsH.AddRange(wordsU);
|
wordsH.AddRange(wordsU);
|
||||||
|
|
||||||
return wordsH;
|
return wordsH;
|
||||||
@@ -85,7 +94,7 @@
|
|||||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||||
public List<Word> GetWords(IEnumerable<Letter> pageLetters,
|
public List<Word> GetWords(IReadOnlyList<Letter> pageLetters,
|
||||||
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
|
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||||
int maxDegreeOfParallelism)
|
int maxDegreeOfParallelism)
|
||||||
{
|
{
|
||||||
@@ -97,9 +106,7 @@
|
|||||||
throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction.");
|
throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction.");
|
||||||
}
|
}
|
||||||
|
|
||||||
Letter[] letters = pageLetters.ToArray();
|
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(pageLetters,
|
||||||
|
|
||||||
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(letters,
|
|
||||||
distMeasure, maxDistanceFunction,
|
distMeasure, maxDistanceFunction,
|
||||||
l => l.EndBaseLine, l => l.StartBaseLine,
|
l => l.EndBaseLine, l => l.StartBaseLine,
|
||||||
l => !string.IsNullOrWhiteSpace(l.Value),
|
l => !string.IsNullOrWhiteSpace(l.Value),
|
||||||
@@ -107,9 +114,9 @@
|
|||||||
maxDegreeOfParallelism).ToList();
|
maxDegreeOfParallelism).ToList();
|
||||||
|
|
||||||
List<Word> words = new List<Word>();
|
List<Word> words = new List<Word>();
|
||||||
for (int a = 0; a < groupedIndexes.Count(); a++)
|
for (int a = 0; a < groupedIndexes.Count; a++)
|
||||||
{
|
{
|
||||||
words.Add(new Word(groupedIndexes[a].Select(i => letters[i]).ToList()));
|
words.Add(new Word(groupedIndexes[a].Select(i => pageLetters[i]).ToList()));
|
||||||
}
|
}
|
||||||
|
|
||||||
return words;
|
return words;
|
||||||
|
Reference in New Issue
Block a user