diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
index 6371e143..0085307b 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
@@ -107,8 +107,8 @@
return false;
}
- var ptSizePrevious = (int)Math.Round(page.ExperimentalAccess.GetPointSize(previous));
- var ptSize = (int)Math.Round(page.ExperimentalAccess.GetPointSize(letter));
+ var ptSizePrevious = (int)Math.Round(previous.PointSize);
+ var ptSize = (int)Math.Round(letter.PointSize);
var minPtSize = ptSize < ptSizePrevious ? ptSize : ptSizePrevious;
var gap = Math.Abs(previous.StartBaseLine.Y - letter.StartBaseLine.Y);
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
index 05989bfe..be3f87c6 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
@@ -8,7 +8,7 @@
using Util;
///
- /// Nearest Neighbour Word Extractor, using the distance.
+ /// Nearest Neighbour Word Extractor.
/// This implementation leverages bounding boxes.
///
public class NearestNeighbourWordExtractor : IWordExtractor
@@ -26,55 +26,61 @@
public int MaxDegreeOfParallelism { get; set; } = -1;
///
- /// Gets the words.
+ /// Gets the words, using the Manhattan distance (Euclidean distance for letters whose text direction is Other).
///
/// The letters in the page.
public IEnumerable GetWords(IReadOnlyList letters)
{
- double baseMaxFunc(Letter l1, Letter l2)
+ double maxDistFunc(Letter l1, Letter l2)
{
- return Math.Max(Math.Max(Math.Max(
+ return Math.Max(Math.Max(Math.Max(Math.Max(Math.Max(
Math.Abs(l1.GlyphRectangle.Width),
Math.Abs(l2.GlyphRectangle.Width)),
Math.Abs(l1.Width)),
- Math.Abs(l2.Width));
+ Math.Abs(l2.Width)),
+ l1.PointSize), l2.PointSize) * 0.2;
+ }
+
+ bool filterFunc(Letter l1, Letter l2)
+ {
+ return !string.IsNullOrWhiteSpace(l2.Value);
}
List wordsH = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Horizontal).ToList(),
- (l1, l2) => baseMaxFunc(l1, l2) * 0.2,
- Distances.Manhattan, MaxDegreeOfParallelism)
+ (l1, l2) => maxDistFunc(l1, l2),
+ Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Bottom)
.ThenBy(x => x.BoundingBox.Left).ToList();
var words270 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate270).ToList(),
- (l1, l2) => baseMaxFunc(l1, l2) * 0.2,
- Distances.Manhattan, MaxDegreeOfParallelism)
+ (l1, l2) => maxDistFunc(l1, l2),
+ Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderBy(x => x.BoundingBox.Right)
.ThenByDescending(x => x.BoundingBox.Bottom);
wordsH.AddRange(words270);
var words180 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate180).ToList(),
- (l1, l2) => baseMaxFunc(l1, l2) * 0.2,
- Distances.Manhattan, MaxDegreeOfParallelism)
+ (l1, l2) => maxDistFunc(l1, l2),
+ Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderBy(x => x.BoundingBox.Top)
.ThenByDescending(x => x.BoundingBox.Right);
wordsH.AddRange(words180);
var words90 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate90).ToList(),
- (l1, l2) => baseMaxFunc(l1, l2) * 0.2,
- Distances.Manhattan, MaxDegreeOfParallelism)
+ (l1, l2) => maxDistFunc(l1, l2),
+ Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Left)
.ThenBy(x => x.BoundingBox.Top);
wordsH.AddRange(words90);
var wordsU = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Other).ToList(),
- (l1, l2) => baseMaxFunc(l1, l2) * 0.5,
- Distances.Euclidean, MaxDegreeOfParallelism)
+ (l1, l2) => maxDistFunc(l1, l2) * 2.0, // allow twice the distance for oriented text
+ Distances.Euclidean, filterFunc, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Bottom)
.ThenBy(x => x.BoundingBox.Left);
wordsH.AddRange(wordsU);
@@ -83,34 +89,30 @@
}
///
- /// Private method to get the words.
+ /// Gets the words.
///
- /// The letters in the page, they must have
- /// the same text directions.
- /// The function that determines the maximum distance between two Letters,
- /// e.g. Max(GlyphRectangle.Width) x 20%.
- /// The distance measure between two start and end base line points,
+ /// The letters in the page.
+ /// The function that determines the maximum distance between two letters (start and end base line points),
+ /// e.g. Max(GlyphRectangle.Width) x 20%.
+ /// If the distance between the two letters is greater, a new word will be created.
+ /// The distance measure between two letters (start and end base line points),
/// e.g. the Manhattan distance.
+ /// Function used to filter out connections between letters, e.g. to check whether the letters have the same color.
+ /// If the function returns false, a new word will be created.
/// Sets the maximum number of concurrent tasks enabled.
/// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
public List GetWords(IReadOnlyList pageLetters,
Func maxDistanceFunction, Func distMeasure,
- int maxDegreeOfParallelism)
+ Func filterFunction, int maxDegreeOfParallelism)
{
if (pageLetters == null || pageLetters.Count == 0) return new List();
- TextDirection textDirection = pageLetters[0].TextDirection;
-
- if (pageLetters.Any(x => textDirection != x.TextDirection))
- {
- throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction.");
- }
var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(pageLetters,
distMeasure, maxDistanceFunction,
l => l.EndBaseLine, l => l.StartBaseLine,
l => !string.IsNullOrWhiteSpace(l.Value),
- (l1, l2) => !string.IsNullOrWhiteSpace(l2.Value),
+ filterFunction,
maxDegreeOfParallelism).ToList();
List words = new List();
diff --git a/src/UglyToad.PdfPig/Content/Letter.cs b/src/UglyToad.PdfPig/Content/Letter.cs
index 0213d978..a8f27d1b 100644
--- a/src/UglyToad.PdfPig/Content/Letter.cs
+++ b/src/UglyToad.PdfPig/Content/Letter.cs
@@ -61,9 +61,10 @@
public IColor Color { get; }
///
- /// The size of the font in points. This is not ready for public consumption as the calculation is incorrect.
+ /// The size of the font in points.
+ /// This is considered experimental because the calculated value is incorrect for some documents at present.
///
- internal double PointSize { get; }
+ public double PointSize { get; }
///
/// Sequence number of the ShowText operation that printed this letter.
diff --git a/src/UglyToad.PdfPig/Content/Page.cs b/src/UglyToad.PdfPig/Content/Page.cs
index 0aaf17c5..faf297a9 100644
--- a/src/UglyToad.PdfPig/Content/Page.cs
+++ b/src/UglyToad.PdfPig/Content/Page.cs
@@ -191,15 +191,6 @@
{
return annotationProvider.GetAnnotations();
}
-
- ///
- /// Gets the calculated letter size in points.
- /// This is considered experimental because the calculated value is incorrect for some documents at present.
- ///
- public double GetPointSize(Letter letter)
- {
- return letter.PointSize;
- }
}
}
}