From 0a6ec3946b12f7b22633dfa46cb22c629cc18baf Mon Sep 17 00:00:00 2001 From: BobLd Date: Mon, 20 Apr 2020 13:09:35 +0100 Subject: [PATCH] NearestNeighbourWordExtractor: - Improve results by using PointSize - Make 'filterFunction' public for ad hoc GetWords() - Allow text in different direction Make Letter.PointSize public and add warning (needed for NNWordExtractor) Remove Page.GetPointSize(Letter letter) --- .../ContentOrderTextExtractor.cs | 4 +- .../NearestNeighbourWordExtractor.cs | 60 ++++++++++--------- src/UglyToad.PdfPig/Content/Letter.cs | 5 +- src/UglyToad.PdfPig/Content/Page.cs | 9 --- 4 files changed, 36 insertions(+), 42 deletions(-) diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs index 6371e143..0085307b 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs @@ -107,8 +107,8 @@ return false; } - var ptSizePrevious = (int)Math.Round(page.ExperimentalAccess.GetPointSize(previous)); - var ptSize = (int)Math.Round(page.ExperimentalAccess.GetPointSize(letter)); + var ptSizePrevious = (int)Math.Round(previous.PointSize); + var ptSize = (int)Math.Round(letter.PointSize); var minPtSize = ptSize < ptSizePrevious ? ptSize : ptSizePrevious; var gap = Math.Abs(previous.StartBaseLine.Y - letter.StartBaseLine.Y); diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs index 05989bfe..be3f87c6 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs @@ -8,7 +8,7 @@ using Util; /// - /// Nearest Neighbour Word Extractor, using the distance. + /// Nearest Neighbour Word Extractor. /// This implementation leverages bounding boxes. /// public class NearestNeighbourWordExtractor : IWordExtractor @@ -26,55 +26,61 @@ public int MaxDegreeOfParallelism { get; set; } = -1; /// - /// Gets the words. + /// Gets the words, using the distance. /// /// The letters in the page. public IEnumerable GetWords(IReadOnlyList letters) { - double baseMaxFunc(Letter l1, Letter l2) + double maxDistFunc(Letter l1, Letter l2) { - return Math.Max(Math.Max(Math.Max( + return Math.Max(Math.Max(Math.Max(Math.Max(Math.Max( Math.Abs(l1.GlyphRectangle.Width), Math.Abs(l2.GlyphRectangle.Width)), Math.Abs(l1.Width)), - Math.Abs(l2.Width)); + Math.Abs(l2.Width)), + l1.PointSize), l2.PointSize) * 0.2; + } + + bool filterFunc(Letter l1, Letter l2) + { + return !string.IsNullOrWhiteSpace(l2.Value); } List wordsH = GetWords( letters.Where(l => l.TextDirection == TextDirection.Horizontal).ToList(), - (l1, l2) => baseMaxFunc(l1, l2) * 0.2, - Distances.Manhattan, MaxDegreeOfParallelism) + (l1, l2) => maxDistFunc(l1, l2), + Distances.Manhattan, filterFunc, MaxDegreeOfParallelism) .OrderByDescending(x => x.BoundingBox.Bottom) .ThenBy(x => x.BoundingBox.Left).ToList(); var words270 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate270).ToList(), - (l1, l2) => baseMaxFunc(l1, l2) * 0.2, - Distances.Manhattan, MaxDegreeOfParallelism) + (l1, l2) => maxDistFunc(l1, l2), + Distances.Manhattan, filterFunc, MaxDegreeOfParallelism) .OrderBy(x => x.BoundingBox.Right) .ThenByDescending(x => x.BoundingBox.Bottom); wordsH.AddRange(words270); var words180 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate180).ToList(), - (l1, l2) => baseMaxFunc(l1, l2) * 0.2, - Distances.Manhattan, MaxDegreeOfParallelism) + (l1, l2) => maxDistFunc(l1, l2), + Distances.Manhattan, filterFunc, MaxDegreeOfParallelism) .OrderBy(x => x.BoundingBox.Top) .ThenByDescending(x => x.BoundingBox.Right); wordsH.AddRange(words180); var words90 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate90).ToList(), - (l1, l2) => baseMaxFunc(l1, l2) * 0.2, - Distances.Manhattan, MaxDegreeOfParallelism) + (l1, l2) => maxDistFunc(l1, l2), + Distances.Manhattan, filterFunc, MaxDegreeOfParallelism) .OrderByDescending(x => x.BoundingBox.Left) .ThenBy(x => x.BoundingBox.Top); wordsH.AddRange(words90); var wordsU = GetWords( letters.Where(l => l.TextDirection == TextDirection.Other).ToList(), - (l1, l2) => baseMaxFunc(l1, l2) * 0.5, - Distances.Euclidean, MaxDegreeOfParallelism) + (l1, l2) => maxDistFunc(l1, l2) * 2.0, // allow twice the distance for oriented text + Distances.Euclidean, filterFunc, MaxDegreeOfParallelism) .OrderByDescending(x => x.BoundingBox.Bottom) .ThenBy(x => x.BoundingBox.Left); wordsH.AddRange(wordsU); @@ -83,34 +89,30 @@ } /// - /// Private method to get the words. + /// Gets the words. /// - /// The letters in the page, they must have - /// the same text directions. - /// The function that determines the maximum distance between two Letters, - /// e.g. Max(GlyphRectangle.Width) x 20%. - /// The distance measure between two start and end base line points, + /// The letters in the page. + /// The function that determines the maximum distance between two letters (start and end base line points), + /// e.g. Max(GlyphRectangle.Width) x 20%. + /// If the distance between the two letters is greater, a new word will be created. + /// The distance measure between two letters (start and end base line points), /// e.g. the Manhattan distance. + /// Function used to filter out connection between letters, e.g. check if the letters have the same color. + /// If the function returns false, a new word will be created. /// Sets the maximum number of concurrent tasks enabled. /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. public List GetWords(IReadOnlyList pageLetters, Func maxDistanceFunction, Func distMeasure, - int maxDegreeOfParallelism) + Func filterFunction, int maxDegreeOfParallelism) { if (pageLetters == null || pageLetters.Count == 0) return new List(); - TextDirection textDirection = pageLetters[0].TextDirection; - - if (pageLetters.Any(x => textDirection != x.TextDirection)) - { - throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction."); - } var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(pageLetters, distMeasure, maxDistanceFunction, l => l.EndBaseLine, l => l.StartBaseLine, l => !string.IsNullOrWhiteSpace(l.Value), - (l1, l2) => !string.IsNullOrWhiteSpace(l2.Value), + filterFunction, maxDegreeOfParallelism).ToList(); List words = new List(); diff --git a/src/UglyToad.PdfPig/Content/Letter.cs b/src/UglyToad.PdfPig/Content/Letter.cs index 0213d978..a8f27d1b 100644 --- a/src/UglyToad.PdfPig/Content/Letter.cs +++ b/src/UglyToad.PdfPig/Content/Letter.cs @@ -61,9 +61,10 @@ public IColor Color { get; } /// - /// The size of the font in points. This is not ready for public consumption as the calculation is incorrect. + /// The size of the font in points. + /// This is considered experimental because the calculated value is incorrect for some documents at present. /// - internal double PointSize { get; } + public double PointSize { get; } /// /// Sequence number of the ShowText operation that printed this letter. diff --git a/src/UglyToad.PdfPig/Content/Page.cs b/src/UglyToad.PdfPig/Content/Page.cs index 0aaf17c5..faf297a9 100644 --- a/src/UglyToad.PdfPig/Content/Page.cs +++ b/src/UglyToad.PdfPig/Content/Page.cs @@ -191,15 +191,6 @@ { return annotationProvider.GetAnnotations(); } - - /// - /// Gets the calculated letter size in points. - /// This is considered experimental because the calculated value is incorrect for some documents at present. - /// - public double GetPointSize(Letter letter) - { - return letter.PointSize; - } } } }