diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index bc3dcaf9..e1f3fa6a 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -215,6 +215,7 @@ "UglyToad.PdfPig.Tokens.StreamToken", "UglyToad.PdfPig.Tokens.StringToken", "UglyToad.PdfPig.Util.IWordExtractor", + "UglyToad.PdfPig.Util.DefaultWordExtractor", "UglyToad.PdfPig.Writer.PdfDocumentBuilder", "UglyToad.PdfPig.Writer.PdfPageBuilder", "UglyToad.PdfPig.Writer.TokenWriter", diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs index 3efb19be..1038530d 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs @@ -26,34 +26,39 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { List wordsH = GetWords( letters.Where(l => l.TextDirection == TextDirection.Horizontal), - l => l.GlyphRectangle.Width, Distances.Manhattan) + (l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2, + Distances.Manhattan) .OrderByDescending(x => x.BoundingBox.Bottom) .ThenBy(x => x.BoundingBox.Left).ToList(); List words180 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate180), - l => l.GlyphRectangle.Width, Distances.Manhattan) + (l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2, + Distances.Manhattan) .OrderBy(x => x.BoundingBox.Top) .ThenByDescending(x => x.BoundingBox.Right).ToList(); wordsH.AddRange(words180); List words90 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate90), - l => l.GlyphRectangle.Height, Distances.Manhattan) + (l1, l2) => Math.Max((double)(l1.GlyphRectangle.Height), (double)(l2.GlyphRectangle.Height)) * 0.2, + 
Distances.Manhattan) .OrderByDescending(x => x.BoundingBox.Left) .ThenBy(x => x.BoundingBox.Top).ToList(); wordsH.AddRange(words90); List words270 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate270), - l => l.GlyphRectangle.Height, Distances.Manhattan) + (l1, l2) => Math.Max((double)(l1.GlyphRectangle.Height), (double)(l2.GlyphRectangle.Height)) * 0.2, + Distances.Manhattan) .OrderBy(x => x.BoundingBox.Right) .ThenByDescending(x => x.BoundingBox.Bottom).ToList(); wordsH.AddRange(words270); List wordsU = GetWords( letters.Where(l => l.TextDirection == TextDirection.Unknown), - l => l.GlyphRectangle.Width, Distances.Manhattan) + (l1, l2) => Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * 0.2, + Distances.Manhattan) .OrderByDescending(x => x.BoundingBox.Bottom) .ThenBy(x => x.BoundingBox.Left).ToList(); wordsH.AddRange(wordsU); @@ -66,12 +71,12 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// The letters in the page, they must have /// the same text directions. - /// The letter's metric to use in the minimum distance - /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height. + /// The function that determines the maximum distance between two Letters, + /// e.g. Max(GlyphRectangle.Width) x 20%. /// The distance measure between two start and end base line points, /// e.g. the Manhattan distance. 
private List GetWords(IEnumerable pageLetters, - Func metric, Func distMeasure) + Func maxDistanceFunction, Func distMeasure) { if (pageLetters == null || pageLetters.Count() == 0) return new List(); TextDirection textDirection = pageLetters.ElementAt(0).TextDirection; @@ -98,8 +103,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis Letter[] letters = pageLetters.ToArray(); var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(letters, - distMeasure, - (l1, l2) => Math.Max((double)metric(l1), (double)metric(l2)) * 0.60, + distMeasure, maxDistanceFunction, l => l.EndBaseLine, l => l.StartBaseLine, l => !string.IsNullOrWhiteSpace(l.Value), (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value)).ToList(); diff --git a/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs b/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs index 89f41b6b..dacb37c2 100644 --- a/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs +++ b/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs @@ -5,8 +5,15 @@ using System.Linq; using Content; - internal class DefaultWordExtractor : IWordExtractor + /// + /// Default Word Extractor. + /// + public class DefaultWordExtractor : IWordExtractor { + /// + /// Gets the words. + /// + /// The letters in the page. public IEnumerable GetWords(IReadOnlyList letters) { var lettersOrder = letters.OrderByDescending(x => x.Location.Y) @@ -99,6 +106,9 @@ return new Word(letters.ToList()); } + /// + /// Create an instance of Default Word Extractor. + /// public static IWordExtractor Instance { get; } = new DefaultWordExtractor(); private DefaultWordExtractor()