using System;
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Geometry;
using UglyToad.PdfPig.Util;

namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
    /// <summary>
    /// Nearest Neighbour Word Extractor, using the Manhattan distance.
    /// This implementation leverages bounding boxes.
    /// </summary>
    public class NearestNeighbourWordExtractor : IWordExtractor
    {
        /// <summary>
        /// Fraction of the larger glyph dimension (width or height, depending on the text
        /// direction) used as the maximum distance allowed between two neighbouring letters.
        /// </summary>
        private const double MaxDistanceRatio = 0.2;

        /// <summary>
        /// Create an instance of Nearest Neighbour Word Extractor, <see cref="NearestNeighbourWordExtractor"/>.
        /// </summary>
        public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();

        /// <summary>
        /// Gets or sets the maximum number of concurrent tasks enabled. Default value is -1.
        /// <para>A positive property value limits the number of concurrent operations to the set value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para>
        /// </summary>
        public int MaxDegreeOfParallelism { get; set; } = -1;

        /// <summary>
        /// Gets the words.
        /// <para>Letters are processed separately for each text direction. The returned sequence
        /// contains the horizontal words first, followed by the words rotated by 180, 90 and 270
        /// degrees, and finally the words whose direction is unknown. Each group is sorted
        /// on its own before being appended.</para>
        /// </summary>
        /// <param name="letters">The letters in the page.</param>
        /// <returns>The words built from the letters, grouped by text direction.</returns>
        public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
        {
            List<Word> words = new List<Word>();

            // Horizontal text: glyph widths drive the threshold.
            words.AddRange(
                GetWords(letters.Where(l => l.TextDirection == TextDirection.Horizontal),
                        MaxDistanceFromWidth, Distances.Manhattan, MaxDegreeOfParallelism)
                    .OrderByDescending(x => x.BoundingBox.Bottom)
                    .ThenBy(x => x.BoundingBox.Left));

            // Text rotated by 180 degrees: same threshold, mirrored sort keys.
            words.AddRange(
                GetWords(letters.Where(l => l.TextDirection == TextDirection.Rotate180),
                        MaxDistanceFromWidth, Distances.Manhattan, MaxDegreeOfParallelism)
                    .OrderBy(x => x.BoundingBox.Top)
                    .ThenByDescending(x => x.BoundingBox.Right));

            // Text rotated by 90 degrees: glyph heights drive the threshold.
            words.AddRange(
                GetWords(letters.Where(l => l.TextDirection == TextDirection.Rotate90),
                        MaxDistanceFromHeight, Distances.Manhattan, MaxDegreeOfParallelism)
                    .OrderByDescending(x => x.BoundingBox.Left)
                    .ThenBy(x => x.BoundingBox.Top));

            // Text rotated by 270 degrees: same threshold, mirrored sort keys.
            words.AddRange(
                GetWords(letters.Where(l => l.TextDirection == TextDirection.Rotate270),
                        MaxDistanceFromHeight, Distances.Manhattan, MaxDegreeOfParallelism)
                    .OrderBy(x => x.BoundingBox.Right)
                    .ThenByDescending(x => x.BoundingBox.Bottom));

            // Unknown direction: handled with the same settings as horizontal text.
            words.AddRange(
                GetWords(letters.Where(l => l.TextDirection == TextDirection.Unknown),
                        MaxDistanceFromWidth, Distances.Manhattan, MaxDegreeOfParallelism)
                    .OrderByDescending(x => x.BoundingBox.Bottom)
                    .ThenBy(x => x.BoundingBox.Left));

            return words;
        }

        /// <summary>
        /// Maximum distance allowed between two letters, based on the wider of the two glyphs.
        /// </summary>
        private static double MaxDistanceFromWidth(Letter l1, Letter l2)
        {
            return Math.Max((double)(l1.GlyphRectangle.Width), (double)(l2.GlyphRectangle.Width)) * MaxDistanceRatio;
        }

        /// <summary>
        /// Maximum distance allowed between two letters, based on the taller of the two glyphs.
        /// </summary>
        private static double MaxDistanceFromHeight(Letter l1, Letter l2)
        {
            return Math.Max((double)(l1.GlyphRectangle.Height), (double)(l2.GlyphRectangle.Height)) * MaxDistanceRatio;
        }

        /// <summary>
        /// Private method to get the words.
        /// </summary>
        /// <param name="pageLetters">The letters in the page, they must have
        /// the same text directions.</param>
        /// <param name="maxDistanceFunction">The function that determines the maximum distance between two Letters,
        /// e.g. Max(GlyphRectangle.Width) x 20%.</param>
        /// <param name="distMeasure">The distance measure between two start and end base line points,
        /// e.g. the Manhattan distance.</param>
        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
        /// <para>A positive property value limits the number of concurrent operations to the set value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <returns>One word per cluster of letters; an empty list when
        /// <paramref name="pageLetters"/> is null or empty.</returns>
        /// <exception cref="ArgumentException">The letters do not all share the same text direction.</exception>
        private List<Word> GetWords(IEnumerable<Letter> pageLetters,
            Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
            int maxDegreeOfParallelism)
        {
            if (pageLetters == null)
            {
                return new List<Word>();
            }

            // Materialise once: callers pass deferred LINQ queries, and every further
            // enumeration would re-run the filter over the whole page.
            Letter[] letters = pageLetters.ToArray();
            if (letters.Length == 0)
            {
                return new List<Word>();
            }

            TextDirection textDirection = letters[0].TextDirection;
            if (letters.Any(x => textDirection != x.TextDirection))
            {
                throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction.");
            }

            // Choose how the letters of a single word are ordered for this text direction.
            Func<IEnumerable<Letter>, IReadOnlyList<Letter>> orderFunc;
            switch (textDirection)
            {
                case TextDirection.Rotate180:
                    orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Right).ToList();
                    break;

                case TextDirection.Rotate90:
                    orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Top).ToList();
                    break;

                case TextDirection.Rotate270:
                    orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Bottom).ToList();
                    break;

                default:
                    // Horizontal and unknown directions are ordered left to right.
                    orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();
                    break;
            }

            // Each returned group holds indexes into 'letters'. The lambdas supply the end and
            // start base line points of a letter, exclude whitespace letters, and only accept
            // pairs of letters whose font names match (ignoring case).
            var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(letters,
                distMeasure, maxDistanceFunction,
                l => l.EndBaseLine, l => l.StartBaseLine,
                l => !string.IsNullOrWhiteSpace(l.Value),
                (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase)
                            && !string.IsNullOrWhiteSpace(l2.Value),
                maxDegreeOfParallelism);

            return groupedIndexes
                .Select(group => new Word(orderFunc(group.Select(i => letters[i]))))
                .ToList();
        }
    }
}