From eb9a9fd00ecb14409a132e0d20e07c4fbce24acc Mon Sep 17 00:00:00 2001 From: BobLd Date: Sat, 10 Aug 2019 16:01:27 +0100 Subject: [PATCH] Document Layout Analysis - IPageSegmenter, Docstrum - Create a TextBlock class - Creates IPageSegmenter - Add other useful distances: angle, etc. - Update RecursiveXYCut - With IPageSegmenter and TextBlock - Make XYNode and XYLeaf internal - Optimise (faster) NearestNeighbourWordExtractor and isolate the clustering algorithms for use outside of this class - Implement a Docstrum inspired page segmentation algorithm --- .../PublicApiScannerTests.cs | 6 +- src/UglyToad.PdfPig/Content/TextBlock.cs | 68 ++++++ .../ClusteringAlgorithms.cs | 164 ++++++++++++++ .../DocumentLayoutAnalysis/Distances.cs | 33 +++ .../DocumentLayoutAnalysis/DocstrumBB.cs | 212 ++++++++++++++++++ .../DocumentLayoutAnalysis/IPageSegmenter.cs | 19 ++ .../NearestNeighbourWordExtractor .cs | 115 +--------- .../DocumentLayoutAnalysis/RecursiveXYCut.cs | 42 +++- .../DocumentLayoutAnalysis/XYLeaf.cs | 2 +- .../DocumentLayoutAnalysis/XYNode.cs | 2 +- 10 files changed, 544 insertions(+), 119 deletions(-) create mode 100644 src/UglyToad.PdfPig/Content/TextBlock.cs create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 865a6240..273f59f7 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -54,17 +54,19 @@ "UglyToad.PdfPig.Content.PageSize", "UglyToad.PdfPig.Content.Word", "UglyToad.PdfPig.Content.TextLine", + "UglyToad.PdfPig.Content.TextBlock", "UglyToad.PdfPig.Content.TextDirection", "UglyToad.PdfPig.Core.TransformationMatrix", "UglyToad.PdfPig.CrossReference.CrossReferenceTable", 
"UglyToad.PdfPig.CrossReference.CrossReferenceType", "UglyToad.PdfPig.CrossReference.TrailerDictionary", + "UglyToad.PdfPig.DocumentLayoutAnalysis.ClusteringAlgorithms", "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances", + "UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBB", + "UglyToad.PdfPig.DocumentLayoutAnalysis.IPageSegmenter", "UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions", "UglyToad.PdfPig.DocumentLayoutAnalysis.NearestNeighbourWordExtractor", "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", - "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode", - "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf", "UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor", "UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", diff --git a/src/UglyToad.PdfPig/Content/TextBlock.cs b/src/UglyToad.PdfPig/Content/TextBlock.cs new file mode 100644 index 00000000..85c10b63 --- /dev/null +++ b/src/UglyToad.PdfPig/Content/TextBlock.cs @@ -0,0 +1,68 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.Content +{ + /// + /// A block of text. + /// + public class TextBlock + { + /// + /// The text of the block. + /// + public string Text { get; } + + /// + /// The text direction of the block. + /// + public TextDirection TextDirection { get; } + + /// + /// The rectangle completely containing the block. + /// + public PdfRectangle BoundingBox { get; } + + /// + /// The text lines contained in the block. + /// + public IReadOnlyList TextLines { get; } + + /// + /// Create a new . 
+ /// + /// + public TextBlock(IReadOnlyList lines) + { + if (lines == null) + { + throw new ArgumentNullException(nameof(lines)); + } + + if (lines.Count == 0) + { + throw new ArgumentException("Empty lines provided.", nameof(lines)); + } + + TextLines = lines; + + Text = string.Join(" ", lines.Select(x => x.Text)); + + var minX = lines.Min(x => x.BoundingBox.Left); + var minY = lines.Min(x => x.BoundingBox.Bottom); + var maxX = lines.Max(x => x.BoundingBox.Right); + var maxY = lines.Max(x => x.BoundingBox.Top); + BoundingBox = new PdfRectangle(minX, minY, maxX, maxY); + + TextDirection = lines[0].TextDirection; + } + + /// + public override string ToString() + { + return Text; + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs new file mode 100644 index 00000000..be7a8299 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs @@ -0,0 +1,164 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Clustering Algorithms. + /// + internal class ClusteringAlgorithms + { + /// + /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance. + /// https://en.wikipedia.org/wiki/Transitive_closure + /// + /// Letter, Word, TextLine, etc. + /// Array of elements to group. + /// The distance measure between two points. + /// The function that determines the distance between two points in the same cluster. + /// The pivot's point to use. + /// The candidates to pair point to use. + /// Filter to apply to the pivot point. + /// Filter to apply to both the pivot and the paired point. 
+ internal static IEnumerable> SimpleTransitiveClosure(T[] elements, + Func distMeasure, + Func maxDistanceFunction, + Func pivotPoint, Func candidatesPoint, + Func filterPivot, Func filterFinal) + { + /************************************************************************************* + * Algorithm steps + * 1. Find nearest neighbours indexes (done in parallel) + * Iterate every point (pivot) and put its nearest neighbour's index in an array + * e.g. if nearest neighbour of point i is point j, then indexes[i] = j. + * Only considers a neighbour if it is within the maximum distance. + * If not within the maximum distance, index will be set to -1. + * NB: Given the possible asymmetry in the relationship, it is possible + * that if indexes[i] = j then indexes[j] != i. + * + * 2. Group indexes + * Group indexes if they share neighbours in common - Transitive closure + * e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1 + * (i,j,k) will form a group and (m,n) will form another group. + * + * 3. Merge groups that have indexes in common - If any + * If there are groups with indexes in common, merge them. + * (Could be improved and put in step 2) + *************************************************************************************/ + + int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray(); + var candidatesPoints = elements.Select(x => candidatesPoint(x)).ToList(); + + // 1. Find nearest neighbours indexes + Parallel.For(0, elements.Length, e => + { + var pivot = elements[e]; + + if (filterPivot(pivot)) + { + int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist); + var paired = elements[index]; + + if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired)) + { + indexes[e] = index; + } + } + }); + + // 2. 
Group indexes + List> groupedIndexes = new List>(); + HashSet indexDone = new HashSet(); + + for (int e = 0; e < elements.Length; e++) + { + int index = indexes[e]; + + if (index == -1) // This element is not connected + { + // Check if another element index is connected to this element (nb: distance measure is asymmetric) + if (!indexes.Contains(e)) + { + // If no other element is connected to this element, add it as a standalone element + groupedIndexes.Add(new HashSet() { e }); + indexDone.Add(e); + } + continue; + } + + bool isDoneC = indexDone.Contains(e); + bool isDoneI = indexDone.Contains(index); + if (isDoneC || isDoneI) + { + if (isDoneC && !isDoneI) + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(e))) + { + pair.Add(index); + } + indexDone.Add(index); + } + else if (!isDoneC && isDoneI) + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(index))) + { + pair.Add(e); + } + indexDone.Add(e); + } + else // isDoneC && isDoneI + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(index))) + { + if (!pair.Contains(e)) pair.Add(e); + } + + foreach (var pair in groupedIndexes.Where(x => x.Contains(e))) + { + if (!pair.Contains(index)) pair.Add(index); + } + } + } + else + { + groupedIndexes.Add(new HashSet() { e, index }); + indexDone.Add(e); + indexDone.Add(index); + } + } + + // Check that all elements are done + if (elements.Length != indexDone.Count) + { + throw new Exception("ClusteringAlgorithms.GetNNGroupedIndexes(): Some elements were not done."); + } + + // 3. 
Merge groups that have indexes in common + // Check if duplicates (if duplicates, then same index in different groups) + if (indexDone.Count != groupedIndexes.SelectMany(x => x).Count()) + { + for (int e = 0; e < elements.Length; e++) + { + List> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList(); + int count = candidates.Count(); + if (count < 2) continue; // Only one group with this index + + HashSet merged = candidates.First(); + groupedIndexes.Remove(merged); + for (int i = 1; i < count; i++) + { + var current = candidates.ElementAt(i); + merged.UnionWith(current); + groupedIndexes.Remove(current); + } + groupedIndexes.Add(merged); + } + } + + return groupedIndexes; + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs index 2b06eea9..8921392f 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs @@ -47,6 +47,39 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y)); } + /// + /// The angle in degrees between the horizontal axis and the line between two points. + /// + /// The first point. + /// The second point. + /// + public static double Angle(PdfPoint point1, PdfPoint point2) + { + return Math.Atan2((float)(point2.Y - point1.Y), (float)(point2.X - point1.X)) * 180.0 / Math.PI; + } + + /// + /// The absolute distance between the Y coordinates of two points. + /// + /// The first point. + /// The second point. + /// + public static double Vertical(PdfPoint point1, PdfPoint point2) + { + return Math.Abs((double)(point2.Y - point1.Y)); + } + + /// + /// The absolute distance between the X coordinates of two points. + /// + /// The first point. + /// The second point. 
+ /// + public static double Horizontal(PdfPoint point1, PdfPoint point2) + { + return Math.Abs((double)(point2.X - point1.X)); + } + /// /// Find the nearest point. /// diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs new file mode 100644 index 00000000..3ced0778 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs @@ -0,0 +1,212 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// The Docstrum algorithm is a bottom-up page segmentation technique based on nearest-neighborhood + /// clustering of connected components extracted from the document. + /// This implementation leverages bounding boxes and does not exactly replicate the original algorithm. + /// See 'The document spectrum for page layout analysis.' by L. O’Gorman. + /// + public class DocstrumBB : IPageSegmenter + { + /// + /// Create an instance of Docstrum for bounding boxes page segmenter, . + /// + public static DocstrumBB Instance { get; } = new DocstrumBB(); + + /// + /// Get the blocks. + /// Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMultiplier = 1.3. + /// + /// + /// + public IReadOnlyList GetBlocks(IEnumerable pageWords) + { + return GetBlocks(pageWords, -30, 30, -135, -45, 1.3); + } + + /// + /// Get the blocks. See original paper for more information. + /// + /// + /// Within-line lower bound angle. + /// Within-line upper bound angle. + /// Between-line lower bound angle. + /// Between-line upper bound angle. + /// Multiplier that gives the maximum perpendicular distance between + /// text lines for blocking. Maximum distance will be this number times the between-line + /// distance found by the analysis. 
+ /// + public IReadOnlyList GetBlocks(IEnumerable pageWords, double wlAngleLB, double wlAngleUB, + double blAngleLB, double blAngleUB, double blMultiplier) + { + var pageWordsArr = pageWords.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToArray(); // remove white spaces + + var withinLineDistList = new ConcurrentBag(); + var betweenLineDistList = new ConcurrentBag(); + + // 1. Estimate in line and between line spacing + Parallel.For(0, pageWordsArr.Length, i => + { + var word = pageWordsArr[i]; + + // Within-line distance + var pointWL = GetNearestPointData(pageWordsArr, word, + bb => bb.BottomRight, bb => bb.BottomRight, + bb => bb.BottomLeft, bb => bb.BottomLeft, + wlAngleLB, wlAngleUB, Distances.Horizontal); + if (pointWL != null) withinLineDistList.Add(pointWL); + + // Between-line distance + var pointBL = GetNearestPointData(pageWordsArr, word, + bb => bb.BottomLeft, bb => bb.Centroid, + bb => bb.TopLeft, bb => bb.Centroid, + blAngleLB, blAngleUB, Distances.Vertical); + if (pointBL != null) betweenLineDistList.Add(pointBL); + }); + + double withinLineDistance = GetPeakAverageDistance(withinLineDistList); + double betweenLineDistance = GetPeakAverageDistance(betweenLineDistList); + + // 2. Find lines of text + double maxDistWL = Math.Min(3 * withinLineDistance, Math.Sqrt(2) * betweenLineDistance); + var lines = GetLines(pageWordsArr, maxDistWL).ToArray(); + + // 3. Find blocks of text + double maxDistBL = blMultiplier * betweenLineDistance; + return GetLinesGroups(lines, maxDistBL).ToList(); + } + + /// + /// Get information on the nearest point, filtered for angle. 
+ /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + private double[] GetNearestPointData(Word[] words, Word pivot, Func funcPivotDist, Func funcPivotAngle, + Func funcPointsDist, Func funcPointsAngle, + double angleStart, double angleEnd, + Func finalDistMEasure) + { + var pointR = funcPivotDist(pivot.BoundingBox); + var filtered = words.Where(w => + { + var angleWL = Distances.Angle(funcPivotAngle(pivot.BoundingBox), funcPointsAngle(w.BoundingBox)); + return (angleWL >= angleStart && angleWL <= angleEnd); + }).ToList(); + filtered.Remove(pivot); // remove itself + + if (filtered.Count > 0) + { + int index = pointR.FindIndexNearest( + filtered.Select(w => funcPointsDist(w.BoundingBox)).ToList(), + Distances.Euclidean, out double distWL); + + if (index >= 0) + { + var matchWL = filtered[index]; + return new double[] + { + (double)pivot.Letters.Select(l => l.FontSize).Mode(), + finalDistMEasure(pointR, funcPointsDist(matchWL.BoundingBox)) + }; + } + } + return null; + } + + /// + /// Build lines via transitive closure. 
+ /// + /// + /// + /// + private IEnumerable GetLines(Word[] words, double maxDist) + { + TextDirection textDirection = words[0].TextDirection; + var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean, + (w1, w2) => maxDist, + w => w.BoundingBox.BottomRight, w => w.BoundingBox.BottomLeft, + w => true, + (w1, w2) => + { + var angleWL = Distances.Angle(w1.BoundingBox.BottomRight, w2.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle + return (angleWL >= -30 && angleWL <= 30); + }).ToList(); + + Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList(); + if (textDirection == TextDirection.Rotate180) + { + orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList(); + } + else if (textDirection == TextDirection.Rotate90) + { + orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList(); + } + else if (textDirection == TextDirection.Rotate270) + { + orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList(); + } + + for (int a = 0; a < groupedIndexes.Count(); a++) + { + yield return new TextLine(orderFunc(groupedIndexes[a].Select(i => words[i]))); + } + } + + /// + /// Build blocks via transitive closure. + /// + /// + /// + /// + private IEnumerable GetLinesGroups(TextLine[] lines, double maxDist) + { + var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines, Distances.Euclidean, + (l1, l2) => maxDist, + l => l.BoundingBox.TopLeft, l => l.BoundingBox.BottomLeft, + l => true, (l1, l2) => true).ToList(); + + for (int a = 0; a < groupedIndexes.Count(); a++) + { + yield return new TextBlock(groupedIndexes[a].Select(i => lines[i]).ToList()); + } + } + + /// + /// Get the average distance value of the peak bucket of the histogram. 
+ /// + /// array[0]=font size, array[1]=distance + /// + private double GetPeakAverageDistance(IEnumerable values) + { + int max = (int)values.Max(x => x[1]) + 1; + int[] distrib = new int[max]; + + // Create histogram with buckets of size 1. + for (int i = 0; i < max; i++) + { + distrib[i] = values.Where(x => x[1] > i && x[1] <= i + 1).Count(); + } + + var peakIndex = Array.IndexOf(distrib, distrib.Max()); + + return values.Where(v => v[1] > peakIndex && v[1] <= peakIndex + 1).Average(x => x[1]); + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs new file mode 100644 index 00000000..27511786 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs @@ -0,0 +1,19 @@ +using System.Collections.Generic; +using UglyToad.PdfPig.Content; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Page segmentation divides a page into areas, each consisting of a layout structure (blocks, lines, etc.). + /// See 'Performance Comparison of Six Algorithms for Page Segmentation' by Faisal Shafait, Daniel Keysers, and Thomas M. Breuel. + /// + public interface IPageSegmenter + { + /// + /// Get the text blocks. + /// + /// The words to generate text blocks for. + /// A list of text blocks from this approach. 
+ IReadOnlyList GetBlocks(IEnumerable pageWords); + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs index 34455cda..3efb19be 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs @@ -1,7 +1,6 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Threading.Tasks; using UglyToad.PdfPig.Content; using UglyToad.PdfPig.Geometry; using UglyToad.PdfPig.Util; @@ -71,7 +70,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height. /// The distance measure between two start and end base line points, /// e.g. the Manhattan distance. - private static List GetWords(IEnumerable pageLetters, + private List GetWords(IEnumerable pageLetters, Func metric, Func distMeasure) { if (pageLetters == null || pageLetters.Count() == 0) return new List(); @@ -97,116 +96,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } Letter[] letters = pageLetters.ToArray(); - int lettersCount = letters.Length; - List startBaseLines = letters.Select(x => x.StartBaseLine).ToList(); - int[] indexes = Enumerable.Repeat((int)-1, lettersCount).ToArray(); - - // Find nearest neighbours indexes - Parallel.For(0, lettersCount, c => - { - var currentLetter = letters[c]; - // only check neighbours if not a white space - if (!string.IsNullOrWhiteSpace(currentLetter.Value)) - { - int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist); - var pairedLetter = letters[index]; - - if (!string.IsNullOrWhiteSpace(pairedLetter.Value) && - string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase)) - { - decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * 
0.60m; - if ((decimal)dist < minDist) - { - indexes[c] = index; - } - } - } - }); - - // Group indexes - List> groupedIndexes = new List>(); - List indexDone = new List(); - for (int c = 0; c < lettersCount; c++) - { - int i = indexes[c]; - if (i == -1) continue; - - bool isDoneC = indexDone.Contains(c); - bool isDoneI = indexDone.Contains(i); - if (isDoneC || isDoneI) - { - if (isDoneC && !isDoneI) - { - foreach (var pair in groupedIndexes.Where(x => x.Contains(c))) - { - pair.Add(i); - } - indexDone.Add(i); - } - else if (!isDoneC && isDoneI) - { - foreach (var pair in groupedIndexes.Where(x => x.Contains(i))) - { - pair.Add(c); - } - indexDone.Add(c); - } - else - { - foreach (var pair in groupedIndexes.Where(x => x.Contains(i))) - { - if (!pair.Contains(c)) pair.Add(c); - } - - foreach (var pair in groupedIndexes.Where(x => x.Contains(c))) - { - if (!pair.Contains(i)) pair.Add(i); - } - } - } - else - { - List pair = new List() { c, i }; - groupedIndexes.Add(pair); - indexDone.AddRange(pair); - } - } - - // Merge lists with common index - for (int c = 0; c < lettersCount; c++) - { - List> candidates = groupedIndexes.Where(x => x.Any(t => t == c)).ToList(); - if (candidates.Count < 2) continue; // only one group with this index - - List merged = candidates.First(); - groupedIndexes.Remove(merged); - for (int i = 1; i < candidates.Count; i++) - { - var current = candidates[i]; - merged = merged.Union(current).ToList(); - groupedIndexes.Remove(current); - } - groupedIndexes.Add(merged); - } + var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(letters, + distMeasure, + (l1, l2) => Math.Max((double)metric(l1), (double)metric(l2)) * 0.60, + l => l.EndBaseLine, l => l.StartBaseLine, + l => !string.IsNullOrWhiteSpace(l.Value), + (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value)).ToList(); List words = new List(); for (int a = 0; a < groupedIndexes.Count(); a++) { - List 
groupedLetters = new List(); - foreach (int s in groupedIndexes[a]) - { - groupedLetters.Add(letters[s]); - } - - words.Add(new Word(orderFunc(groupedLetters))); - } - - List indexesNotDone = Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)).ToList(); - for (int n = 0; n < indexesNotDone.Count(); n++) - { - Letter letter = letters[indexesNotDone[n]]; - words.Add(new Word(new Letter[] { letter })); + words.Add(new Word(orderFunc(groupedIndexes[a].Select(i => letters[i])))); } return words; diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs index ecaa610b..16258f1f 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs @@ -11,14 +11,31 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut /// See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips /// - public static class RecursiveXYCut + public class RecursiveXYCut : IPageSegmenter { + /// + /// Create an instance of Recursive X-Y Cut page segmenter, . + /// + public static RecursiveXYCut Instance { get; } = new RecursiveXYCut(); + /// /// Get the blocks. + /// Uses 'minimumWidth' = 0, 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height) + /// + /// The words in the page. + /// + public IReadOnlyList GetBlocks(IEnumerable pageWords) + { + return GetBlocks(pageWords, 0); + } + + /// + /// Get the blocks. + /// Uses 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height) /// /// The words in the page. /// The minimum width for a block. 
- public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth = 0) + public IReadOnlyList GetBlocks(IEnumerable pageWords, decimal minimumWidth) { return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5m, 3)); } @@ -30,7 +47,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The minimum width for a block. /// The dominant font width. /// The dominant font height. - public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth, + public IReadOnlyList GetBlocks(IEnumerable pageWords, decimal minimumWidth, decimal dominantFontWidth, decimal dominantFontHeight) { return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight); @@ -43,15 +60,24 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The minimum width for a block. /// The function that determines the dominant font width. /// The function that determines the dominant font height. - public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth, + public IReadOnlyList GetBlocks(IEnumerable pageWords, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc) { - var root = new XYLeaf(pageWords); // Create a root node. - return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc); + XYLeaf root = new XYLeaf(pageWords); // Create a root node. 
+ XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc); + + var leafs = node.GetLeafs(); + + if (leafs.Count > 0) + { + return leafs.Select(l => new TextBlock(l.GetLines())).ToList(); + } + + return new List(); } - private static XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth, + private XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc, int level = 0) { @@ -144,7 +170,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return new XYNode(newNodes); } - private static XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth, + private XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc, int level = 0) { diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs index 9dab8daf..a5970693 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs @@ -9,7 +9,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// A Leaf node used in the algorithm, i.e. a block. /// - public class XYLeaf : XYNode + internal class XYLeaf : XYNode { /// /// Returns true if this node is a leaf, false otherwise. diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs index db423c55..9bac97fb 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs @@ -8,7 +8,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// A Node used in the algorithm. /// - public class XYNode + internal class XYNode { /// /// Returns true if this node is a leaf, false otherwise.