diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 19461919..b85221ce 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -54,6 +54,7 @@
"UglyToad.PdfPig.Content.PageSize",
"UglyToad.PdfPig.Content.Word",
"UglyToad.PdfPig.Content.TextLine",
+ "UglyToad.PdfPig.Content.TextBlock",
"UglyToad.PdfPig.Content.TextDirection",
"UglyToad.PdfPig.Content.XmpMetadata",
"UglyToad.PdfPig.Core.TransformationMatrix",
@@ -61,11 +62,11 @@
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
"UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBB",
+ "UglyToad.PdfPig.DocumentLayoutAnalysis.IPageSegmenter",
"UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
"UglyToad.PdfPig.DocumentLayoutAnalysis.NearestNeighbourWordExtractor",
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
- "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
- "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
"UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
diff --git a/src/UglyToad.PdfPig/Content/TextBlock.cs b/src/UglyToad.PdfPig/Content/TextBlock.cs
new file mode 100644
index 00000000..85c10b63
--- /dev/null
+++ b/src/UglyToad.PdfPig/Content/TextBlock.cs
@@ -0,0 +1,68 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.Content
+{
+ /// <summary>
+ /// A block of text, made up of one or more text lines.
+ /// </summary>
+ public class TextBlock
+ {
+ /// <summary>
+ /// The text of the block: the text of each line, in the order given, joined by a single space.
+ /// </summary>
+ public string Text { get; }
+
+ /// <summary>
+ /// The text direction of the block. Taken from the first line only.
+ /// </summary>
+ public TextDirection TextDirection { get; }
+
+ /// <summary>
+ /// The rectangle completely containing the block.
+ /// </summary>
+ public PdfRectangle BoundingBox { get; }
+
+ /// <summary>
+ /// The text lines contained in the block.
+ /// </summary>
+ public IReadOnlyList TextLines { get; }
+
+ /// <summary>
+ /// Create a new <see cref="TextBlock"/>.
+ /// </summary>
+ /// <param name="lines">The lines of the block. Null throws ArgumentNullException; an empty list throws ArgumentException.</param>
+ public TextBlock(IReadOnlyList lines)
+ {
+ if (lines == null)
+ {
+ throw new ArgumentNullException(nameof(lines));
+ }
+
+ if (lines.Count == 0)
+ {
+ throw new ArgumentException("Empty lines provided.", nameof(lines));
+ }
+
+ TextLines = lines;
+
+ Text = string.Join(" ", lines.Select(x => x.Text));
+ // The block's box is built from the extreme edges of all the line boxes.
+ var minX = lines.Min(x => x.BoundingBox.Left);
+ var minY = lines.Min(x => x.BoundingBox.Bottom);
+ var maxX = lines.Max(x => x.BoundingBox.Right);
+ var maxY = lines.Max(x => x.BoundingBox.Top);
+ BoundingBox = new PdfRectangle(minX, minY, maxX, maxY);
+ // Only the first line is consulted; lines with a different direction are not detected.
+ TextDirection = lines[0].TextDirection;
+ }
+
+ /// <inheritdoc />
+ public override string ToString()
+ {
+ return Text;
+ }
+ }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
new file mode 100644
index 00000000..4e9e6182
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/ClusteringAlgorithms.cs
@@ -0,0 +1,243 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+ ///
+ /// Clustering Algorithms.
+ ///
+ internal class ClusteringAlgorithms
+ {
+ /// <summary>
+ /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
+ /// https://en.wikipedia.org/wiki/Transitive_closure
+ /// </summary>
+ /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
+ /// <param name="elements">Array of elements to group.</param>
+ /// <param name="distMeasure">The distance measure between two points.</param>
+ /// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
+ /// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
+ /// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
+ /// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
+ /// <param name="filterFinal">Filter applied to the pivot and its nearest neighbour. If false, the pair is rejected, e.g. require that both have the same font.</param>
+ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
+ Func<PdfPoint, PdfPoint, double> distMeasure,
+ Func<T, T, double> maxDistanceFunction,
+ Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
+ Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
+ {
+ /*************************************************************************************
+ * Algorithm steps
+ * 1. Find nearest neighbours indexes (done in parallel)
+ * Iterate every point (pivot) and put its nearest neighbour's index in an array
+ * e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
+ * Only considers a neighbour if it is within the maximum distance.
+ * If not within the maximum distance, index will be set to -1.
+ * NB: Given the possible asymmetry in the relationship, it is possible
+ * that if indexes[i] = j then indexes[j] != i.
+ *
+ * 2. Group indexes
+ * Group indexes if share neighbours in common - Transitive closure
+ * e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
+ * (i,j,k) will form a group and (m,n) will form another group.
+ *
+ * 3. Merge groups that have indexes in common - If any
+ * If there are group with indexes in common, merge them.
+ * (Could be improved and put in step 2)
+ *************************************************************************************/
+
+ int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();
+ var candidatesPoints = elements.Select(x => candidatesPoint(x)).ToList();
+
+ // 1. Find nearest neighbours indexes (each iteration writes only its own slot of 'indexes')
+ Parallel.For(0, elements.Length, e =>
+ {
+ var pivot = elements[e];
+
+ if (filterPivot(pivot))
+ {
+ int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);
+ var paired = index >= 0 ? elements[index] : default(T); // a negative index (no neighbour reported) must never index 'elements'
+
+ if (index >= 0 && filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
+ {
+ indexes[e] = index;
+ }
+ }
+ });
+
+ // 2. Group indexes
+ // 3. Merge groups that have indexes in common
+ var groupedIndexes = GroupMergeIndexes(indexes);
+
+ return groupedIndexes;
+ }
+
+ /// <summary>
+ /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
+ /// https://en.wikipedia.org/wiki/Transitive_closure
+ /// </summary>
+ /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
+ /// <param name="elements">Array of elements to group.</param>
+ /// <param name="distMeasure">The distance measure between two lines.</param>
+ /// <param name="maxDistanceFunction">The function that determines the maximum distance between two lines in the same cluster.</param>
+ /// <param name="pivotLine">The pivot's line to use for pairing.</param>
+ /// <param name="candidatesLine">The candidates' line to use for pairing.</param>
+ /// <param name="filterPivot">Filter to apply to the pivot element. If false, element will not be paired at all, e.g. is white space.</param>
+ /// <param name="filterFinal">Filter applied to the pivot and its nearest neighbour. If false, the pair is rejected, e.g. require that both have the same font.</param>
+ internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
+ Func<PdfLine, PdfLine, double> distMeasure,
+ Func<T, T, double> maxDistanceFunction,
+ Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
+ Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
+ {
+ /*************************************************************************************
+ * Algorithm steps
+ * 1. Find nearest neighbours indexes (done in parallel)
+ * Iterate every line (pivot) and put its nearest neighbour's index in an array
+ * e.g. if nearest neighbour of line i is line j, then indexes[i] = j.
+ * Only considers a neighbour if it is within the maximum distance.
+ * If not within the maximum distance, index will be set to -1.
+ * NB: Given the possible asymmetry in the relationship, it is possible
+ * that if indexes[i] = j then indexes[j] != i.
+ *
+ * 2. Group indexes
+ * Group indexes if share neighbours in common - Transitive closure
+ * e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
+ * (i,j,k) will form a group and (m,n) will form another group.
+ *
+ * 3. Merge groups that have indexes in common - If any
+ * If there are group with indexes in common, merge them.
+ * (Could be improved and put in step 2)
+ *************************************************************************************/
+
+ int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();
+ var candidatesLines = elements.Select(x => candidatesLine(x)).ToList();
+
+ // 1. Find nearest neighbours indexes (each iteration writes only its own slot of 'indexes')
+ Parallel.For(0, elements.Length, e =>
+ {
+ var pivot = elements[e];
+
+ if (filterPivot(pivot))
+ {
+ int index = pivotLine(pivot).FindIndexNearest(candidatesLines, distMeasure, out double dist);
+ var paired = index >= 0 ? elements[index] : default(T); // FindIndexNearest yields -1 when no distance is below double.MaxValue
+
+ if (index >= 0 && filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
+ {
+ indexes[e] = index;
+ }
+ }
+ });
+
+ // 2. Group indexes
+ // 3. Merge groups that have indexes in common
+ var groupedIndexes = GroupMergeIndexes(indexes);
+
+ return groupedIndexes;
+ }
+
+ /// <summary>
+ /// Group elements via transitive closure.
+ /// https://en.wikipedia.org/wiki/Transitive_closure
+ /// </summary>
+ /// <param name="indexes">Array of paired elements index: indexes[e] is the element paired with e, or -1 if e has no pair.</param>
+ /// <returns>Disjoint groups of element indexes; every index of the input appears in exactly one group.</returns>
+ internal static List<HashSet<int>> GroupMergeIndexes(int[] indexes)
+ {
+ // 2. Group indexes
+ List<HashSet<int>> groupedIndexes = new List<HashSet<int>>();
+ HashSet<int> indexDone = new HashSet<int>();
+ HashSet<int> referenced = new HashSet<int>(indexes); // every index some element points at; replaces a linear scan per element
+ for (int e = 0; e < indexes.Length; e++)
+ {
+ int index = indexes[e];
+
+ if (index == -1) // This element is not connected
+ {
+ // Check if another element's index is connected to this element (nb: distance measure is asymmetric)
+ if (!referenced.Contains(e))
+ {
+ // If no other element is connected to this element, add it as a standalone element
+ groupedIndexes.Add(new HashSet<int>() { e });
+ indexDone.Add(e);
+ }
+ continue; // otherwise it joins a group when the element pointing at it is processed
+ }
+
+ bool isDoneC = indexDone.Contains(e);
+ bool isDoneI = indexDone.Contains(index);
+ if (isDoneC || isDoneI)
+ {
+ if (isDoneC && !isDoneI)
+ {
+ foreach (var pair in groupedIndexes.Where(x => x.Contains(e)))
+ {
+ pair.Add(index);
+ }
+ indexDone.Add(index);
+ }
+ else if (!isDoneC && isDoneI)
+ {
+ foreach (var pair in groupedIndexes.Where(x => x.Contains(index)))
+ {
+ pair.Add(e);
+ }
+ indexDone.Add(e);
+ }
+ else // isDoneC && isDoneI
+ {
+ foreach (var pair in groupedIndexes.Where(x => x.Contains(index)))
+ {
+ if (!pair.Contains(e)) pair.Add(e);
+ }
+
+ foreach (var pair in groupedIndexes.Where(x => x.Contains(e)))
+ {
+ if (!pair.Contains(index)) pair.Add(index);
+ }
+ }
+ }
+ else
+ {
+ groupedIndexes.Add(new HashSet<int>() { e, index });
+ indexDone.Add(e);
+ indexDone.Add(index);
+ }
+ }
+
+ // Check that all elements are done
+ if (indexes.Length != indexDone.Count)
+ {
+ throw new InvalidOperationException("ClusteringAlgorithms.GroupMergeIndexes(): Some elements were not done.");
+ }
+
+ // 3. Merge groups that have indexes in common
+ // Check if duplicates (if duplicates, then same index in different groups)
+ if (indexDone.Count != groupedIndexes.SelectMany(x => x).Count())
+ {
+ for (int e = 0; e < indexes.Length; e++)
+ {
+ List<HashSet<int>> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList();
+ int count = candidates.Count;
+ if (count < 2) continue; // Only one group with this index
+
+ HashSet<int> merged = candidates[0];
+ groupedIndexes.Remove(merged);
+ for (int i = 1; i < count; i++)
+ {
+ var current = candidates[i];
+ merged.UnionWith(current);
+ groupedIndexes.Remove(current);
+ }
+ groupedIndexes.Add(merged);
+ }
+ }
+ return groupedIndexes;
+ }
+ }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
index 2b06eea9..f099c175 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
@@ -47,13 +47,46 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y));
}
+ /// <summary>
+ /// The angle in degrees between the horizontal axis and the line between two points.
+ /// </summary>
+ /// <param name="point1">The first point.</param>
+ /// <param name="point2">The second point.</param>
+ /// <returns>The signed angle in degrees, from -180 to 180, of the direction going from point1 to point2.</returns>
+ public static double Angle(PdfPoint point1, PdfPoint point2)
+ {
+ return Math.Atan2((double)(point2.Y - point1.Y), (double)(point2.X - point1.X)) * 180.0 / Math.PI;
+ }
+
+ /// <summary>
+ /// The vertical distance between two points, i.e. the absolute difference of their Y coordinates.
+ /// </summary>
+ /// <param name="point1">The first point.</param>
+ /// <param name="point2">The second point.</param>
+ /// <returns>The absolute value of the difference between the two Y coordinates.</returns>
+ public static double Vertical(PdfPoint point1, PdfPoint point2)
+ {
+ return Math.Abs((double)(point1.Y - point2.Y));
+ }
+
+ /// <summary>
+ /// The horizontal distance between two points, i.e. the absolute difference of their X coordinates.
+ /// </summary>
+ /// <param name="point1">The first point.</param>
+ /// <param name="point2">The second point.</param>
+ /// <returns>The absolute value of the difference between the two X coordinates.</returns>
+ public static double Horizontal(PdfPoint point1, PdfPoint point2)
+ {
+ return Math.Abs((double)(point1.X - point2.X));
+ }
+
///
/// Find the nearest point.
///
/// The reference point, for which to find the nearest neighbour.
/// The list of neighbours candidates.
/// The distance measure to use.
- /// The distance between reference point, and its nearest neighbour
+ /// The distance between reference point, and its nearest neighbour.
public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList points,
Func distanceMeasure, out double distance)
{
@@ -89,7 +122,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// The reference point, for which to find the nearest neighbour.
/// The list of neighbours candidates.
/// The distance measure to use.
- /// The distance between reference point, and its nearest neighbour
+ /// The distance between reference point, and its nearest neighbour.
public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList points,
Func distanceMeasure, out double distance)
{
@@ -118,5 +151,41 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
return closestPointIndex;
}
+
+ /// <summary>
+ /// Find the index of the nearest line. Returns -1 when no candidate measures strictly below double.MaxValue.
+ /// </summary>
+ /// <param name="pdfLine">The reference line, for which to find the nearest neighbour.</param>
+ /// <param name="lines">The list of neighbours candidates. Must not be null or empty (ArgumentException).</param>
+ /// <param name="distanceMeasure">The distance measure between two lines to use; called as (candidate, reference). Must not be null (ArgumentException).</param>
+ /// <param name="distance">The distance between reference line, and its nearest neighbour; double.MaxValue when -1 is returned.</param>
+ public static int FindIndexNearest(this PdfLine pdfLine, IReadOnlyList lines,
+ Func distanceMeasure, out double distance)
+ {
+ if (lines == null || lines.Count == 0)
+ {
+ throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", "lines");
+ }
+
+ if (distanceMeasure == null)
+ {
+ throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", "distanceMeasure");
+ }
+ // Sentinels: both survive the loop unchanged if no candidate beats double.MaxValue.
+ distance = double.MaxValue;
+ int closestLineIndex = -1;
+ // Linear scan; the strict '<' keeps the first of several equally near candidates.
+ for (var i = 0; i < lines.Count; i++)
+ {
+ double currentDistance = distanceMeasure(lines[i], pdfLine);
+ if (currentDistance < distance)
+ {
+ distance = currentDistance;
+ closestLineIndex = i;
+ }
+ }
+
+ return closestLineIndex;
+ }
}
}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
new file mode 100644
index 00000000..b2fca5c3
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
@@ -0,0 +1,287 @@
+using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+ ///
+ /// The Docstrum algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
+ /// clustering of connected components extracted from the document.
+ /// This implementation leverages bounding boxes and does not exactly replicate the original algorithm.
+ /// See 'The document spectrum for page layout analysis.' by L. O’Gorman.
+ ///
+ public class DocstrumBB : IPageSegmenter
+ {
+ /// <summary>
+ /// A ready-made instance of the Docstrum for bounding boxes page segmenter, <see cref="DocstrumBB"/>.
+ /// </summary>
+ public static DocstrumBB Instance { get; } = new DocstrumBB();
+
+ /// <summary>
+ /// Get the blocks.
+ /// Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMultiplier = 1.3.
+ /// </summary>
+ /// <param name="pageWords">The words to segment into blocks.</param>
+ /// <returns>The text blocks found for the words.</returns>
+ public IReadOnlyList GetBlocks(IEnumerable pageWords)
+ {
+ return GetBlocks(pageWords, -30, 30, -135, -45, 1.3);
+ }
+
+ /// <summary>
+ /// Get the blocks. See original paper for more information. Angles are in degrees.
+ /// </summary>
+ /// <param name="pageWords">The words to segment. Words whose text is null or white space are ignored.</param>
+ /// <param name="wlAngleLB">Within-line lower bound angle.</param>
+ /// <param name="wlAngleUB">Within-line upper bound angle.</param>
+ /// <param name="blAngleLB">Between-line lower bound angle.</param>
+ /// <param name="blAngleUB">Between-line upper bound angle.</param>
+ /// <param name="blMultiplier">Multiplier that gives the maximum perpendicular distance between
+ /// text lines for blocking. Maximum distance will be this number times the between-line
+ /// distance found by the analysis.</param>
+ /// <returns>The text blocks; an empty list when no non-blank word is supplied.</returns>
+ public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double wlAngleLB, double wlAngleUB,
+ double blAngleLB, double blAngleUB, double blMultiplier)
+ {
+ var pageWordsArr = pageWords.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToArray(); // remove white spaces
+ if (pageWordsArr.Length == 0) return new List<TextBlock>(); // nothing to segment; the spacing estimates below need data
+ var withinLineDistList = new ConcurrentBag<double[]>();
+ var betweenLineDistList = new ConcurrentBag<double[]>();
+
+ // 1. Estimate in line and between line spacing
+ Parallel.For(0, pageWordsArr.Length, i =>
+ {
+ var word = pageWordsArr[i];
+
+ // Within-line distance
+ var pointWL = GetNearestPointData(pageWordsArr, word,
+ bb => bb.BottomRight, bb => bb.BottomRight,
+ bb => bb.BottomLeft, bb => bb.BottomLeft,
+ wlAngleLB, wlAngleUB, Distances.Horizontal);
+ if (pointWL != null) withinLineDistList.Add(pointWL);
+
+ // Between-line distance
+ var pointBL = GetNearestPointData(pageWordsArr, word,
+ bb => bb.BottomLeft, bb => bb.Centroid,
+ bb => bb.TopLeft, bb => bb.Centroid,
+ blAngleLB, blAngleUB, Distances.Vertical);
+ if (pointBL != null) betweenLineDistList.Add(pointBL);
+ });
+
+ double withinLineDistance = GetPeakAverageDistance(withinLineDistList);
+ double betweenLineDistance = GetPeakAverageDistance(betweenLineDistList);
+
+ // 2. Find lines of text
+ double maxDistWL = Math.Min(3 * withinLineDistance, Math.Sqrt(2) * betweenLineDistance);
+ var lines = GetLines(pageWordsArr, maxDistWL, wlAngleLB, wlAngleUB).ToArray();
+
+ // 3. Find blocks of text
+ double maxDistBL = blMultiplier * betweenLineDistance;
+ var blocks = GetLinesGroups(lines, maxDistBL).ToList();
+
+ // 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
+ for (int b = 0; b < blocks.Count; b++)
+ {
+ if (blocks[b] == null) continue; // already merged into an earlier block
+
+ for (int c = 0; c < blocks.Count; c++)
+ {
+ if (b == c) continue;
+ if (blocks[c] == null) continue;
+
+ if (AreRectangleOverlapping(blocks[b].BoundingBox, blocks[c].BoundingBox))
+ {
+ // Merge
+ // 1. Merge all words
+ var mergedWords = new List<Word>(blocks[b].TextLines.SelectMany(l => l.Words));
+ mergedWords.AddRange(blocks[c].TextLines.SelectMany(l => l.Words));
+
+ // 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
+ // same block. Filtering will still be done based on angle. Argument order is (words, maxDist, lower, upper).
+ var mergedLines = GetLines(mergedWords.ToArray(), double.MaxValue, wlAngleLB, wlAngleUB);
+ blocks[b] = new TextBlock(mergedLines.ToList());
+
+ // Remove
+ blocks[c] = null;
+ }
+ }
+ }
+
+ return blocks.Where(b => b != null).ToList();
+ }
+
+ private bool AreRectangleOverlapping(PdfRectangle rectangle1, PdfRectangle rectangle2)
+ {
+ bool apartHorizontally = rectangle1.Left > rectangle2.Right || rectangle2.Left > rectangle1.Right; // one lies fully left of the other
+ bool apartVertically = rectangle1.Top < rectangle2.Bottom || rectangle2.Top < rectangle1.Bottom; // one lies fully below the other
+ return !(apartHorizontally || apartVertically); // touching edges count as overlapping
+ }
+
+ /// <summary>
+ /// Get information on the nearest point, filtered for angle.
+ /// </summary>
+ /// <param name="words">The candidate words.</param>
+ /// <param name="pivot">The word for which a neighbour is searched.</param>
+ /// <param name="funcPivotDist">Picks, from the pivot's bounding box, the point used for distances.</param>
+ /// <param name="funcPivotAngle">Picks, from the pivot's bounding box, the point used for the angle filter.</param>
+ /// <param name="funcPointsDist">Picks, from a candidate's bounding box, the point used for distances.</param>
+ /// <param name="funcPointsAngle">Picks, from a candidate's bounding box, the point used for the angle filter.</param>
+ /// <param name="angleStart">Lowest accepted angle, inclusive.</param>
+ /// <param name="angleEnd">Highest accepted angle, inclusive.</param>
+ /// <param name="finalDistMEasure">Measure used for the reported distance (the nearest candidate itself is chosen with the Euclidean distance).</param>
+ /// <returns>array[0]=mode of the pivot's letters font size, array[1]=distance to the match; null if no candidate passes the angle filter or none is found.</returns>
+ private double[] GetNearestPointData(Word[] words, Word pivot, Func funcPivotDist, Func funcPivotAngle,
+ Func funcPointsDist, Func funcPointsAngle,
+ double angleStart, double angleEnd,
+ Func finalDistMEasure)
+ {
+ var pointR = funcPivotDist(pivot.BoundingBox);
+
+ // Filter by angle: keep the words whose angle from the pivot is within [angleStart, angleEnd]
+ var filtered = words.Where(w =>
+ {
+ var angleWL = Distances.Angle(funcPivotAngle(pivot.BoundingBox), funcPointsAngle(w.BoundingBox));
+ return (angleWL >= angleStart && angleWL <= angleEnd);
+ }).ToList();
+ filtered.Remove(pivot); // remove itself, so the pivot cannot be its own neighbour
+
+ if (filtered.Count > 0)
+ {
+ int index = pointR.FindIndexNearest(
+ filtered.Select(w => funcPointsDist(w.BoundingBox)).ToList(),
+ Distances.Euclidean, out double distWL);
+
+ if (index >= 0)
+ {
+ var matchWL = filtered[index];
+ return new double[]
+ {
+ (double)pivot.Letters.Select(l => l.FontSize).Mode(),
+ finalDistMEasure(pointR, funcPointsDist(matchWL.BoundingBox))
+ };
+ }
+ }
+ return null;
+ }
+
+ /// <summary>
+ /// Build lines via transitive closure.
+ /// </summary>
+ /// <param name="words">The words to group into lines. The first word is read unconditionally, so the array must not be empty.</param>
+ /// <param name="maxDist">The maximum Euclidean distance between a word's bottom right point and the next word's bottom left point.</param>
+ /// <param name="wlAngleLB">Within-line lower bound angle, inclusive.</param>
+ /// <param name="wlAngleUB">Within-line upper bound angle, inclusive.</param>
+ /// <returns>One text line per group of words, produced lazily as the result is enumerated.</returns>
+ private IEnumerable GetLines(Word[] words, double maxDist, double wlAngleLB, double wlAngleUB)
+ {
+ /***************************************************************************************************
+ * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not
+ * work as the FindIndexNearest() function might pair the pivot with itself (the pivot's right point
+ * (distance = width) is closer than other words' left point).
+ * -> Solution would be to find more than one nearest neighbours. Use KDTree?
+ ***************************************************************************************************/
+ // The direction of the first word decides the ordering used for every line.
+ TextDirection textDirection = words[0].TextDirection;
+ var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean,
+ (pivot, candidate) => maxDist,
+ pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
+ pivot => true,
+ (pivot, candidate) =>
+ {
+ var angleWL = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle
+ return (angleWL >= wlAngleLB && angleWL <= wlAngleUB);
+ }).ToList();
+ // Default ordering is left to right; the branches below cover the three rotated directions.
+ Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
+ if (textDirection == TextDirection.Rotate180)
+ {
+ orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
+ }
+ else if (textDirection == TextDirection.Rotate90)
+ {
+ orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
+ }
+ else if (textDirection == TextDirection.Rotate270)
+ {
+ orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
+ }
+
+ for (int a = 0; a < groupedIndexes.Count(); a++)
+ {
+ yield return new TextLine(orderFunc(groupedIndexes[a].Select(i => words[i])));
+ }
+ }
+
+ /// <summary>
+ /// Build blocks via transitive closure.
+ /// </summary>
+ /// <param name="lines">The text lines to group into blocks.</param>
+ /// <param name="maxDist">The maximum distance between a line's bottom edge and another line's top edge for them to share a block.</param>
+ /// <returns>One text block per group of lines, produced lazily as the result is enumerated.</returns>
+ private IEnumerable GetLinesGroups(TextLine[] lines, double maxDist)
+ {
+ /**************************************************************************************************
+ * We want to measure the distance between two lines using the following method:
+ * We check if two lines are overlapping horizontally.
+ * If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area.
+ * We finally compute the Euclidean distance between these two middle points.
+ * If the two lines are not overlapping, the distance is set to the max distance.
+ *
+ * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'line Height', the algo won't
+ * work as the FindIndexNearest() function will always pair the pivot with itself (the pivot's top
+ * point (distance = height) is closer than other lines' top point).
+ * -> Solution would be to find more than one nearest neighbours. Use KDTree?
+ **************************************************************************************************/
+ // Both points share the same X, so the measure reduces to the difference of the two Y values.
+ Func euclidianOverlappingMiddleDistance = (l1, l2) =>
+ {
+ var left = Math.Max(l1.Point1.X, l2.Point1.X);
+ var d = (Math.Min(l1.Point2.X, l2.Point2.X) - left);
+
+ if (d < 0) return double.MaxValue; // not overlapping -> max distance
+
+ return Distances.Euclidean(
+ new PdfPoint(left + d / 2, l1.Point1.Y),
+ new PdfPoint(left + d / 2, l2.Point1.Y));
+ };
+ // Pivot = bottom edge of a line, candidate = top edge of a line; no extra filtering is applied.
+ var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines,
+ euclidianOverlappingMiddleDistance,
+ (pivot, candidate) => maxDist,
+ pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
+ candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
+ pivot => true, (pivot, candidate) => true).ToList();
+
+ for (int a = 0; a < groupedIndexes.Count(); a++)
+ {
+ yield return new TextBlock(groupedIndexes[a].Select(i => lines[i]).ToList());
+ }
+ }
+
+ /// <summary>
+ /// Get the average distance value of the peak bucket of the histogram.
+ /// </summary>
+ /// <param name="values">array[0]=font size, array[1]=distance. NOTE(review): an empty sequence makes Max() throw InvalidOperationException.</param>
+ /// <returns>The mean of the distances that fall in the most populated bucket; on a tie the lowest bucket wins.</returns>
+ private double GetPeakAverageDistance(IEnumerable values)
+ {
+ int max = (int)values.Max(x => x[1]) + 1;
+ int[] distrib = new int[max];
+
+ // Create histogram with buckets of size 1. Bucket i covers (i, i + 1], so a distance of exactly 0 is not counted.
+ for (int i = 0; i < max; i++)
+ {
+ distrib[i] = values.Where(x => x[1] > i && x[1] <= i + 1).Count();
+ }
+ // IndexOf returns the first bucket holding the highest count.
+ var peakIndex = Array.IndexOf(distrib, distrib.Max());
+ // NOTE(review): if every bucket is empty (e.g. all distances are 0), Average() throws on the empty selection.
+ return values.Where(v => v[1] > peakIndex && v[1] <= peakIndex + 1).Average(x => x[1]);
+ }
+ }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs
new file mode 100644
index 00000000..27511786
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs
@@ -0,0 +1,19 @@
+using System.Collections.Generic;
+using UglyToad.PdfPig.Content;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+ /// <summary>
+ /// Page segmentation divides a page into areas, each consisting of a layout structure (blocks, lines, etc.).
+ /// See 'Performance Comparison of Six Algorithms for Page Segmentation' by Faisal Shafait, Daniel Keysers, and Thomas M. Breuel.
+ /// </summary>
+ public interface IPageSegmenter
+ {
+ /// <summary>
+ /// Get the text blocks.
+ /// </summary>
+ /// <param name="pageWords">The words to generate text blocks for.</param>
+ /// <returns>A list of text blocks from this approach.</returns>
+ IReadOnlyList GetBlocks(IEnumerable pageWords);
+ }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
index 34455cda..3efb19be 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs
@@ -1,7 +1,6 @@
using System;
using System.Collections.Generic;
using System.Linq;
-using System.Threading.Tasks;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Geometry;
using UglyToad.PdfPig.Util;
@@ -71,7 +70,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.
/// The distance measure between two start and end base line points,
/// e.g. the Manhattan distance.
- private static List GetWords(IEnumerable pageLetters,
+ private List GetWords(IEnumerable pageLetters,
Func metric, Func distMeasure)
{
if (pageLetters == null || pageLetters.Count() == 0) return new List();
@@ -97,116 +96,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
}
Letter[] letters = pageLetters.ToArray();
- int lettersCount = letters.Length;
- List startBaseLines = letters.Select(x => x.StartBaseLine).ToList();
- int[] indexes = Enumerable.Repeat((int)-1, lettersCount).ToArray();
-
- // Find nearest neighbours indexes
- Parallel.For(0, lettersCount, c =>
- {
- var currentLetter = letters[c];
- // only check neighbours if not a white space
- if (!string.IsNullOrWhiteSpace(currentLetter.Value))
- {
- int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist);
- var pairedLetter = letters[index];
-
- if (!string.IsNullOrWhiteSpace(pairedLetter.Value) &&
- string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase))
- {
- decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * 0.60m;
- if ((decimal)dist < minDist)
- {
- indexes[c] = index;
- }
- }
- }
- });
-
- // Group indexes
- List> groupedIndexes = new List>();
- List indexDone = new List();
- for (int c = 0; c < lettersCount; c++)
- {
- int i = indexes[c];
- if (i == -1) continue;
-
- bool isDoneC = indexDone.Contains(c);
- bool isDoneI = indexDone.Contains(i);
- if (isDoneC || isDoneI)
- {
- if (isDoneC && !isDoneI)
- {
- foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
- {
- pair.Add(i);
- }
- indexDone.Add(i);
- }
- else if (!isDoneC && isDoneI)
- {
- foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
- {
- pair.Add(c);
- }
- indexDone.Add(c);
- }
- else
- {
- foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
- {
- if (!pair.Contains(c)) pair.Add(c);
- }
-
- foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
- {
- if (!pair.Contains(i)) pair.Add(i);
- }
- }
- }
- else
- {
- List pair = new List() { c, i };
- groupedIndexes.Add(pair);
- indexDone.AddRange(pair);
- }
- }
-
- // Merge lists with common index
- for (int c = 0; c < lettersCount; c++)
- {
- List> candidates = groupedIndexes.Where(x => x.Any(t => t == c)).ToList();
- if (candidates.Count < 2) continue; // only one group with this index
-
- List merged = candidates.First();
- groupedIndexes.Remove(merged);
- for (int i = 1; i < candidates.Count; i++)
- {
- var current = candidates[i];
- merged = merged.Union(current).ToList();
- groupedIndexes.Remove(current);
- }
- groupedIndexes.Add(merged);
- }
+ var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(letters,
+ distMeasure,
+ (l1, l2) => Math.Max((double)metric(l1), (double)metric(l2)) * 0.60,
+ l => l.EndBaseLine, l => l.StartBaseLine,
+ l => !string.IsNullOrWhiteSpace(l.Value),
+ (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value)).ToList();
List words = new List();
for (int a = 0; a < groupedIndexes.Count(); a++)
{
- List groupedLetters = new List();
- foreach (int s in groupedIndexes[a])
- {
- groupedLetters.Add(letters[s]);
- }
-
- words.Add(new Word(orderFunc(groupedLetters)));
- }
-
- List indexesNotDone = Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)).ToList();
- for (int n = 0; n < indexesNotDone.Count(); n++)
- {
- Letter letter = letters[indexesNotDone[n]];
- words.Add(new Word(new Letter[] { letter }));
+ words.Add(new Word(orderFunc(groupedIndexes[a].Select(i => letters[i]))));
}
return words;
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
index ecaa610b..16258f1f 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
@@ -11,14 +11,31 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
/// See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips
///
- public static class RecursiveXYCut
+ public class RecursiveXYCut : IPageSegmenter
{
+ ///
+ /// Create an instance of Recursive X-Y Cut page segmenter, .
+ ///
+ public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();
+
///
/// Get the blocks.
+ /// Uses 'minimumWidth' = 0, 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)
+ ///
+ /// The words in the page.
+ ///
+ public IReadOnlyList GetBlocks(IEnumerable pageWords)
+ {
+ return GetBlocks(pageWords, 0);
+ }
+
+ ///
+ /// Get the blocks.
+ /// Uses 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)
///
/// The words in the page.
/// The minimum width for a block.
- public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth = 0)
+ public IReadOnlyList GetBlocks(IEnumerable pageWords, decimal minimumWidth)
{
return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5m, 3));
}
@@ -30,7 +47,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// The minimum width for a block.
/// The dominant font width.
/// The dominant font height.
- public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth,
+ public IReadOnlyList GetBlocks(IEnumerable pageWords, decimal minimumWidth,
decimal dominantFontWidth, decimal dominantFontHeight)
{
return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight);
@@ -43,15 +60,24 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// The minimum width for a block.
/// The function that determines the dominant font width.
/// The function that determines the dominant font height.
- public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth,
+ public IReadOnlyList GetBlocks(IEnumerable pageWords, decimal minimumWidth,
Func, decimal> dominantFontWidthFunc,
Func, decimal> dominantFontHeightFunc)
{
- var root = new XYLeaf(pageWords); // Create a root node.
- return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
+ XYLeaf root = new XYLeaf(pageWords); // Create a root node.
+ XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
+
+ var leafs = node.GetLeafs();
+
+ if (leafs.Count > 0)
+ {
+ return leafs.Select(l => new TextBlock(l.GetLines())).ToList();
+ }
+
+ return new List();
}
- private static XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth,
+ private XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth,
Func, decimal> dominantFontWidthFunc,
Func, decimal> dominantFontHeightFunc, int level = 0)
{
@@ -144,7 +170,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
return new XYNode(newNodes);
}
- private static XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth,
+ private XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth,
Func, decimal> dominantFontWidthFunc,
Func, decimal> dominantFontHeightFunc, int level = 0)
{
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs
index 9dab8daf..a5970693 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs
@@ -9,7 +9,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
///
/// A Leaf node used in the algorithm, i.e. a block.
///
- public class XYLeaf : XYNode
+ internal class XYLeaf : XYNode
{
///
/// Returns true if this node is a leaf, false otherwise.
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs
index db423c55..9bac97fb 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs
@@ -8,7 +8,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
///
/// A Node used in the algorithm.
///
- public class XYNode
+ internal class XYNode
{
///
/// Returns true if this node is a leaf, false otherwise.