diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 662780b3..0e38466d 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -51,6 +51,12 @@
                 "UglyToad.PdfPig.CrossReference.CrossReferenceTable",
                 "UglyToad.PdfPig.CrossReference.CrossReferenceType",
                 "UglyToad.PdfPig.CrossReference.TrailerDictionary",
+                "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
+                "UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
+                "UglyToad.PdfPig.DocumentLayoutAnalysis.NearestNeighbourWordExtractor",
+                "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
+                "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
+                "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
                 "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
                 "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
                 "UglyToad.PdfPig.Fonts.DescriptorFontFile",
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
new file mode 100644
index 00000000..2b06eea9
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
@@ -0,0 +1,129 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Contains helpful tools for distance measures.
+    /// </summary>
+    public static class Distances
+    {
+        /// <summary>
+        /// The Euclidean distance is the "ordinary" straight-line distance between two points.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        /// <returns>The square root of the sum of the squared coordinate differences.</returns>
+        public static double Euclidean(PdfPoint point1, PdfPoint point2)
+        {
+            double dx = (double)(point1.X - point2.X);
+            double dy = (double)(point1.Y - point2.Y);
+            return Math.Sqrt(dx * dx + dy * dy);
+        }
+
+        /// <summary>
+        /// The weighted Euclidean distance.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        /// <param name="wX">The weight of the X coordinates. Default is 1.</param>
+        /// <param name="wY">The weight of the Y coordinates. Default is 1.</param>
+        /// <returns>The square root of the weighted sum of the squared coordinate differences.</returns>
+        public static double WeightedEuclidean(PdfPoint point1, PdfPoint point2, double wX = 1.0, double wY = 1.0)
+        {
+            double dx = (double)(point1.X - point2.X);
+            double dy = (double)(point1.Y - point2.Y);
+
+            // the weights scale the squared differences, not the raw differences
+            return Math.Sqrt(wX * dx * dx + wY * dy * dy);
+        }
+
+        /// <summary>
+        /// The Manhattan distance between two points is the sum of the absolute differences of their Cartesian coordinates.
+        /// <para>Also known as rectilinear distance, L1 distance, L1 norm, snake distance, city block distance, taxicab metric.</para>
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        public static double Manhattan(PdfPoint point1, PdfPoint point2)
+        {
+            return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y));
+        }
+
+        /// <summary>
+        /// Find the nearest point.
+        /// </summary>
+        /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
+        /// <param name="points">The list of neighbours candidates.</param>
+        /// <param name="distanceMeasure">The distance measure to use.</param>
+        /// <param name="distance">The distance between reference point, and its nearest neighbour</param>
+        /// <returns>The nearest candidate, or the default point when no candidate is closer than <see cref="double.MaxValue"/>.</returns>
+        /// <exception cref="ArgumentException">The candidates list is null or empty, or the distance measure is null.</exception>
+        public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
+            Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
+        {
+            if (points == null || points.Count == 0)
+            {
+                throw new ArgumentException("Distances.FindNearest(): The list of neighbours candidates is either null or empty.", nameof(points));
+            }
+
+            if (distanceMeasure == null)
+            {
+                throw new ArgumentException("Distances.FindNearest(): The distance measure must not be null.", nameof(distanceMeasure));
+            }
+
+            int closestPointIndex = IndexOfNearest(pdfPoint, points, distanceMeasure, out distance);
+
+            // a negative index means no candidate was strictly closer than double.MaxValue
+            return closestPointIndex < 0 ? default(PdfPoint) : points[closestPointIndex];
+        }
+
+        /// <summary>
+        /// Find the index of the nearest point.
+        /// </summary>
+        /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param>
+        /// <param name="points">The list of neighbours candidates.</param>
+        /// <param name="distanceMeasure">The distance measure to use.</param>
+        /// <param name="distance">The distance between reference point, and its nearest neighbour</param>
+        /// <returns>The index of the nearest candidate, or -1 when no candidate is closer than <see cref="double.MaxValue"/>.</returns>
+        /// <exception cref="ArgumentException">The candidates list is null or empty, or the distance measure is null.</exception>
+        public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
+            Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
+        {
+            if (points == null || points.Count == 0)
+            {
+                throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", nameof(points));
+            }
+
+            if (distanceMeasure == null)
+            {
+                throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", nameof(distanceMeasure));
+            }
+
+            return IndexOfNearest(pdfPoint, points, distanceMeasure, out distance);
+        }
+
+        /// <summary>
+        /// Linear scan shared by <see cref="FindNearest"/> and <see cref="FindIndexNearest"/>; arguments are already validated.
+        /// </summary>
+        private static int IndexOfNearest(PdfPoint pdfPoint, IReadOnlyList<PdfPoint> points,
+            Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
+        {
+            distance = double.MaxValue;
+            int closestPointIndex = -1;
+
+            for (var i = 0; i < points.Count; i++)
+            {
+                double currentDistance = distanceMeasure(points[i], pdfPoint);
+                if (currentDistance < distance)
+                {
+                    distance = currentDistance;
+                    closestPointIndex = i;
+                }
+            }
+
+            return closestPointIndex;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs
new file mode 100644
index 00000000..295c524e
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs
@@ -0,0 +1,52 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Useful math extensions.
+    /// </summary>
+    public static class MathExtensions
+    {
+        /// <summary>
+        /// Computes the mode of a sequence of float values.
+        /// </summary>
+        /// <param name="array">The array of floats.</param>
+        /// <returns>The most frequent value (the first one encountered on a tie), or NaN when the sequence is null or empty.</returns>
+        public static float Mode(this IEnumerable<float> array)
+        {
+            if (array == null)
+            {
+                return float.NaN;
+            }
+
+            // FirstOrDefault detects the empty case, so the source is enumerated only once
+            var mostFrequent = array.GroupBy(v => v).OrderByDescending(g => g.Count()).FirstOrDefault();
+            return mostFrequent == null ? float.NaN : mostFrequent.Key;
+        }
+
+        /// <summary>
+        /// Computes the mode of a sequence of decimal values.
+        /// </summary>
+        /// <param name="array">The array of decimal.</param>
+        /// <returns>The most frequent value (the first one encountered on a tie).</returns>
+        /// <exception cref="ArgumentNullException">The sequence is null.</exception>
+        /// <exception cref="InvalidOperationException">The sequence is empty; decimal has no NaN to signal it.</exception>
+        public static decimal Mode(this IEnumerable<decimal> array)
+        {
+            if (array == null)
+            {
+                throw new ArgumentNullException(nameof(array));
+            }
+
+            var mostFrequent = array.GroupBy(v => v).OrderByDescending(g => g.Count()).FirstOrDefault();
+            if (mostFrequent == null)
+            {
+                throw new InvalidOperationException("MathExtensions.Mode(): The sequence contains no elements.");
+            }
+
+            return mostFrequent.Key;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor.cs
new file mode 100644
index 00000000..34455cda
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor.cs
@@ -0,0 +1,239 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+using UglyToad.PdfPig.Util;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Nearest Neighbour Word Extractor, using the <see cref="Distances.Manhattan"/> distance.
+    /// This implementation leverages bounding boxes.
+    /// </summary>
+    public class NearestNeighbourWordExtractor : IWordExtractor
+    {
+        /// <summary>
+        /// Create an instance of Nearest Neighbour Word Extractor, <see cref="NearestNeighbourWordExtractor"/>.
+        /// </summary>
+        public static IWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
+
+        /// <summary>
+        /// Gets the words.
+        /// </summary>
+        /// <param name="letters">The letters in the page.</param>
+        /// <returns>The words of each text direction in turn: horizontal, rotated by 180, 90 and 270, then unknown.</returns>
+        public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
+        {
+            if (letters == null || letters.Count == 0)
+            {
+                return new List<Word>();
+            }
+
+            List<Word> wordsH = GetWords(
+                    letters.Where(l => l.TextDirection == TextDirection.Horizontal),
+                    l => l.GlyphRectangle.Width, Distances.Manhattan)
+                .OrderByDescending(x => x.BoundingBox.Bottom)
+                .ThenBy(x => x.BoundingBox.Left).ToList();
+
+            List<Word> words180 = GetWords(
+                    letters.Where(l => l.TextDirection == TextDirection.Rotate180),
+                    l => l.GlyphRectangle.Width, Distances.Manhattan)
+                .OrderBy(x => x.BoundingBox.Top)
+                .ThenByDescending(x => x.BoundingBox.Right).ToList();
+            wordsH.AddRange(words180);
+
+            List<Word> words90 = GetWords(
+                    letters.Where(l => l.TextDirection == TextDirection.Rotate90),
+                    l => l.GlyphRectangle.Height, Distances.Manhattan)
+                .OrderByDescending(x => x.BoundingBox.Left)
+                .ThenBy(x => x.BoundingBox.Top).ToList();
+            wordsH.AddRange(words90);
+
+            List<Word> words270 = GetWords(
+                    letters.Where(l => l.TextDirection == TextDirection.Rotate270),
+                    l => l.GlyphRectangle.Height, Distances.Manhattan)
+                .OrderBy(x => x.BoundingBox.Right)
+                .ThenByDescending(x => x.BoundingBox.Bottom).ToList();
+            wordsH.AddRange(words270);
+
+            List<Word> wordsU = GetWords(
+                    letters.Where(l => l.TextDirection == TextDirection.Unknown),
+                    l => l.GlyphRectangle.Width, Distances.Manhattan)
+                .OrderByDescending(x => x.BoundingBox.Bottom)
+                .ThenBy(x => x.BoundingBox.Left).ToList();
+            wordsH.AddRange(wordsU);
+
+            return wordsH;
+        }
+
+        /// <summary>
+        /// Private method to get the words.
+        /// </summary>
+        /// <param name="pageLetters">The letters in the page, they must have
+        /// the same text directions.</param>
+        /// <param name="metric">The letter's metric to use in the minimum distance
+        /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
+        /// <param name="distMeasure">The distance measure between two start and end base line points,
+        /// e.g. the Manhattan distance.</param>
+        private static List<Word> GetWords(IEnumerable<Letter> pageLetters,
+            Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
+        {
+            if (pageLetters == null)
+            {
+                return new List<Word>();
+            }
+
+            // materialise once: callers pass deferred LINQ queries that would otherwise be re-evaluated several times
+            Letter[] letters = pageLetters.ToArray();
+            int lettersCount = letters.Length;
+            if (lettersCount == 0)
+            {
+                return new List<Word>();
+            }
+
+            TextDirection textDirection = letters[0].TextDirection;
+            if (letters.Any(x => textDirection != x.TextDirection))
+            {
+                throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction.");
+            }
+
+            // the letters of a word are ordered by a glyph edge chosen from the text direction
+            Func<IEnumerable<Letter>, IReadOnlyList<Letter>> orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();
+            if (textDirection == TextDirection.Rotate180)
+            {
+                orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Right).ToList();
+            }
+            else if (textDirection == TextDirection.Rotate90)
+            {
+                orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Top).ToList();
+            }
+            else if (textDirection == TextDirection.Rotate270)
+            {
+                orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Bottom).ToList();
+            }
+
+            List<PdfPoint> startBaseLines = letters.Select(x => x.StartBaseLine).ToList();
+
+            // indexes[c] is the letter paired with letter c, or -1 when c has no close enough neighbour
+            int[] indexes = Enumerable.Repeat(-1, lettersCount).ToArray();
+
+            // Find nearest neighbours indexes; every iteration only writes its own slot of 'indexes'
+            Parallel.For(0, lettersCount, c =>
+            {
+                var currentLetter = letters[c];
+
+                // only check neighbours if not a white space
+                if (string.IsNullOrWhiteSpace(currentLetter.Value))
+                {
+                    return;
+                }
+
+                int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist);
+
+                // -1: no usable candidate. c: the letter is its own nearest neighbour and pairing
+                // it with itself would put it twice in the same word.
+                if (index < 0 || index == c)
+                {
+                    return;
+                }
+
+                var pairedLetter = letters[index];
+                if (!string.IsNullOrWhiteSpace(pairedLetter.Value) &&
+                    string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase))
+                {
+                    // letters are joined when closer than 60% of the larger of the two glyph metrics
+                    decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * 0.60m;
+
+                    // compare as double: casting a NaN or very large distance to decimal would throw
+                    if (dist < (double)minDist)
+                    {
+                        indexes[c] = index;
+                    }
+                }
+            });
+
+            // Group indexes
+            List<List<int>> groupedIndexes = new List<List<int>>();
+            HashSet<int> indexDone = new HashSet<int>();
+            for (int c = 0; c < lettersCount; c++)
+            {
+                int i = indexes[c];
+                if (i == -1) continue;
+
+                bool isDoneC = indexDone.Contains(c);
+                bool isDoneI = indexDone.Contains(i);
+                if (isDoneC || isDoneI)
+                {
+                    if (isDoneC && !isDoneI)
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
+                        {
+                            pair.Add(i);
+                        }
+                        indexDone.Add(i);
+                    }
+                    else if (!isDoneC && isDoneI)
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
+                        {
+                            pair.Add(c);
+                        }
+                        indexDone.Add(c);
+                    }
+                    else
+                    {
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
+                        {
+                            if (!pair.Contains(c)) pair.Add(c);
+                        }
+
+                        foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
+                        {
+                            if (!pair.Contains(i)) pair.Add(i);
+                        }
+                    }
+                }
+                else
+                {
+                    groupedIndexes.Add(new List<int> { c, i });
+                    indexDone.Add(c);
+                    indexDone.Add(i);
+                }
+            }
+
+            // Merge lists with common index
+            for (int c = 0; c < lettersCount; c++)
+            {
+                List<List<int>> candidates = groupedIndexes.Where(x => x.Contains(c)).ToList();
+                if (candidates.Count < 2) continue; // only one group with this index
+
+                List<int> merged = candidates.First();
+                groupedIndexes.Remove(merged);
+                for (int i = 1; i < candidates.Count; i++)
+                {
+                    var current = candidates[i];
+                    merged = merged.Union(current).ToList();
+                    groupedIndexes.Remove(current);
+                }
+                groupedIndexes.Add(merged);
+            }
+
+            List<Word> words = new List<Word>();
+            foreach (List<int> indexGroup in groupedIndexes)
+            {
+                List<Letter> groupedLetters = indexGroup.Select(s => letters[s]).ToList();
+                words.Add(new Word(orderFunc(groupedLetters)));
+            }
+
+            // letters that were not paired with any other letter become single-letter words
+            foreach (int n in Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)))
+            {
+                words.Add(new Word(new Letter[] { letters[n] }));
+            }
+
+            return words;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
new file mode 100644
index 00000000..a961ff17
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs
@@ -0,0 +1,257 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Content;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
+    /// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
+    /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
+    /// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
+    /// </summary>
+    public class RecursiveXYCut
+    {
+        /// <summary>
+        /// Get the blocks.
+        /// </summary>
+        /// <param name="pageWords">The words in the page.</param>
+        /// <param name="minimumWidth">The minimum width for a block.</param>
+        public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth = 0)
+        {
+            // defaults: the mode of the glyph widths, and 1.5 times the mode of the glyph heights, rounded to 3 decimals
+            return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5m, 3));
+        }
+
+        /// <summary>
+        /// Get the blocks.
+        /// </summary>
+        /// <param name="pageWords">The words in the page.</param>
+        /// <param name="minimumWidth">The minimum width for a block.</param>
+        /// <param name="dominantFontWidth">The dominant font width.</param>
+        /// <param name="dominantFontHeight">The dominant font height.</param>
+        public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
+            decimal dominantFontWidth, decimal dominantFontHeight)
+        {
+            return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight);
+        }
+
+        /// <summary>
+        /// Get the blocks.
+        /// </summary>
+        /// <param name="pageWords">The words in the page.</param>
+        /// <param name="minimumWidth">The minimum width for a block.</param>
+        /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
+        /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
+        public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
+            Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
+            Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
+        {
+            var root = new XYLeaf(pageWords); // Create a root node.
+            return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
+        }
+
+        /// <summary>
+        /// Projects the non white space words on the X axis and cuts the leaf where the gap between two
+        /// projections is wider than the dominant font width; each part is then cut horizontally.
+        /// </summary>
+        private static XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth,
+            Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
+            Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
+        {
+            if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
+            {
+                // we stop cutting if
+                // - only one word remains
+                // - width is too small
+                return leaf;
+            }
+
+            // order words left to right
+            var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray();
+            if (words.Length == 0)
+            {
+                // only white space words: nothing to project, and words[0] below would be out of range
+                return leaf;
+            }
+
+            // determine dominantFontWidth; the dominant height is only needed by the horizontal cut
+            decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
+                .Select(x => Math.Abs(x.GlyphRectangle.Width)));
+
+            List<decimal[]> projectionProfile = new List<decimal[]>();
+            decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Left, words[0].BoundingBox.Right };
+            int wordsCount = words.Length;
+            for (int i = 1; i < wordsCount; i++)
+            {
+                if ((words[i].BoundingBox.Left >= currentProj[0] && words[i].BoundingBox.Left <= currentProj[1])
+                    || (words[i].BoundingBox.Right >= currentProj[0] && words[i].BoundingBox.Right <= currentProj[1]))
+                {
+                    // it is overlapping
+                    if (words[i].BoundingBox.Left >= currentProj[0]
+                        && words[i].BoundingBox.Left <= currentProj[1]
+                        && words[i].BoundingBox.Right > currentProj[1])
+                    {
+                        // |____|
+                        //    |____|
+                        // |_______|    <- updated
+                        currentProj[1] = words[i].BoundingBox.Right;
+                    }
+
+                    // we ignore the following cases:
+                    //    |____|
+                    // |____|          (not possible because of OrderBy)
+                    //
+                    //    |____|
+                    //|___________|    (not possible because of OrderBy)
+                    //
+                    //  |____|
+                    //   |_|
+                }
+                else
+                {
+                    // no overlap
+                    if (words[i].BoundingBox.Left - currentProj[1] <= domFontWidth)
+                    {
+                        // if gap too small -> don't cut
+                        // |____| |____|
+                        currentProj[1] = words[i].BoundingBox.Right;
+                    }
+                    else if (currentProj[1] - currentProj[0] < minimumWidth)
+                    {
+                        // still too small
+                        currentProj[1] = words[i].BoundingBox.Right;
+                    }
+                    else
+                    {
+                        // if gap big enough -> cut!
+                        // |____|   |   |____|
+                        if (i != wordsCount - 1) // will always add the last one after
+                        {
+                            projectionProfile.Add(currentProj);
+                            currentProj = new decimal[2] { words[i].BoundingBox.Left, words[i].BoundingBox.Right };
+                        }
+                    }
+                }
+                if (i == wordsCount - 1) projectionProfile.Add(currentProj);
+            }
+
+            // materialise each part once: the lists are counted, turned into leafs and compared with the leaf's words
+            var newLeafsEnums = projectionProfile
+                .Select(p => leaf.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1]).ToList())
+                .ToList();
+
+            var newNodes = newLeafsEnums.Where(e => e.Count > 0)
+                .Select(e => HorizontalCut(new XYLeaf(e), minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level))
+                .ToList();
+
+            // non white space words that fall in no projection are kept as single-word leafs
+            var lost = leaf.Words.Except(newLeafsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
+            if (lost.Count > 0)
+            {
+                newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
+            }
+
+            return new XYNode(newNodes);
+        }
+
+        /// <summary>
+        /// Projects the non white space words on the Y axis and cuts the leaf where the gap between two
+        /// projections is taller than the dominant font height; each part is then cut vertically.
+        /// </summary>
+        private static XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth,
+            Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
+            Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
+        {
+            if (leaf.CountWords() <= 1)
+            {
+                // we stop cutting if
+                // - only one word remains
+                return leaf;
+            }
+
+            // order words bottom to top
+            var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray();
+            if (words.Length == 0)
+            {
+                // only white space words: nothing to project, and words[0] below would be out of range
+                return leaf;
+            }
+
+            // determine dominantFontHeight; the dominant width is only needed by the vertical cut
+            decimal domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters)
+                .Select(x => Math.Abs(x.GlyphRectangle.Height)));
+
+            List<decimal[]> projectionProfile = new List<decimal[]>();
+            decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Bottom, words[0].BoundingBox.Top };
+            int wordsCount = words.Length;
+            for (int i = 1; i < wordsCount; i++)
+            {
+                if ((words[i].BoundingBox.Bottom >= currentProj[0] && words[i].BoundingBox.Bottom <= currentProj[1])
+                    || (words[i].BoundingBox.Top >= currentProj[0] && words[i].BoundingBox.Top <= currentProj[1]))
+                {
+                    // it is overlapping
+                    if (words[i].BoundingBox.Bottom >= currentProj[0]
+                        && words[i].BoundingBox.Bottom <= currentProj[1]
+                        && words[i].BoundingBox.Top > currentProj[1])
+                    {
+                        currentProj[1] = words[i].BoundingBox.Top;
+                    }
+                }
+                else
+                {
+                    // no overlap
+                    if (words[i].BoundingBox.Bottom - currentProj[1] <= domFontHeight)
+                    {
+                        // if gap too small -> don't cut
+                        // |____| |____|
+                        currentProj[1] = words[i].BoundingBox.Top;
+                    }
+                    else
+                    {
+                        // if gap big enough -> cut!
+                        // |____|   |   |____|
+                        if (i != wordsCount - 1) // will always add the last one after
+                        {
+                            projectionProfile.Add(currentProj);
+                            currentProj = new decimal[2] { words[i].BoundingBox.Bottom, words[i].BoundingBox.Top };
+                        }
+                    }
+                }
+                if (i == wordsCount - 1) projectionProfile.Add(currentProj);
+            }
+
+            // a single projection means nothing was cut: allow one more vertical pass, then stop at the next one
+            if (projectionProfile.Count == 1)
+            {
+                if (level >= 1)
+                {
+                    return leaf;
+                }
+                else
+                {
+                    level++;
+                }
+            }
+
+            // materialise each part once: the lists are counted, turned into leafs and compared with the leaf's words
+            var newLeafsEnums = projectionProfile
+                .Select(p => leaf.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1]).ToList())
+                .ToList();
+
+            var newNodes = newLeafsEnums.Where(e => e.Count > 0)
+                .Select(e => VerticalCut(new XYLeaf(e), minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level))
+                .ToList();
+
+            // non white space words that fall in no projection are kept as single-word leafs
+            var lost = leaf.Words.Except(newLeafsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();
+            if (lost.Count > 0)
+            {
+                newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
+            }
+
+            return new XYNode(newNodes);
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs
new file mode 100644
index 00000000..9dab8daf
--- /dev/null
+++ 
b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs
@@ -0,0 +1,81 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// A Leaf node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
+    /// </summary>
+    public class XYLeaf : XYNode
+    {
+        /// <summary>
+        /// Returns true if this node is a leaf, false otherwise.
+        /// </summary>
+        public override bool IsLeaf => true;
+
+        /// <summary>
+        /// The words in the leaf.
+        /// </summary>
+        public IReadOnlyList<Word> Words { get; }
+
+        /// <summary>
+        /// The number of words in the leaf.
+        /// </summary>
+        public override int CountWords() => Words == null ? 0 : Words.Count;
+
+        /// <summary>
+        /// Returns null as a leaf doesn't have leafs.
+        /// </summary>
+        public override List<XYLeaf> GetLeafs()
+        {
+            return null;
+        }
+
+        /// <summary>
+        /// Gets the lines of the leaf.
+        /// </summary>
+        /// <returns>One line per distinct bounding box bottom, ordered from the highest bottom to the lowest.</returns>
+        public IReadOnlyList<TextLine> GetLines()
+        {
+            return Words.GroupBy(x => x.BoundingBox.Bottom).OrderByDescending(x => x.Key)
+                .Select(x => new TextLine(x.ToList())).ToArray();
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYLeaf"/>.
+        /// </summary>
+        /// <param name="words">The words contained in the leaf.</param>
+        public XYLeaf(params Word[] words) : this(words?.ToList())
+        {
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYLeaf"/>.
+        /// </summary>
+        /// <param name="words">The words contained in the leaf.</param>
+        /// <exception cref="ArgumentException">The words are null.</exception>
+        /// <exception cref="InvalidOperationException">The words are empty, so no bounding box can be computed.</exception>
+        public XYLeaf(IEnumerable<Word> words) : base(null)
+        {
+            if (words == null)
+            {
+                throw new ArgumentException("XYLeaf(): The words contained in the leaf cannot be null.", nameof(words));
+            }
+
+            // materialise once: a deferred query would otherwise be re-evaluated by every Min/Max call below
+            Word[] leafWords = words.ToArray();
+
+            decimal left = leafWords.Min(b => b.BoundingBox.Left);
+            decimal right = leafWords.Max(b => b.BoundingBox.Right);
+
+            decimal bottom = leafWords.Min(b => b.BoundingBox.Bottom);
+            decimal top = leafWords.Max(b => b.BoundingBox.Top);
+
+            BoundingBox = new PdfRectangle(left, bottom, right, top);
+            Words = leafWords;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs
new file mode 100644
index 00000000..70620807
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs
@@ -0,0 +1,138 @@
+using System.Collections.Generic;
+using System.Linq;
+using UglyToad.PdfPig.Geometry;
+using UglyToad.PdfPig.Util;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
+    /// </summary>
+    public class XYNode
+    {
+        /// <summary>
+        /// Returns true if this node is a leaf, false otherwise.
+        /// </summary>
+        public virtual bool IsLeaf => false;
+
+        /// <summary>
+        /// The rectangle completely containing the node.
+        /// </summary>
+        public PdfRectangle BoundingBox { get; set; }
+
+        /// <summary>
+        /// The children of the node.
+        /// </summary>
+        public XYNode[] Children { get; set; }
+
+        /// <summary>
+        /// Recursively counts the words included in this node.
+        /// </summary>
+        public virtual int CountWords()
+        {
+            return RecursiveCount(Children);
+        }
+
+        /// <summary>
+        /// Recursively gets the leafs (last nodes) of this node.
+        /// </summary>
+        public virtual List<XYLeaf> GetLeafs()
+        {
+            List<XYLeaf> leafs = new List<XYLeaf>();
+            RecursiveGetLeafs(Children, leafs, 0);
+            return leafs;
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYNode"/>.
+        /// </summary>
+        /// <param name="children">The node's children.</param>
+        public XYNode(params XYNode[] children)
+            : this(children?.ToList())
+        {
+        }
+
+        /// <summary>
+        /// Create a new <see cref="XYNode"/>.
+        /// </summary>
+        /// <param name="children">The node's children.</param>
+        public XYNode(IEnumerable<XYNode> children)
+        {
+            // materialise once: a deferred query would otherwise be re-evaluated by every call below
+            XYNode[] nodes = children?.ToArray();
+            if (nodes != null && nodes.Length != 0)
+            {
+                Children = nodes;
+                decimal left = nodes.Min(b => b.BoundingBox.Left);
+                decimal right = nodes.Max(b => b.BoundingBox.Right);
+                decimal bottom = nodes.Min(b => b.BoundingBox.Bottom);
+                decimal top = nodes.Max(b => b.BoundingBox.Top);
+                BoundingBox = new PdfRectangle(left, bottom, right, top);
+            }
+            else
+            {
+                Children = EmptyArray<XYNode>.Instance;
+            }
+        }
+
+        /// <summary>
+        /// Sums the words of the leaf children and, recursively, of the children of the other nodes.
+        /// </summary>
+        private static int RecursiveCount(IEnumerable<XYNode> children)
+        {
+            if (children == null)
+            {
+                return 0;
+            }
+
+            int count = 0;
+            foreach (XYNode node in children)
+            {
+                count += node.IsLeaf ? node.CountWords() : RecursiveCount(node.Children);
+            }
+
+            return count;
+        }
+
+        /// <summary>
+        /// Appends the leafs below the given children; the level decides how sibling nodes are ordered.
+        /// </summary>
+        private static void RecursiveGetLeafs(IEnumerable<XYNode> children, List<XYLeaf> leafs, int level)
+        {
+            if (children == null)
+            {
+                return;
+            }
+
+            XYNode[] nodes = children as XYNode[] ?? children.ToArray();
+            if (nodes.Length == 0)
+            {
+                return;
+            }
+
+            // leaf children are added first, in their current order
+            foreach (XYLeaf node in nodes.Where(x => x.IsLeaf))
+            {
+                leafs.Add(node);
+            }
+
+            // even levels order the remaining nodes by ascending left, odd levels by descending top
+            bool isVerticalCut = level % 2 == 0;
+            IEnumerable<XYNode> notLeafs = nodes.Where(x => !x.IsLeaf);
+            notLeafs = isVerticalCut
+                ? notLeafs.OrderBy(x => x.BoundingBox.Left)
+                : notLeafs.OrderByDescending(x => x.BoundingBox.Top);
+
+            foreach (XYNode node in notLeafs)
+            {
+                RecursiveGetLeafs(node.Children, leafs, level + 1);
+            }
+        }
+
+        /// <inheritdoc />
+        public override string ToString()
+        {
+            return IsLeaf ? "Leaf" : "Node";
+        }
+    }
+}