From a0c864e8aff62568e3943c3494df19623994467d Mon Sep 17 00:00:00 2001 From: BobLd Date: Sun, 16 Jun 2019 13:57:30 +0100 Subject: [PATCH 1/6] Addind Document Layout Analysis: - Nearest Neighbour Word Extractor - Recursive X-Y Cut algorithm, useful for multi-column pdf documents --- .../PublicApiScannerTests.cs | 4 + .../DocumentLayoutAnalysis/Distances.cs | 86 ++++ .../DocumentLayoutAnalysis/MathExtensions.cs | 32 ++ .../DocumentLayoutAnalysis/NNWordExtractor.cs | 200 +++++++++ .../DocumentLayoutAnalysis/RecursiveXYCut.cs | 411 ++++++++++++++++++ 5 files changed, 733 insertions(+) create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 662780b3..02e38938 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -51,6 +51,10 @@ "UglyToad.PdfPig.CrossReference.CrossReferenceTable", "UglyToad.PdfPig.CrossReference.CrossReferenceType", "UglyToad.PdfPig.CrossReference.TrailerDictionary", + "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances", + "UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions", + "UglyToad.PdfPig.DocumentLayoutAnalysis.NNWordExtractor", + "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Fonts.DescriptorFontFile", diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs new file mode 100644 index 00000000..d5ad4ea4 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs @@ -0,0 +1,86 @@ +using System; +using System.Linq; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Contains helpful tools for distance measures. + /// + public static class Distances + { + /// + /// The Euclidean distance is the "ordinary" straight-line distance between two points. + /// + /// The first point. + /// The second point. + /// + public static double Euclidean(PdfPoint point1, PdfPoint point2) + { + double dx = (double)(point1.X - point2.X); + double dy = (double)(point1.Y - point2.Y); + return Math.Sqrt(dx * dx + dy * dy); + } + + /// + /// The weighted Euclidean distance. + /// + /// The first point. + /// The second point. + /// The weight of the X coordinates. Default is 1. + /// The weight of the Y coordinates. Default is 1. + /// + public static double WghtdEuclidean(PdfPoint point1, PdfPoint point2, double wX = 1.0, double wY = 1.0) + { + double dx = (double)(point1.X - point2.X); + double dy = (double)(point1.Y - point2.Y); + return Math.Sqrt(wX * dx * dx + wY * dy * dy); + } + + /// + /// The Manhattan distance between two points is the sum of the absolute differences of their Cartesian coordinates. + /// Also known as rectilinear distance, L1 distance, L1 norm, snake distance, city block distance, taxicab metric. + /// + /// The first point. + /// The second point. + /// + public static double Manhattan(PdfPoint point1, PdfPoint point2) + { + return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y)); + } + + /// + /// Find the nearest point. + /// + /// The reference point, for which to find the nearest neighbour. + /// The list of neighbours candidates. + /// The distance measure to use. + /// The distance between reference point, and its nearest neighbour + /// + public static PdfPoint FindNearest(this PdfPoint pdfPoint, PdfPoint[] points, + Func measure, out double dist) + { + double d = points.Min(k => measure(k, pdfPoint)); + PdfPoint point = points.First(x => measure(x, pdfPoint) == d); + dist = d; + return point; + } + + /// + /// Find the index of the nearest point. + /// + /// The reference point, for which to find the nearest neighbour. + /// The list of neighbours candidates. + /// The distance measure to use. + /// The distance between reference point, and its nearest neighbour + /// + public static int FindIndexNearest(this PdfPoint pdfPoint, PdfPoint[] points, + Func measure, out double dist) + { + double d = points.Min(k => measure(k, pdfPoint)); + int index = Array.FindIndex(points, x => measure(x, pdfPoint) == d); + dist = d; + return index; + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs new file mode 100644 index 00000000..a32fc576 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs @@ -0,0 +1,32 @@ +using System.Collections.Generic; +using System.Linq; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Useful math extensions. + /// + public static class MathExtensions + { + /// + /// Computes the mode of a sequence of float values. + /// + /// + /// + public static float Mode(this IEnumerable array) + { + if (array == null || array.Count() == 0) return float.NaN; + return array.GroupBy(v => v).OrderByDescending(g => g.Count()).First().Key; + } + + /// + /// Computes the mode of a sequence of decimal values. + /// + /// + /// + public static decimal Mode(this IEnumerable array) + { + return array.GroupBy(v => v).OrderByDescending(g => g.Count()).First().Key; + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs new file mode 100644 index 00000000..5da1efbf --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs @@ -0,0 +1,200 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; +using UglyToad.PdfPig.Util; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Nearest Neighbour Word Extractor, using the distance. + /// This implementation leverages bounding boxes. + /// + public class NNWordExtractor : IWordExtractor + { + /// + /// Create an instance of Nearest Neighbour Word Extractor, . + /// + public static IWordExtractor Instance { get; } = new NNWordExtractor(); + + /// + /// Gets the words. + /// + /// + /// + public IEnumerable GetWords(IReadOnlyList letters) + { + List wordsH = GetWords( + letters.Where(l => l.TextDirection == TextDirection.Horizontal), + l => l.GlyphRectangle.Width, Distances.Manhattan) + .OrderByDescending(x => x.BoundingBox.Bottom) + .ThenBy(x => x.BoundingBox.Left).ToList(); + + List words180 = GetWords( + letters.Where(l => l.TextDirection == TextDirection.Rotate180), + l => l.GlyphRectangle.Width, Distances.Manhattan) + .OrderBy(x => x.BoundingBox.Top) + .ThenByDescending(x => x.BoundingBox.Right).ToList(); + wordsH.AddRange(words180); + + List words90 = GetWords( + letters.Where(l => l.TextDirection == TextDirection.Rotate90), + l => l.GlyphRectangle.Height, Distances.Manhattan) + .OrderByDescending(x => x.BoundingBox.Left) + .ThenBy(x => x.BoundingBox.Top).ToList(); + wordsH.AddRange(words90); + + List words270 = GetWords( + letters.Where(l => l.TextDirection == TextDirection.Rotate270), + l => l.GlyphRectangle.Height, Distances.Manhattan) + .OrderBy(x => x.BoundingBox.Right) + .ThenByDescending(x => x.BoundingBox.Bottom).ToList(); + wordsH.AddRange(words270); + + List wordsU = GetWords( + letters.Where(l => l.TextDirection == TextDirection.Unknown), + l => l.GlyphRectangle.Width, Distances.Manhattan) + .OrderByDescending(x => x.BoundingBox.Bottom) + .ThenBy(x => x.BoundingBox.Left).ToList(); + wordsH.AddRange(wordsU); + + return wordsH; + } + + /// + /// + /// + /// The letters in the page, they must have + /// the same text directions. + /// The letter's metric to use in the minimum distance + /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height. + /// The distance measure between two start and end base line points, + /// e.g. the Manhattan distance. + /// + private static List GetWords(IEnumerable pageLetters, + Func metric, Func distMeasure) + { + if (pageLetters == null || pageLetters.Count() == 0) return new List(); + + if (pageLetters.Any(x => pageLetters.ElementAt(0).TextDirection != x.TextDirection)) + { + throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction."); + } + + Letter[] letters = pageLetters.ToArray(); + int lettersCount = letters.Length; + PdfPoint[] startBaseLines = letters.Select(x => x.StartBaseLine).ToArray(); + int[] indexes = Enumerable.Repeat((int)-1, lettersCount).ToArray(); + + // Find nearest neighbours indexes + Parallel.For(0, lettersCount, c => + { + var currentLetter = letters[c]; + // only check neighbours if not a white space + if (!string.IsNullOrWhiteSpace(currentLetter.Value)) + { + int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist); + var pairedLetter = letters[index]; + + if (!string.IsNullOrWhiteSpace(pairedLetter.Value) && + string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase)) + { + decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * 0.60m; + if ((decimal)dist < minDist) + { + indexes[c] = index; + } + } + } + }); + + // Group indexes + List> groupedIndexes = new List>(); + List indexDone = new List(); + for (int c = 0; c < lettersCount; c++) + { + int i = indexes[c]; + if (i == -1) continue; + + bool isDoneC = indexDone.Contains(c); + bool isDoneI = indexDone.Contains(i); + if (isDoneC || isDoneI) + { + if (isDoneC && !isDoneI) + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(c))) + { + pair.Add(i); + } + indexDone.Add(i); + } + else if (!isDoneC && isDoneI) + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(i))) + { + pair.Add(c); + } + indexDone.Add(c); + } + else + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(i))) + { + if (!pair.Contains(c)) pair.Add(c); + } + + foreach (var pair in groupedIndexes.Where(x => x.Contains(c))) + { + if (!pair.Contains(i)) pair.Add(i); + } + } + } + else + { + List pair = new List() { c, i }; + groupedIndexes.Add(pair); + indexDone.AddRange(pair); + } + } + + // Merge lists with common index + for (int c = 0; c < lettersCount; c++) + { + List> candidates = groupedIndexes.Where(x => x.Any(t => t == c)).ToList(); + if (candidates.Count < 2) continue; // only one group with this index + + List merged = candidates.First(); + groupedIndexes.Remove(merged); + for (int i = 1; i < candidates.Count; i++) + { + var current = candidates[i]; + merged = merged.Union(current).ToList(); + groupedIndexes.Remove(current); + } + groupedIndexes.Add(merged); + } + + List words = new List(); + for (int a = 0; a < groupedIndexes.Count(); a++) + { + List groupedLetters = new List(); + foreach (int s in groupedIndexes[a]) + { + groupedLetters.Add(letters[s]); + } + words.Add(new Word(groupedLetters)); + } + + List indexesNotDone = Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)).ToList(); + for (int n = 0; n < indexesNotDone.Count(); n++) + { + Letter letter = letters[indexesNotDone[n]]; + words.Add(new Word(new Letter[] { letter })); + } + + return words; + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs new file mode 100644 index 00000000..7893e71e --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs @@ -0,0 +1,411 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document + /// recursively into a set of rectangular blocks. This implementation leverages bounding boxes. + /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut + /// See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha and Robert M.Haralick Ihsin T. Phillips + /// + public class RecursiveXYCut + { + /// + /// Get the blocks. + /// + /// The words in a page. + /// The minimum widht for a block. + /// The dominant font width. + /// The dominant font height. + /// + public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidht, + decimal dominantFontWidth, decimal dominantFontHeight) + { + return GetBlocks(pageWords, minimumWidht, k => dominantFontWidth, k => dominantFontHeight); + } + + /// + /// Get the blocks. + /// + /// The words in a page. + /// The minimum widht for a block. + /// The function that determines the dominant font width. + /// The function that determines the dominant font height. + /// + public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidht, + Func, decimal> dominantFontWidthFunc, + Func, decimal> dominantFontHeightFunc) + { + var root = new XYLeef(pageWords); + return VerticalCut(root, minimumWidht, dominantFontWidthFunc, dominantFontHeightFunc); + } + + private static XYNode VerticalCut(XYLeef leef, decimal minimumWidht, + Func, decimal> dominantFontWidthFunc, + Func, decimal> dominantFontHeightFunc, int level = 0) + { + if (leef.CountWords() <= 1 || leef.BoundingBox.Width <= minimumWidht) + { + // we stop cutting if + // - only one word remains + // - width is too small + return leef; + } + + // order words left to right + var words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray(); + + // determine dominantFontWidth and dominantFontHeight + decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) + .Select(x => Math.Abs(x.GlyphRectangle.Width))); + decimal domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters) + .Select(x => Math.Abs(x.GlyphRectangle.Height))); + + List projectionProfile = new List(); + decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Left, words[0].BoundingBox.Right }; + int wordsCount = words.Count(); + for (int i = 1; i < wordsCount; i++) + { + if ((words[i].BoundingBox.Left >= currentProj[0] && words[i].BoundingBox.Left <= currentProj[1]) + || (words[i].BoundingBox.Right >= currentProj[0] && words[i].BoundingBox.Right <= currentProj[1])) + { + // it is overlapping + if (words[i].BoundingBox.Left >= currentProj[0] + && words[i].BoundingBox.Left <= currentProj[1] + && words[i].BoundingBox.Right > currentProj[1]) + { + // |____| + // |____| + // |_______| <- updated + currentProj[1] = words[i].BoundingBox.Right; + } + + // we ignore the following cases: + // |____| + // |____| (not possible because of OrderBy) + // + // |____| + //|___________| (not possible because of OrderBy) + // + // |____| + // |_| + } + else + { + // no overlap + if (words[i].BoundingBox.Left - currentProj[1] <= domFontWidth) + { + // if gap too small -> don't cut + // |____| |____| + currentProj[1] = words[i].BoundingBox.Right; + } + else if (currentProj[1] - currentProj[0] < minimumWidht) + { + // still too small + currentProj[1] = words[i].BoundingBox.Right; + } + else + { + // if gap big enough -> cut! + // |____| | |____| + if (i != wordsCount - 1) // will always add the last one after + { + projectionProfile.Add(currentProj); + currentProj = new decimal[2] { words[i].BoundingBox.Left, words[i].BoundingBox.Right }; + } + } + } + if (i == wordsCount - 1) projectionProfile.Add(currentProj); + } + + var newLeefsEnums = projectionProfile.Select(p => leef.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1])); + var newLeefs = newLeefsEnums.Where(e => e.Count() > 0).Select(e => new XYLeef(e)); + + var newNodes = newLeefs.Select(l => HorizontalCut(l, minimumWidht, + dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList(); + + var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList(); + if (lost.Count > 0) + { + newNodes.AddRange(lost.Select(w => new XYLeef(w))); + } + + return new XYNode(newNodes); + } + + private static XYNode HorizontalCut(XYLeef leef, decimal minimumWidht, + Func, decimal> dominantFontWidthFunc, + Func, decimal> dominantFontHeightFunc, int level = 0) + { + if (leef.CountWords() <= 1) + { + // we stop cutting if + // - only one word remains + return leef; + } + + var words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top + + // determine dominantFontWidth and dominantFontHeight + decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) + .Select(x => Math.Abs(x.GlyphRectangle.Width))); + decimal domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters) + .Select(x => Math.Abs(x.GlyphRectangle.Height))); + + List projectionProfile = new List(); + decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Bottom, words[0].BoundingBox.Top }; + int wordsCount = words.Count(); + for (int i = 1; i < wordsCount; i++) + { + if ((words[i].BoundingBox.Bottom >= currentProj[0] && words[i].BoundingBox.Bottom <= currentProj[1]) + || (words[i].BoundingBox.Top >= currentProj[0] && words[i].BoundingBox.Top <= currentProj[1])) + { + // it is overlapping + if (words[i].BoundingBox.Bottom >= currentProj[0] + && words[i].BoundingBox.Bottom <= currentProj[1] + && words[i].BoundingBox.Top > currentProj[1]) + { + currentProj[1] = words[i].BoundingBox.Top; + } + } + else + { + // no overlap + if (words[i].BoundingBox.Bottom - currentProj[1] <= domFontHeight) + { + // if gap too small -> don't cut + // |____| |____| + currentProj[1] = words[i].BoundingBox.Top; + } + else + { + // if gap big enough -> cut! + // |____| | |____| + if (i != wordsCount - 1) // will always add the last one after + { + projectionProfile.Add(currentProj); + currentProj = new decimal[2] { words[i].BoundingBox.Bottom, words[i].BoundingBox.Top }; + } + } + } + if (i == wordsCount - 1) projectionProfile.Add(currentProj); + } + + if (projectionProfile.Count == 1) + { + if (level >= 1) + { + return leef; + } + else + { + level++; + } + } + + var newLeefsEnums = projectionProfile.Select(p => + leef.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1])); + var newLeefs = newLeefsEnums.Where(e => e.Count() > 0).Select(e => new XYLeef(e)); + var newNodes = newLeefs.Select(l => VerticalCut(l, minimumWidht, + dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList(); + + var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList(); + if (lost.Count > 0) + { + newNodes.AddRange(lost.Select(w => new XYLeef(w))); + } + return new XYNode(newNodes); + } + } + + /// + /// A Node used in the algorithm. + /// + public class XYNode + { + /// + /// Returns true if this node is a leef, false otherwise. + /// + public virtual bool IsLeef => false; + + /// + /// The rectangle completely containing the node. + /// + public PdfRectangle BoundingBox { get; set; } + + /// + /// The children of the node. + /// + public XYNode[] Children { get; set; } + + /// + /// Recursively counts the words included in this node. + /// + /// + public virtual int CountWords() + { + if (Children == null) return 0; + int count = 0; + RecursiveCount(Children, ref count); + return count; + } + + /// + /// Recursively gets the leefs (last nodes) of this node. + /// + /// + public virtual List GetLeefs() + { + List leefs = new List(); + if (Children == null || Children.Count() == 0) return leefs; + int level = 0; + RecursiveGetLeefs(Children, ref leefs, level); + return leefs; + } + + /// + /// Create a new . + /// + /// + public XYNode(params XYNode[] children) + : this(children?.ToList()) + { + + } + + /// + /// Create a new . + /// + /// + public XYNode(IEnumerable children) + { + if (children != null && children.Count() != 0) + { + Children = children.ToArray(); + decimal left = children.Min(b => b.BoundingBox.Left); + decimal right = children.Max(b => b.BoundingBox.Right); + decimal bottom = children.Min(b => b.BoundingBox.Bottom); + decimal top = children.Max(b => b.BoundingBox.Top); + BoundingBox = new PdfRectangle(left, bottom, right, top); + } + } + + private void RecursiveCount(IEnumerable children, ref int count) + { + if (children.Count() == 0) return; + foreach (XYNode node in children.Where(x => x.IsLeef)) + { + count += node.CountWords(); + } + + foreach (XYNode node in children.Where(x => !x.IsLeef)) + { + RecursiveCount(node.Children, ref count); + } + } + + private void RecursiveGetLeefs(IEnumerable children, ref List leefs, int level) + { + if (children.Count() == 0) return; + bool isVerticalCut = level % 2 == 0; + + foreach (XYLeef node in children.Where(x => x.IsLeef)) + { + leefs.Add(node); + } + + level++; + + IEnumerable notLeefs = children.Where(x => !x.IsLeef); + + if (isVerticalCut) + { + notLeefs = notLeefs.OrderBy(x => x.BoundingBox.Left).ToList(); + } + else + { + notLeefs = notLeefs.OrderByDescending(x => x.BoundingBox.Top).ToList(); + } + + foreach (XYNode node in notLeefs) + { + RecursiveGetLeefs(node.Children, ref leefs, level); + } + } + + public override string ToString() + { + return (IsLeef ? "Leef" : "Node"); + } + } + + /// + /// A Leef node used in the algorithm, i.e. a block. + /// + public class XYLeef : XYNode + { + /// + /// Returns true if this node is a leef, false otherwise. + /// + public override bool IsLeef => true; + + /// + /// The words in the leef. + /// + public Word[] Words { get; set; } + + /// + /// The number of words in the leef. + /// + /// + public override int CountWords() => Words == null ? 0 : Words.Length; + + /// + /// Returns null as a leef doesn't have leefs. + /// + /// + public override List GetLeefs() + { + return null; + } + + /// + /// Gets the lines of the leef. + /// + /// + public TextLine[] GetLines() + { + var groupedWords = Words.GroupBy(x => x.BoundingBox.Bottom).ToDictionary(x => x.Key, x => x.ToList()); + return groupedWords.OrderByDescending(x => x.Key).Select(ws => new TextLine(ws.Value)).ToArray(); + } + + /// + /// Create a new . + /// + /// The words contained in the leef. + public XYLeef(params Word[] words) : this(words == null ? null : words.ToList()) + { + + } + + /// + /// Create a new . + /// + /// The words contained in the leef. + public XYLeef(IEnumerable words) : base(null) + { + decimal left = words.Min(b => b.BoundingBox.Left); + decimal right = words.Max(b => b.BoundingBox.Right); + + decimal bottom = words.Min(b => b.BoundingBox.Bottom); + decimal top = words.Max(b => b.BoundingBox.Top); + + BoundingBox = new PdfRectangle(left, bottom, right, top); + Words = words.ToArray(); + } + } +} From 2525cd243fbf83684ae0c9b683159441020f42f5 Mon Sep 17 00:00:00 2001 From: BobLd Date: Sun, 16 Jun 2019 14:03:12 +0100 Subject: [PATCH 2/6] Typo correction --- .../DocumentLayoutAnalysis/RecursiveXYCut.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs index 7893e71e..102a8679 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs @@ -18,37 +18,37 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// Get the blocks. /// /// The words in a page. - /// The minimum widht for a block. + /// The minimum width for a block. /// The dominant font width. /// The dominant font height. /// - public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidht, + public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth, decimal dominantFontWidth, decimal dominantFontHeight) { - return GetBlocks(pageWords, minimumWidht, k => dominantFontWidth, k => dominantFontHeight); + return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight); } /// /// Get the blocks. /// /// The words in a page. - /// The minimum widht for a block. + /// The minimum width for a block. /// The function that determines the dominant font width. /// The function that determines the dominant font height. /// - public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidht, + public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc) { var root = new XYLeef(pageWords); - return VerticalCut(root, minimumWidht, dominantFontWidthFunc, dominantFontHeightFunc); + return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc); } - private static XYNode VerticalCut(XYLeef leef, decimal minimumWidht, + private static XYNode VerticalCut(XYLeef leef, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc, int level = 0) { - if (leef.CountWords() <= 1 || leef.BoundingBox.Width <= minimumWidht) + if (leef.CountWords() <= 1 || leef.BoundingBox.Width <= minimumWidth) { // we stop cutting if // - only one word remains @@ -103,7 +103,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis // |____| |____| currentProj[1] = words[i].BoundingBox.Right; } - else if (currentProj[1] - currentProj[0] < minimumWidht) + else if (currentProj[1] - currentProj[0] < minimumWidth) { // still too small currentProj[1] = words[i].BoundingBox.Right; @@ -125,7 +125,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis var newLeefsEnums = projectionProfile.Select(p => leef.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1])); var newLeefs = newLeefsEnums.Where(e => e.Count() > 0).Select(e => new XYLeef(e)); - var newNodes = newLeefs.Select(l => HorizontalCut(l, minimumWidht, + var newNodes = newLeefs.Select(l => HorizontalCut(l, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList(); var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList(); @@ -137,7 +137,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return new XYNode(newNodes); } - private static XYNode HorizontalCut(XYLeef leef, decimal minimumWidht, + private static XYNode HorizontalCut(XYLeef leef, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc, int level = 0) { @@ -210,7 +210,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis var newLeefsEnums = projectionProfile.Select(p => leef.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1])); var newLeefs = newLeefsEnums.Where(e => e.Count() > 0).Select(e => new XYLeef(e)); - var newNodes = newLeefs.Select(l => VerticalCut(l, minimumWidht, + var newNodes = newLeefs.Select(l => VerticalCut(l, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList(); var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList(); From 4416793f6d85d5732271d9e427ceb5f6f5839229 Mon Sep 17 00:00:00 2001 From: BobLd Date: Sun, 16 Jun 2019 19:19:44 +0100 Subject: [PATCH 3/6] Corrected PublicApiScannerTests --- src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 02e38938..9d2fde2b 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -55,6 +55,8 @@ "UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions", "UglyToad.PdfPig.DocumentLayoutAnalysis.NNWordExtractor", "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", + "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode", + "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeef", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Fonts.DescriptorFontFile", From f8d0883da5559051e4a283a32ca99e4248fd74cb Mon Sep 17 00:00:00 2001 From: BobLd Date: Tue, 18 Jun 2019 20:48:49 +0100 Subject: [PATCH 4/6] Update with corrections --- .../PublicApiScannerTests.cs | 2 +- .../DocumentLayoutAnalysis/Distances.cs | 80 ++++++++---- .../DocumentLayoutAnalysis/MathExtensions.cs | 6 +- ...r.cs => NearestNeighbourWordExtractor .cs} | 15 ++- .../DocumentLayoutAnalysis/RecursiveXYCut.cs | 119 +++++++++--------- 5 files changed, 126 insertions(+), 96 deletions(-) rename src/UglyToad.PdfPig/DocumentLayoutAnalysis/{NNWordExtractor.cs => NearestNeighbourWordExtractor .cs} (95%) diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 9d2fde2b..cc100bc8 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -56,7 +56,7 @@ "UglyToad.PdfPig.DocumentLayoutAnalysis.NNWordExtractor", "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode", - "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeef", + "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Fonts.DescriptorFontFile", diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs index d5ad4ea4..2b06eea9 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.Linq; using UglyToad.PdfPig.Geometry; @@ -14,7 +15,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// The first point. /// The second point. - /// public static double Euclidean(PdfPoint point1, PdfPoint point2) { double dx = (double)(point1.X - point2.X); @@ -29,8 +29,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The second point. /// The weight of the X coordinates. Default is 1. /// The weight of the Y coordinates. Default is 1. - /// - public static double WghtdEuclidean(PdfPoint point1, PdfPoint point2, double wX = 1.0, double wY = 1.0) + public static double WeightedEuclidean(PdfPoint point1, PdfPoint point2, double wX = 1.0, double wY = 1.0) { double dx = (double)(point1.X - point2.X); double dy = (double)(point1.Y - point2.Y); @@ -43,7 +42,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// The first point. /// The second point. - /// public static double Manhattan(PdfPoint point1, PdfPoint point2) { return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y)); @@ -54,16 +52,35 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// The reference point, for which to find the nearest neighbour. /// The list of neighbours candidates. - /// The distance measure to use. - /// The distance between reference point, and its nearest neighbour - /// - public static PdfPoint FindNearest(this PdfPoint pdfPoint, PdfPoint[] points, - Func measure, out double dist) + /// The distance measure to use. + /// The distance between reference point, and its nearest neighbour + public static PdfPoint FindNearest(this PdfPoint pdfPoint, IReadOnlyList points, + Func distanceMeasure, out double distance) { - double d = points.Min(k => measure(k, pdfPoint)); - PdfPoint point = points.First(x => measure(x, pdfPoint) == d); - dist = d; - return point; + if (points == null || points.Count == 0) + { + throw new ArgumentException("Distances.FindNearest(): The list of neighbours candidates is either null or empty.", "points"); + } + + if (distanceMeasure == null) + { + throw new ArgumentException("Distances.FindNearest(): The distance measure must not be null.", "distanceMeasure"); + } + + distance = double.MaxValue; + PdfPoint closestPoint = default; + + for (var i = 0; i < points.Count; i++) + { + double currentDistance = distanceMeasure(points[i], pdfPoint); + if (currentDistance < distance) + { + distance = currentDistance; + closestPoint = points[i]; + } + } + + return closestPoint; } /// @@ -71,16 +88,35 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// The reference point, for which to find the nearest neighbour. /// The list of neighbours candidates. - /// The distance measure to use. - /// The distance between reference point, and its nearest neighbour - /// - public static int FindIndexNearest(this PdfPoint pdfPoint, PdfPoint[] points, - Func measure, out double dist) + /// The distance measure to use. + /// The distance between reference point, and its nearest neighbour + public static int FindIndexNearest(this PdfPoint pdfPoint, IReadOnlyList points, + Func distanceMeasure, out double distance) { - double d = points.Min(k => measure(k, pdfPoint)); - int index = Array.FindIndex(points, x => measure(x, pdfPoint) == d); - dist = d; - return index; + if (points == null || points.Count == 0) + { + throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", "points"); + } + + if (distanceMeasure == null) + { + throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", "distanceMeasure"); + } + + distance = double.MaxValue; + int closestPointIndex = -1; + + for (var i = 0; i < points.Count; i++) + { + double currentDistance = distanceMeasure(points[i], pdfPoint); + if (currentDistance < distance) + { + distance = currentDistance; + closestPointIndex = i; + } + } + + return closestPointIndex; } } } diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs index a32fc576..295c524e 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs @@ -11,8 +11,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// Computes the mode of a sequence of float values. /// - /// - /// + /// The array of floats. public static float Mode(this IEnumerable array) { if (array == null || array.Count() == 0) return float.NaN; @@ -22,8 +21,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// Computes the mode of a sequence of decimal values. /// - /// - /// + /// The array of decimal. public static decimal Mode(this IEnumerable array) { return array.GroupBy(v => v).OrderByDescending(g => g.Count()).First().Key; diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs similarity index 95% rename from src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs rename to src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs index 5da1efbf..97ef624c 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs @@ -12,18 +12,17 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// Nearest Neighbour Word Extractor, using the distance. /// This implementation leverages bounding boxes. /// - public class NNWordExtractor : IWordExtractor + public class NearestNeighbourWordExtractor : IWordExtractor { /// - /// Create an instance of Nearest Neighbour Word Extractor, . + /// Create an instance of Nearest Neighbour Word Extractor, . /// - public static IWordExtractor Instance { get; } = new NNWordExtractor(); + public static IWordExtractor Instance { get; } = new NearestNeighbourWordExtractor(); /// /// Gets the words. /// - /// - /// + /// The letters in the page. public IEnumerable GetWords(IReadOnlyList letters) { List wordsH = GetWords( @@ -64,7 +63,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } /// - /// + /// Private method to get the words. /// /// The letters in the page, they must have /// the same text directions. @@ -72,7 +71,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height. /// The distance measure between two start and end base line points, /// e.g. the Manhattan distance. - /// private static List GetWords(IEnumerable pageLetters, Func metric, Func distMeasure) { @@ -85,7 +83,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis Letter[] letters = pageLetters.ToArray(); int lettersCount = letters.Length; - PdfPoint[] startBaseLines = letters.Select(x => x.StartBaseLine).ToArray(); + List startBaseLines = letters.Select(x => x.StartBaseLine).ToList(); + int[] indexes = Enumerable.Repeat((int)-1, lettersCount).ToArray(); // Find nearest neighbours indexes diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs index 102a8679..71d8aeff 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs @@ -10,14 +10,14 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document /// recursively into a set of rectangular blocks. This implementation leverages bounding boxes. /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut - /// See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha and Robert M.Haralick Ihsin T. Phillips + /// See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips /// public class RecursiveXYCut { /// /// Get the blocks. /// - /// The words in a page. + /// The words in the page. /// The minimum width for a block. /// The dominant font width. /// The dominant font height. @@ -31,7 +31,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// /// Get the blocks. /// - /// The words in a page. + /// The words in the page. /// The minimum width for a block. /// The function that determines the dominant font width. /// The function that determines the dominant font height. @@ -40,24 +40,24 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc) { - var root = new XYLeef(pageWords); + var root = new XYLeaf(pageWords); return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc); } - private static XYNode VerticalCut(XYLeef leef, decimal minimumWidth, + private static XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc, int level = 0) { - if (leef.CountWords() <= 1 || leef.BoundingBox.Width <= minimumWidth) + if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth) { // we stop cutting if // - only one word remains // - width is too small - return leef; + return leaf; } // order words left to right - var words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray(); + var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray(); // determine dominantFontWidth and dominantFontHeight decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) @@ -122,33 +122,33 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis if (i == wordsCount - 1) projectionProfile.Add(currentProj); } - var newLeefsEnums = projectionProfile.Select(p => leef.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1])); - var newLeefs = newLeefsEnums.Where(e => e.Count() > 0).Select(e => new XYLeef(e)); + var newLeafsEnums = projectionProfile.Select(p => leaf.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1])); + var newLeafs = newLeafsEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e)); - var newNodes = newLeefs.Select(l => HorizontalCut(l, minimumWidth, + var newNodes = newLeafs.Select(l => HorizontalCut(l, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList(); - var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList(); + var lost = leaf.Words.Except(newLeafsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList(); if (lost.Count > 0) { - newNodes.AddRange(lost.Select(w => new XYLeef(w))); + newNodes.AddRange(lost.Select(w => new XYLeaf(w))); } return new XYNode(newNodes); } - private static XYNode HorizontalCut(XYLeef leef, decimal minimumWidth, + private static XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc, int level = 0) { - if (leef.CountWords() <= 1) + if (leaf.CountWords() <= 1) { // we stop cutting if // - only one word remains - return leef; + return leaf; } - var words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top + var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top // determine dominantFontWidth and dominantFontHeight decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) @@ -199,7 +199,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { if (level >= 1) { - return leef; + return leaf; } else { @@ -207,16 +207,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } } - var newLeefsEnums = projectionProfile.Select(p => - leef.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1])); - var newLeefs = newLeefsEnums.Where(e => e.Count() > 0).Select(e => new XYLeef(e)); - var newNodes = newLeefs.Select(l => VerticalCut(l, minimumWidth, + var newLeafsEnums = projectionProfile.Select(p => + leaf.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1])); + var newLeafs = newLeafsEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e)); + var newNodes = newLeafs.Select(l => VerticalCut(l, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList(); - var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList(); + var lost = leaf.Words.Except(newLeafsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList(); if (lost.Count > 0) { - newNodes.AddRange(lost.Select(w => new XYLeef(w))); + newNodes.AddRange(lost.Select(w => new XYLeaf(w))); } return new XYNode(newNodes); } @@ -228,9 +228,9 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis public class XYNode { /// - /// Returns true if this node is a leef, false otherwise. + /// Returns true if this node is a leaf, false otherwise. /// - public virtual bool IsLeef => false; + public virtual bool IsLeaf => false; /// /// The rectangle completely containing the node. @@ -255,16 +255,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } /// - /// Recursively gets the leefs (last nodes) of this node. + /// Recursively gets the leafs (last nodes) of this node. /// /// - public virtual List GetLeefs() + public virtual List GetLeafs() { - List leefs = new List(); - if (Children == null || Children.Count() == 0) return leefs; + List leafs = new List(); + if (Children == null || Children.Count() == 0) return leafs; int level = 0; - RecursiveGetLeefs(Children, ref leefs, level); - return leefs; + RecursiveGetLeafs(Children, ref leafs, level); + return leafs; } /// @@ -297,86 +297,83 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis private void RecursiveCount(IEnumerable children, ref int count) { if (children.Count() == 0) return; - foreach (XYNode node in children.Where(x => x.IsLeef)) + foreach (XYNode node in children.Where(x => x.IsLeaf)) { count += node.CountWords(); } - foreach (XYNode node in children.Where(x => !x.IsLeef)) + foreach (XYNode node in children.Where(x => !x.IsLeaf)) { RecursiveCount(node.Children, ref count); } } - private void RecursiveGetLeefs(IEnumerable children, ref List leefs, int level) + private void RecursiveGetLeafs(IEnumerable children, ref List leafs, int level) { if (children.Count() == 0) return; bool isVerticalCut = level % 2 == 0; - foreach (XYLeef node in children.Where(x => x.IsLeef)) + foreach (XYLeaf node in children.Where(x => x.IsLeaf)) { - leefs.Add(node); + leafs.Add(node); } level++; - IEnumerable notLeefs = children.Where(x => !x.IsLeef); + IEnumerable notLeafs = children.Where(x => !x.IsLeaf); if (isVerticalCut) { - notLeefs = notLeefs.OrderBy(x => x.BoundingBox.Left).ToList(); + notLeafs = notLeafs.OrderBy(x => x.BoundingBox.Left).ToList(); } else { - notLeefs = notLeefs.OrderByDescending(x => x.BoundingBox.Top).ToList(); + notLeafs = notLeafs.OrderByDescending(x => x.BoundingBox.Top).ToList(); } - foreach (XYNode node in notLeefs) + foreach (XYNode node in notLeafs) { - RecursiveGetLeefs(node.Children, ref leefs, level); + RecursiveGetLeafs(node.Children, ref leafs, level); } } public override string ToString() { - return (IsLeef ? "Leef" : "Node"); + return (IsLeaf ? "Leaf" : "Node"); } } /// - /// A Leef node used in the algorithm, i.e. a block. + /// A Leaf node used in the algorithm, i.e. a block. /// - public class XYLeef : XYNode + public class XYLeaf : XYNode { /// - /// Returns true if this node is a leef, false otherwise. + /// Returns true if this node is a leaf, false otherwise. /// - public override bool IsLeef => true; + public override bool IsLeaf => true; /// - /// The words in the leef. + /// The words in the leaf. /// public Word[] Words { get; set; } /// - /// The number of words in the leef. + /// The number of words in the leaf. /// - /// public override int CountWords() => Words == null ? 0 : Words.Length; /// - /// Returns null as a leef doesn't have leefs. + /// Returns null as a leaf doesn't have leafs. /// - /// - public override List GetLeefs() + public override List GetLeafs() { return null; } /// - /// Gets the lines of the leef. + /// Gets the lines of the leaf. /// - /// public TextLine[] GetLines() { var groupedWords = Words.GroupBy(x => x.BoundingBox.Bottom).ToDictionary(x => x.Key, x => x.ToList()); @@ -384,19 +381,19 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis } /// - /// Create a new . + /// Create a new . /// - /// The words contained in the leef. - public XYLeef(params Word[] words) : this(words == null ? null : words.ToList()) + /// The words contained in the leaf. + public XYLeaf(params Word[] words) : this(words == null ? null : words.ToList()) { } /// - /// Create a new . + /// Create a new . /// - /// The words contained in the leef. - public XYLeef(IEnumerable words) : base(null) + /// The words contained in the leaf. + public XYLeaf(IEnumerable words) : base(null) { decimal left = words.Min(b => b.BoundingBox.Left); decimal right = words.Max(b => b.BoundingBox.Right); From 080354dc54ab70c69bab63df55e8c18168ca50da Mon Sep 17 00:00:00 2001 From: BobLd Date: Tue, 18 Jun 2019 21:32:14 +0100 Subject: [PATCH 5/6] Corrected PublicApiScannerTests --- src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index cc100bc8..0e38466d 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -53,7 +53,7 @@ "UglyToad.PdfPig.CrossReference.TrailerDictionary", "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances", "UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions", - "UglyToad.PdfPig.DocumentLayoutAnalysis.NNWordExtractor", + "UglyToad.PdfPig.DocumentLayoutAnalysis.NearestNeighbourWordExtractor", "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode", "UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf", From 00233fa5d023f8b5c7af227ea33737e5c706dab8 Mon Sep 17 00:00:00 2001 From: BobLd Date: Thu, 20 Jun 2019 22:10:05 +0100 Subject: [PATCH 6/6] Update with corrections - 2 --- .../NearestNeighbourWordExtractor .cs | 20 +- .../DocumentLayoutAnalysis/RecursiveXYCut.cs | 199 +----------------- .../DocumentLayoutAnalysis/XYLeaf.cs | 76 +++++++ .../DocumentLayoutAnalysis/XYNode.cs | 130 ++++++++++++ 4 files changed, 235 insertions(+), 190 deletions(-) create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs create mode 100644 src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs index 97ef624c..34455cda 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NearestNeighbourWordExtractor .cs @@ -75,12 +75,27 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis Func metric, Func distMeasure) { if (pageLetters == null || pageLetters.Count() == 0) return new List(); + TextDirection textDirection = pageLetters.ElementAt(0).TextDirection; - if (pageLetters.Any(x => pageLetters.ElementAt(0).TextDirection != x.TextDirection)) + if (pageLetters.Any(x => textDirection != x.TextDirection)) { throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction."); } + Func, IReadOnlyList> orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList(); + if (textDirection == TextDirection.Rotate180) + { + orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Right).ToList(); + } + else if (textDirection == TextDirection.Rotate90) + { + orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Top).ToList(); + } + else if (textDirection == TextDirection.Rotate270) + { + orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Bottom).ToList(); + } + Letter[] letters = pageLetters.ToArray(); int lettersCount = letters.Length; List startBaseLines = letters.Select(x => x.StartBaseLine).ToList(); @@ -183,7 +198,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { groupedLetters.Add(letters[s]); } - words.Add(new Word(groupedLetters)); + + words.Add(new Word(orderFunc(groupedLetters))); } List indexesNotDone = Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)).ToList(); diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs index 71d8aeff..a961ff17 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs @@ -2,7 +2,6 @@ using System.Collections.Generic; using System.Linq; using UglyToad.PdfPig.Content; -using UglyToad.PdfPig.Geometry; namespace UglyToad.PdfPig.DocumentLayoutAnalysis { @@ -14,6 +13,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// public class RecursiveXYCut { + /// + /// Get the blocks. + /// + /// The words in the page. + /// The minimum width for a block. + public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth = 0) + { + return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5m, 3)); + } + /// /// Get the blocks. /// @@ -21,7 +30,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The minimum width for a block. /// The dominant font width. /// The dominant font height. - /// public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth, decimal dominantFontWidth, decimal dominantFontHeight) { @@ -35,12 +43,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// The minimum width for a block. /// The function that determines the dominant font width. /// The function that determines the dominant font height. - /// public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidth, Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc) { - var root = new XYLeaf(pageWords); + var root = new XYLeaf(pageWords); // Create a root node. return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc); } @@ -221,188 +228,4 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return new XYNode(newNodes); } } - - /// - /// A Node used in the algorithm. - /// - public class XYNode - { - /// - /// Returns true if this node is a leaf, false otherwise. - /// - public virtual bool IsLeaf => false; - - /// - /// The rectangle completely containing the node. - /// - public PdfRectangle BoundingBox { get; set; } - - /// - /// The children of the node. - /// - public XYNode[] Children { get; set; } - - /// - /// Recursively counts the words included in this node. - /// - /// - public virtual int CountWords() - { - if (Children == null) return 0; - int count = 0; - RecursiveCount(Children, ref count); - return count; - } - - /// - /// Recursively gets the leafs (last nodes) of this node. - /// - /// - public virtual List GetLeafs() - { - List leafs = new List(); - if (Children == null || Children.Count() == 0) return leafs; - int level = 0; - RecursiveGetLeafs(Children, ref leafs, level); - return leafs; - } - - /// - /// Create a new . - /// - /// - public XYNode(params XYNode[] children) - : this(children?.ToList()) - { - - } - - /// - /// Create a new . - /// - /// - public XYNode(IEnumerable children) - { - if (children != null && children.Count() != 0) - { - Children = children.ToArray(); - decimal left = children.Min(b => b.BoundingBox.Left); - decimal right = children.Max(b => b.BoundingBox.Right); - decimal bottom = children.Min(b => b.BoundingBox.Bottom); - decimal top = children.Max(b => b.BoundingBox.Top); - BoundingBox = new PdfRectangle(left, bottom, right, top); - } - } - - private void RecursiveCount(IEnumerable children, ref int count) - { - if (children.Count() == 0) return; - foreach (XYNode node in children.Where(x => x.IsLeaf)) - { - count += node.CountWords(); - } - - foreach (XYNode node in children.Where(x => !x.IsLeaf)) - { - RecursiveCount(node.Children, ref count); - } - } - - private void RecursiveGetLeafs(IEnumerable children, ref List leafs, int level) - { - if (children.Count() == 0) return; - bool isVerticalCut = level % 2 == 0; - - foreach (XYLeaf node in children.Where(x => x.IsLeaf)) - { - leafs.Add(node); - } - - level++; - - IEnumerable notLeafs = children.Where(x => !x.IsLeaf); - - if (isVerticalCut) - { - notLeafs = notLeafs.OrderBy(x => x.BoundingBox.Left).ToList(); - } - else - { - notLeafs = notLeafs.OrderByDescending(x => x.BoundingBox.Top).ToList(); - } - - foreach (XYNode node in notLeafs) - { - RecursiveGetLeafs(node.Children, ref leafs, level); - } - } - - public override string ToString() - { - return (IsLeaf ? "Leaf" : "Node"); - } - } - - /// - /// A Leaf node used in the algorithm, i.e. a block. - /// - public class XYLeaf : XYNode - { - /// - /// Returns true if this node is a leaf, false otherwise. - /// - public override bool IsLeaf => true; - - /// - /// The words in the leaf. - /// - public Word[] Words { get; set; } - - /// - /// The number of words in the leaf. - /// - public override int CountWords() => Words == null ? 0 : Words.Length; - - /// - /// Returns null as a leaf doesn't have leafs. - /// - public override List GetLeafs() - { - return null; - } - - /// - /// Gets the lines of the leaf. - /// - public TextLine[] GetLines() - { - var groupedWords = Words.GroupBy(x => x.BoundingBox.Bottom).ToDictionary(x => x.Key, x => x.ToList()); - return groupedWords.OrderByDescending(x => x.Key).Select(ws => new TextLine(ws.Value)).ToArray(); - } - - /// - /// Create a new . - /// - /// The words contained in the leaf. - public XYLeaf(params Word[] words) : this(words == null ? null : words.ToList()) - { - - } - - /// - /// Create a new . - /// - /// The words contained in the leaf. - public XYLeaf(IEnumerable words) : base(null) - { - decimal left = words.Min(b => b.BoundingBox.Left); - decimal right = words.Max(b => b.BoundingBox.Right); - - decimal bottom = words.Min(b => b.BoundingBox.Bottom); - decimal top = words.Max(b => b.BoundingBox.Top); - - BoundingBox = new PdfRectangle(left, bottom, right, top); - Words = words.ToArray(); - } - } } diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs new file mode 100644 index 00000000..9dab8daf --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYLeaf.cs @@ -0,0 +1,76 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// A Leaf node used in the algorithm, i.e. a block. + /// + public class XYLeaf : XYNode + { + /// + /// Returns true if this node is a leaf, false otherwise. + /// + public override bool IsLeaf => true; + + /// + /// The words in the leaf. + /// + public IReadOnlyList Words { get; } + + /// + /// The number of words in the leaf. + /// + public override int CountWords() => Words == null ? 0 : Words.Count; + + /// + /// Returns null as a leaf doesn't have leafs. + /// + public override List GetLeafs() + { + return null; + } + + /// + /// Gets the lines of the leaf. + /// + public IReadOnlyList GetLines() + { + return Words.GroupBy(x => x.BoundingBox.Bottom).OrderByDescending(x => x.Key) + .Select(x => new TextLine(x.ToList())).ToArray(); + } + + /// + /// Create a new . + /// + /// The words contained in the leaf. + public XYLeaf(params Word[] words) : this(words == null ? null : words.ToList()) + { + + } + + /// + /// Create a new . + /// + /// The words contained in the leaf. + public XYLeaf(IEnumerable words) : base(null) + { + if (words == null) + { + throw new ArgumentException("XYLeaf(): The words contained in the leaf cannot be null.", "words"); + } + + decimal left = words.Min(b => b.BoundingBox.Left); + decimal right = words.Max(b => b.BoundingBox.Right); + + decimal bottom = words.Min(b => b.BoundingBox.Bottom); + decimal top = words.Max(b => b.BoundingBox.Top); + + BoundingBox = new PdfRectangle(left, bottom, right, top); + Words = words.ToArray(); + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs new file mode 100644 index 00000000..70620807 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/XYNode.cs @@ -0,0 +1,130 @@ +using System.Collections.Generic; +using System.Linq; +using UglyToad.PdfPig.Geometry; +using UglyToad.PdfPig.Util; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// A Node used in the algorithm. + /// + public class XYNode + { + /// + /// Returns true if this node is a leaf, false otherwise. + /// + public virtual bool IsLeaf => false; + + /// + /// The rectangle completely containing the node. + /// + public PdfRectangle BoundingBox { get; set; } + + /// + /// The children of the node. + /// + public XYNode[] Children { get; set; } + + /// + /// Recursively counts the words included in this node. + /// + public virtual int CountWords() + { + if (Children == null) return 0; + int count = 0; + RecursiveCount(Children, ref count); + return count; + } + + /// + /// Recursively gets the leafs (last nodes) of this node. + /// + public virtual List GetLeafs() + { + List leafs = new List(); + if (Children == null || Children.Count() == 0) return leafs; + int level = 0; + RecursiveGetLeafs(Children, ref leafs, level); + return leafs; + } + + /// + /// Create a new . + /// + /// The node's children. + public XYNode(params XYNode[] children) + : this(children?.ToList()) + { + + } + + /// + /// Create a new . + /// + /// The node's children. + public XYNode(IEnumerable children) + { + if (children != null && children.Count() != 0) + { + Children = children.ToArray(); + decimal left = children.Min(b => b.BoundingBox.Left); + decimal right = children.Max(b => b.BoundingBox.Right); + decimal bottom = children.Min(b => b.BoundingBox.Bottom); + decimal top = children.Max(b => b.BoundingBox.Top); + BoundingBox = new PdfRectangle(left, bottom, right, top); + } + else + { + Children = EmptyArray.Instance; + } + } + + private void RecursiveCount(IEnumerable children, ref int count) + { + if (children.Count() == 0) return; + foreach (XYNode node in children.Where(x => x.IsLeaf)) + { + count += node.CountWords(); + } + + foreach (XYNode node in children.Where(x => !x.IsLeaf)) + { + RecursiveCount(node.Children, ref count); + } + } + + private void RecursiveGetLeafs(IEnumerable children, ref List leafs, int level) + { + if (children.Count() == 0) return; + bool isVerticalCut = level % 2 == 0; + + foreach (XYLeaf node in children.Where(x => x.IsLeaf)) + { + leafs.Add(node); + } + + level++; + + IEnumerable notLeafs = children.Where(x => !x.IsLeaf); + + if (isVerticalCut) + { + notLeafs = notLeafs.OrderBy(x => x.BoundingBox.Left).ToList(); + } + else + { + notLeafs = notLeafs.OrderByDescending(x => x.BoundingBox.Top).ToList(); + } + + foreach (XYNode node in notLeafs) + { + RecursiveGetLeafs(node.Children, ref leafs, level); + } + } + + public override string ToString() + { + return (IsLeaf ? "Leaf" : "Node"); + } + } +}