diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 662780b3..02e38938 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -51,6 +51,10 @@ "UglyToad.PdfPig.CrossReference.CrossReferenceTable", "UglyToad.PdfPig.CrossReference.CrossReferenceType", "UglyToad.PdfPig.CrossReference.TrailerDictionary", + "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances", + "UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions", + "UglyToad.PdfPig.DocumentLayoutAnalysis.NNWordExtractor", + "UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Fonts.DescriptorFontFile", diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs new file mode 100644 index 00000000..d5ad4ea4 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs @@ -0,0 +1,86 @@ +using System; +using System.Linq; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// <summary> + /// Contains helpful tools for distance measures. + /// </summary> + public static class Distances + { + /// <summary> + /// The Euclidean distance is the "ordinary" straight-line distance between two points. + /// </summary> + /// <param name="point1">The first point.</param> + /// <param name="point2">The second point.</param> + /// <returns>The square root of the sum of the squared X and Y differences.</returns> + public static double Euclidean(PdfPoint point1, PdfPoint point2) + { + double dx = (double)(point1.X - point2.X); + double dy = (double)(point1.Y - point2.Y); + return Math.Sqrt(dx * dx + dy * dy); + } + + /// <summary> + /// The weighted Euclidean distance. + /// </summary> + /// <param name="point1">The first point.</param> + /// <param name="point2">The second point.</param> + /// <param name="wX">The weight of the X coordinates. Default is 1.</param> + /// <param name="wY">The weight of the Y coordinates. Default is 1.</param>
+ /// <returns>The square root of wX * dx * dx + wY * dy * dy, where dx and dy are the X and Y differences.</returns> + public static double WghtdEuclidean(PdfPoint point1, PdfPoint point2, double wX = 1.0, double wY = 1.0) + { + double dx = (double)(point1.X - point2.X); + double dy = (double)(point1.Y - point2.Y); + return Math.Sqrt(wX * dx * dx + wY * dy * dy); + } + + /// <summary> + /// The Manhattan distance between two points is the sum of the absolute differences of their Cartesian coordinates. + /// Also known as rectilinear distance, L1 distance, L1 norm, snake distance, city block distance, taxicab metric. + /// </summary> + /// <param name="point1">The first point.</param> + /// <param name="point2">The second point.</param> + /// <returns>The sum of the absolute X and Y differences, cast to double.</returns> + public static double Manhattan(PdfPoint point1, PdfPoint point2) + { + return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y)); + } + + /// <summary> + /// Find the nearest point. + /// </summary> + /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param> + /// <param name="points">The list of neighbours candidates.</param> + /// <param name="measure">The distance measure to use; it is called as measure(candidate, pdfPoint).</param> + /// <param name="dist">The distance between reference point, and its nearest neighbour.</param> + /// <returns>The first candidate whose measured distance equals the minimum. NOTE(review): measure is evaluated a second time for the lookup and matched with ==, and an empty array presumably makes Min throw - confirm with callers.</returns> + public static PdfPoint FindNearest(this PdfPoint pdfPoint, PdfPoint[] points, + Func measure, out double dist) + { + double d = points.Min(k => measure(k, pdfPoint)); + PdfPoint point = points.First(x => measure(x, pdfPoint) == d); + dist = d; + return point; + } + + /// <summary> + /// Find the index of the nearest point. + /// </summary> + /// <param name="pdfPoint">The reference point, for which to find the nearest neighbour.</param> + /// <param name="points">The list of neighbours candidates.</param> + /// <param name="measure">The distance measure to use; it is called as measure(candidate, pdfPoint).</param>
+ /// <param name="dist">The distance between reference point, and its nearest neighbour.</param> + /// <returns>The index in points of the first candidate whose measured distance equals the minimum.</returns> + public static int FindIndexNearest(this PdfPoint pdfPoint, PdfPoint[] points, + Func measure, out double dist) + { + double d = points.Min(k => measure(k, pdfPoint)); + int index = Array.FindIndex(points, x => measure(x, pdfPoint) == d); + dist = d; + return index; + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs new file mode 100644 index 00000000..a32fc576 --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/MathExtensions.cs @@ -0,0 +1,32 @@ +using System.Collections.Generic; +using System.Linq; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// <summary> + /// Useful math extensions. + /// </summary> + public static class MathExtensions + { + /// <summary> + /// Computes the mode of a sequence of float values. + /// </summary> + /// <param name="array">The values to examine; may be null or empty.</param> + /// <returns>The key of the largest group of equal values, or float.NaN when the sequence is null or empty.</returns> + public static float Mode(this IEnumerable array) + { + if (array == null || array.Count() == 0) return float.NaN; + return array.GroupBy(v => v).OrderByDescending(g => g.Count()).First().Key; + } + + /// <summary> + /// Computes the mode of a sequence of decimal values. + /// </summary> + /// <param name="array">The values to examine. NOTE(review): unlike the float overload there is no null/empty guard here - confirm callers never pass an empty sequence.</param> + /// <returns>The key of the largest group of equal values.</returns> + public static decimal Mode(this IEnumerable array) + { + return array.GroupBy(v => v).OrderByDescending(g => g.Count()).First().Key; + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs new file mode 100644 index 00000000..5da1efbf --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/NNWordExtractor.cs @@ -0,0 +1,200 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; +using UglyToad.PdfPig.Util; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// Nearest Neighbour Word Extractor, using the Manhattan distance. + /// This implementation leverages bounding boxes.
+ /// + public class NNWordExtractor : IWordExtractor + { + /// <summary> + /// Gets a shared instance of the Nearest Neighbour Word Extractor, <see cref="NNWordExtractor"/>. + /// </summary> + public static IWordExtractor Instance { get; } = new NNWordExtractor(); + + /// <summary> + /// Gets the words. Letters are split by text direction (Horizontal, Rotate180, Rotate90, Rotate270, Unknown), grouped into words per direction, sorted within each direction and concatenated in that order. + /// </summary> + /// <param name="letters">The letters to group into words.</param> + /// <returns>The words of every text direction, one direction after the other.</returns> + public IEnumerable GetWords(IReadOnlyList letters) + { + List wordsH = GetWords( + letters.Where(l => l.TextDirection == TextDirection.Horizontal), + l => l.GlyphRectangle.Width, Distances.Manhattan) + .OrderByDescending(x => x.BoundingBox.Bottom) + .ThenBy(x => x.BoundingBox.Left).ToList(); + + List words180 = GetWords( + letters.Where(l => l.TextDirection == TextDirection.Rotate180), + l => l.GlyphRectangle.Width, Distances.Manhattan) + .OrderBy(x => x.BoundingBox.Top) + .ThenByDescending(x => x.BoundingBox.Right).ToList(); + wordsH.AddRange(words180); + + List words90 = GetWords( + letters.Where(l => l.TextDirection == TextDirection.Rotate90), + l => l.GlyphRectangle.Height, Distances.Manhattan) + .OrderByDescending(x => x.BoundingBox.Left) + .ThenBy(x => x.BoundingBox.Top).ToList(); + wordsH.AddRange(words90); + + List words270 = GetWords( + letters.Where(l => l.TextDirection == TextDirection.Rotate270), + l => l.GlyphRectangle.Height, Distances.Manhattan) + .OrderBy(x => x.BoundingBox.Right) + .ThenByDescending(x => x.BoundingBox.Bottom).ToList(); + wordsH.AddRange(words270); + + List wordsU = GetWords( + letters.Where(l => l.TextDirection == TextDirection.Unknown), + l => l.GlyphRectangle.Width, Distances.Manhattan) + .OrderByDescending(x => x.BoundingBox.Bottom) + .ThenBy(x => x.BoundingBox.Left).ToList(); + wordsH.AddRange(wordsU); + + return wordsH; + } + + /// <summary> + /// Groups letters into words by linking each letter's end base line point to the nearest start base line point. + /// </summary> + /// <param name="pageLetters">The letters in the page, they must have + /// the same text directions (an ArgumentException is thrown otherwise).</param> + /// <param name="metric">The letter's metric to use in the minimum distance + /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param> + /// <param name="distMeasure">The distance measure between two start and end base line points, + /// e.g. the Manhattan distance.</param>
+ /// <returns>One word per group of linked letters, followed by a single-letter word for every letter left ungrouped; an empty list when pageLetters is null or empty.</returns> + private static List GetWords(IEnumerable pageLetters, + Func metric, Func distMeasure) + { + if (pageLetters == null || pageLetters.Count() == 0) return new List(); + + if (pageLetters.Any(x => pageLetters.ElementAt(0).TextDirection != x.TextDirection)) + { + throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction."); + } + + Letter[] letters = pageLetters.ToArray(); + int lettersCount = letters.Length; + PdfPoint[] startBaseLines = letters.Select(x => x.StartBaseLine).ToArray(); + int[] indexes = Enumerable.Repeat((int)-1, lettersCount).ToArray(); + + // Find nearest neighbours indexes: indexes[c] stays -1 unless letter c is linked to the letter whose start base line is nearest to c's end base line (each iteration only writes indexes[c]) + Parallel.For(0, lettersCount, c => + { + var currentLetter = letters[c]; + // only check neighbours if not a white space; a link also needs a non-whitespace neighbour with the same font name (ignoring case) that is closer than 60% of the larger of the two metrics + if (!string.IsNullOrWhiteSpace(currentLetter.Value)) + { + int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist); + var pairedLetter = letters[index]; + + if (!string.IsNullOrWhiteSpace(pairedLetter.Value) && + string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase)) + { + decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * 0.60m; + if ((decimal)dist < minDist) + { + indexes[c] = index; + } + } + } + }); + + // Group indexes: fold every accepted (c, i) link into lists of letter indexes; indexDone tracks the indexes already placed in some list + List> groupedIndexes = new List>(); + List indexDone = new List(); + for (int c = 0; c < lettersCount; c++) + { + int i = indexes[c]; + if (i == -1) continue; + + bool isDoneC = indexDone.Contains(c); + bool isDoneI = indexDone.Contains(i); + if (isDoneC || isDoneI) + { + if (isDoneC && !isDoneI) + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(c))) + { + pair.Add(i); + } + indexDone.Add(i); + } + else if (!isDoneC && isDoneI) + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(i))) + { + pair.Add(c); + } + indexDone.Add(c); + } + else + { + foreach (var pair in groupedIndexes.Where(x => x.Contains(i))) + { + if (!pair.Contains(c)) pair.Add(c); + } + + 
foreach (var pair in groupedIndexes.Where(x => x.Contains(c))) + { + if (!pair.Contains(i)) pair.Add(i); + } + } + } + else + { + List pair = new List() { c, i }; + groupedIndexes.Add(pair); + indexDone.AddRange(pair); + } + } + + // Merge lists with common index: groups that share a letter index are unioned into one list + for (int c = 0; c < lettersCount; c++) + { + List> candidates = groupedIndexes.Where(x => x.Any(t => t == c)).ToList(); + if (candidates.Count < 2) continue; // at most one group holds this index: nothing to merge + + List merged = candidates.First(); + groupedIndexes.Remove(merged); + for (int i = 1; i < candidates.Count; i++) + { + var current = candidates[i]; + merged = merged.Union(current).ToList(); + groupedIndexes.Remove(current); + } + groupedIndexes.Add(merged); + } + + List words = new List(); + for (int a = 0; a < groupedIndexes.Count(); a++) + { + List groupedLetters = new List(); + foreach (int s in groupedIndexes[a]) + { + groupedLetters.Add(letters[s]); + } + words.Add(new Word(groupedLetters)); + } + + List indexesNotDone = Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)).ToList(); + for (int n = 0; n < indexesNotDone.Count(); n++) + { + Letter letter = letters[indexesNotDone[n]]; + words.Add(new Word(new Letter[] { letter })); + } + + return words; + } + } +} diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs new file mode 100644 index 00000000..7893e71e --- /dev/null +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs @@ -0,0 +1,411 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Geometry; + +namespace UglyToad.PdfPig.DocumentLayoutAnalysis +{ + /// + /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document + /// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
+ /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut + /// See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M. Haralick and Ihsin T. Phillips + /// + public class RecursiveXYCut + { + /// <summary> + /// Get the blocks, using fixed dominant font sizes. + /// </summary> + /// <param name="pageWords">The words in a page.</param> + /// <param name="minimumWidht">The minimum width for a block.</param> + /// <param name="dominantFontWidth">The dominant font width.</param> + /// <param name="dominantFontHeight">The dominant font height.</param> + /// <returns>The node produced by the first vertical cut over all the words.</returns> + public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidht, + decimal dominantFontWidth, decimal dominantFontHeight) + { + return GetBlocks(pageWords, minimumWidht, k => dominantFontWidth, k => dominantFontHeight); + } + + /// <summary> + /// Get the blocks, computing the dominant font sizes from the glyph sizes at every cut. + /// </summary> + /// <param name="pageWords">The words in a page.</param> + /// <param name="minimumWidht">The minimum width for a block.</param> + /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param> + /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param> + /// <returns>The node produced by the first vertical cut over all the words.</returns> + public static XYNode GetBlocks(IEnumerable pageWords, decimal minimumWidht, + Func, decimal> dominantFontWidthFunc, + Func, decimal> dominantFontHeightFunc) + { + var root = new XYLeef(pageWords); + return VerticalCut(root, minimumWidht, dominantFontWidthFunc, dominantFontHeightFunc); + } + + private static XYNode VerticalCut(XYLeef leef, decimal minimumWidht, + Func, decimal> dominantFontWidthFunc, + Func, decimal> dominantFontHeightFunc, int level = 0) + { + if (leef.CountWords() <= 1 || leef.BoundingBox.Width <= minimumWidht) + { + // we stop cutting if + // - only one word remains + // - width is too small + return leef; + } + + // order words left to right, skipping whitespace-only words. NOTE(review): if every word is whitespace this array is empty and words[0] below would presumably throw - confirm + var words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray(); + + // determine dominantFontWidth and dominantFontHeight from the absolute glyph sizes (domFontHeight is not read again in this method) + decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) + .Select(x => Math.Abs(x.GlyphRectangle.Width))); + decimal domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters) + .Select(x => Math.Abs(x.GlyphRectangle.Height))); + + List projectionProfile = new 
List(); + decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Left, words[0].BoundingBox.Right }; + int wordsCount = words.Count(); + for (int i = 1; i < wordsCount; i++) + { + if ((words[i].BoundingBox.Left >= currentProj[0] && words[i].BoundingBox.Left <= currentProj[1]) + || (words[i].BoundingBox.Right >= currentProj[0] && words[i].BoundingBox.Right <= currentProj[1])) + { + // it is overlapping the current [left, right] interval + if (words[i].BoundingBox.Left >= currentProj[0] + && words[i].BoundingBox.Left <= currentProj[1] + && words[i].BoundingBox.Right > currentProj[1]) + { + // |____| + // |____| + // |_______| <- updated + currentProj[1] = words[i].BoundingBox.Right; + } + + // we ignore the following cases: + // |____| + // |____| (not possible because of OrderBy) + // + // |____| + //|___________| (not possible because of OrderBy) + // + // |____| + // |_| + } + else + { + // no overlap + if (words[i].BoundingBox.Left - currentProj[1] <= domFontWidth) + { + // if gap too small -> don't cut + // |____| |____| + currentProj[1] = words[i].BoundingBox.Right; + } + else if (currentProj[1] - currentProj[0] < minimumWidht) + { + // current interval is still narrower than minimumWidht: keep extending it + currentProj[1] = words[i].BoundingBox.Right; + } + else + { + // if gap big enough -> cut! 
+ // |____| | |____| + if (i != wordsCount - 1) // will always add the last one after; when i is the last word no new interval is started and that word is picked up by the 'lost' pass below + { + projectionProfile.Add(currentProj); + currentProj = new decimal[2] { words[i].BoundingBox.Left, words[i].BoundingBox.Right }; + } + } + } + if (i == wordsCount - 1) projectionProfile.Add(currentProj); + } + + var newLeefsEnums = projectionProfile.Select(p => leef.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1])); + var newLeefs = newLeefsEnums.Where(e => e.Count() > 0).Select(e => new XYLeef(e)); + + var newNodes = newLeefs.Select(l => HorizontalCut(l, minimumWidht, + dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList(); + + var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList(); + if (lost.Count > 0) + { + newNodes.AddRange(lost.Select(w => new XYLeef(w))); + } + + return new XYNode(newNodes); + } + + private static XYNode HorizontalCut(XYLeef leef, decimal minimumWidht, + Func, decimal> dominantFontWidthFunc, + Func, decimal> dominantFontHeightFunc, int level = 0) + { + if (leef.CountWords() <= 1) + { + // we stop cutting if + // - only one word remains + return leef; + } + + var words = leef.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top, skipping whitespace-only words + + // determine dominantFontWidth and dominantFontHeight from the absolute glyph sizes (domFontWidth is not read again in this method) + decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) + .Select(x => Math.Abs(x.GlyphRectangle.Width))); + decimal domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters) + .Select(x => Math.Abs(x.GlyphRectangle.Height))); + + List projectionProfile = new List(); + decimal[] currentProj = new decimal[2] { words[0].BoundingBox.Bottom, words[0].BoundingBox.Top }; + int wordsCount = words.Count(); + for (int i = 1; i < wordsCount; i++) + { + if ((words[i].BoundingBox.Bottom >= currentProj[0] && words[i].BoundingBox.Bottom <= currentProj[1]) + || 
(words[i].BoundingBox.Top >= currentProj[0] && words[i].BoundingBox.Top <= currentProj[1])) + { + // it is overlapping the current [bottom, top] interval + if (words[i].BoundingBox.Bottom >= currentProj[0] + && words[i].BoundingBox.Bottom <= currentProj[1] + && words[i].BoundingBox.Top > currentProj[1]) + { + currentProj[1] = words[i].BoundingBox.Top; + } + } + else + { + // no overlap + if (words[i].BoundingBox.Bottom - currentProj[1] <= domFontHeight) + { + // if gap too small -> don't cut + // |____| |____| + currentProj[1] = words[i].BoundingBox.Top; + } + else + { + // if gap big enough -> cut! + // |____| | |____| + if (i != wordsCount - 1) // will always add the last one after; when i is the last word no new interval is started and that word is picked up by the 'lost' pass below + { + projectionProfile.Add(currentProj); + currentProj = new decimal[2] { words[i].BoundingBox.Bottom, words[i].BoundingBox.Top }; + } + } + } + if (i == wordsCount - 1) projectionProfile.Add(currentProj); + } + + if (projectionProfile.Count == 1) // a single interval: no horizontal cut was made; presumably this guards against endless Vertical/Horizontal recursion - confirm + { + if (level >= 1) + { + return leef; + } + else + { + level++; + } + } + + var newLeefsEnums = projectionProfile.Select(p => + leef.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1])); + var newLeefs = newLeefsEnums.Where(e => e.Count() > 0).Select(e => new XYLeef(e)); + var newNodes = newLeefs.Select(l => VerticalCut(l, minimumWidht, + dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList(); + + var lost = leef.Words.Except(newLeefsEnums.SelectMany(x => x)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList(); + if (lost.Count > 0) + { + newNodes.AddRange(lost.Select(w => new XYLeef(w))); + } + return new XYNode(newNodes); + } + } + + /// <summary> + /// A Node used in the algorithm. + /// </summary> + public class XYNode + { + /// <summary> + /// Returns true if this node is a leef, false otherwise. + /// </summary> + public virtual bool IsLeef => false; + + /// <summary> + /// The rectangle completely containing the node. + /// </summary> + public PdfRectangle BoundingBox { get; set; } + + /// <summary> + /// The children of the node.
+ /// </summary> + public XYNode[] Children { get; set; } + + /// <summary> + /// Recursively counts the words included in this node. + /// </summary> + /// <returns>The sum of the word counts of all leefs below this node, or 0 when Children is null.</returns> + public virtual int CountWords() + { + if (Children == null) return 0; + int count = 0; + RecursiveCount(Children, ref count); + return count; + } + + /// <summary> + /// Recursively gets the leefs (last nodes) of this node. + /// </summary> + /// <returns>The leefs found below this node; an empty list when Children is null or empty.</returns> + public virtual List GetLeefs() + { + List leefs = new List(); + if (Children == null || Children.Count() == 0) return leefs; + int level = 0; + RecursiveGetLeefs(Children, ref leefs, level); + return leefs; + } + + /// <summary> + /// Create a new <see cref="XYNode"/>. + /// </summary> + /// <param name="children">The child nodes; may be null.</param> + public XYNode(params XYNode[] children) + : this(children?.ToList()) + { + + } + + /// <summary> + /// Create a new <see cref="XYNode"/> whose bounding box spans the bounding boxes of its children. + /// </summary> + /// <param name="children">The child nodes. When null or empty, Children and BoundingBox are left unset. NOTE(review): RecursiveCount and RecursiveGetLeefs call Count() on node.Children without a null check - confirm such a node never ends up as a child.</param> + public XYNode(IEnumerable children) + { + if (children != null && children.Count() != 0) + { + Children = children.ToArray(); + decimal left = children.Min(b => b.BoundingBox.Left); + decimal right = children.Max(b => b.BoundingBox.Right); + decimal bottom = children.Min(b => b.BoundingBox.Bottom); + decimal top = children.Max(b => b.BoundingBox.Top); + BoundingBox = new PdfRectangle(left, bottom, right, top); + } + } + + private void RecursiveCount(IEnumerable children, ref int count) + { + if (children.Count() == 0) return; + foreach (XYNode node in children.Where(x => x.IsLeef)) + { + count += node.CountWords(); + } + + foreach (XYNode node in children.Where(x => !x.IsLeef)) + { + RecursiveCount(node.Children, ref count); + } + } + + private void RecursiveGetLeefs(IEnumerable children, ref List leefs, int level) + { + if (children.Count() == 0) return; + bool isVerticalCut = level % 2 == 0; + + foreach (XYLeef node in children.Where(x => x.IsLeef)) + { + leefs.Add(node); + } + + level++; + + IEnumerable notLeefs = children.Where(x => !x.IsLeef); + + if (isVerticalCut) + { + notLeefs = notLeefs.OrderBy(x => x.BoundingBox.Left).ToList(); + } + else + { + notLeefs = notLeefs.OrderByDescending(x => x.BoundingBox.Top).ToList(); + } + + foreach 
(XYNode node in notLeefs) + { + RecursiveGetLeefs(node.Children, ref leefs, level); + } + } + + public override string ToString() + { + return (IsLeef ? "Leef" : "Node"); + } + } + + /// <summary> + /// A Leef node used in the algorithm, i.e. a block. + /// </summary> + public class XYLeef : XYNode + { + /// <summary> + /// Always true: this node is a leef. + /// </summary> + public override bool IsLeef => true; + + /// <summary> + /// The words in the leef. + /// </summary> + public Word[] Words { get; set; } + + /// <summary> + /// The number of words in the leef. + /// </summary> + /// <returns>The length of Words, or 0 when Words is null.</returns> + public override int CountWords() => Words == null ? 0 : Words.Length; + + /// <summary> + /// Returns null as a leef doesn't have leefs. + /// </summary> + /// <returns>Always null (not an empty list), so callers must null-check.</returns> + public override List GetLeefs() + { + return null; + } + + /// <summary> + /// Gets the lines of the leef: words with exactly the same BoundingBox.Bottom form one line. + /// </summary> + /// <returns>The lines, ordered from the highest Bottom value to the lowest.</returns> + public TextLine[] GetLines() + { + var groupedWords = Words.GroupBy(x => x.BoundingBox.Bottom).ToDictionary(x => x.Key, x => x.ToList()); + return groupedWords.OrderByDescending(x => x.Key).Select(ws => new TextLine(ws.Value)).ToArray(); + } + + /// <summary> + /// Create a new <see cref="XYLeef"/>. + /// </summary> + /// <param name="words">The words contained in the leef.</param> + public XYLeef(params Word[] words) : this(words == null ? null : words.ToList()) + { + + } + + /// <summary> + /// Create a new <see cref="XYLeef"/> whose bounding box spans the bounding boxes of its words. + /// </summary> + /// <param name="words">The words contained in the leef. NOTE(review): the sequence is enumerated several times, and a null or empty value would presumably make Min/Max throw - confirm callers.</param> + public XYLeef(IEnumerable words) : base(null) + { + decimal left = words.Min(b => b.BoundingBox.Left); + decimal right = words.Max(b => b.BoundingBox.Right); + + decimal bottom = words.Min(b => b.BoundingBox.Bottom); + decimal top = words.Max(b => b.BoundingBox.Top); + + BoundingBox = new PdfRectangle(left, bottom, right, top); + Words = words.ToArray(); + } + } +}