using System;
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Util;

namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
    /// <summary>
    /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
    /// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
    /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
    /// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha,
    /// Robert M. Haralick and Ihsin T. Phillips.</para>
    /// </summary>
    public class RecursiveXYCut : IPageSegmenter
    {
        /// <summary>
        /// Create an instance of Recursive X-Y Cut page segmenter, <see cref="RecursiveXYCut"/>.
        /// </summary>
        public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();

        /// <summary>
        /// Get the blocks.
        /// <para>Uses 'minimumWidth' = 0, 'dominantFontWidthFunc' = Mode(Width),
        /// 'dominantFontHeightFunc' = 1.5 x Mode(Height).</para>
        /// </summary>
        /// <param name="pageWords">The words in the page.</param>
        /// <returns>The text blocks found in the page.</returns>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
        {
            return GetBlocks(pageWords, 0);
        }

        /// <summary>
        /// Get the blocks.
        /// <para>Uses 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height).</para>
        /// </summary>
        /// <param name="pageWords">The words in the page.</param>
        /// <param name="minimumWidth">The minimum width for a block.</param>
        /// <returns>The text blocks found in the page.</returns>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth)
        {
            return GetBlocks(pageWords, minimumWidth,
                k => Math.Round(k.Mode(), 3),
                k => Math.Round(k.Mode() * 1.5, 3));
        }

        /// <summary>
        /// Get the blocks.
        /// </summary>
        /// <param name="pageWords">The words in the page.</param>
        /// <param name="minimumWidth">The minimum width for a block.</param>
        /// <param name="dominantFontWidth">The dominant font width.</param>
        /// <param name="dominantFontHeight">The dominant font height.</param>
        /// <returns>The text blocks found in the page.</returns>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth,
            double dominantFontWidth, double dominantFontHeight)
        {
            return GetBlocks(pageWords, minimumWidth,
                k => dominantFontWidth,
                k => dominantFontHeight);
        }

        /// <summary>
        /// Get the blocks.
        /// </summary>
        /// <param name="pageWords">The words in the page.</param>
        /// <param name="minimumWidth">The minimum width for a block.</param>
        /// <param name="dominantFontWidthFunc">The function that determines the dominant font width
        /// from the absolute glyph widths of the letters being considered.</param>
        /// <param name="dominantFontHeightFunc">The function that determines the dominant font height
        /// from the absolute glyph heights of the letters being considered.</param>
        /// <returns>The text blocks found in the page; empty when there are no words.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="pageWords"/> is null.</exception>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth,
            Func<IEnumerable<double>, double> dominantFontWidthFunc,
            Func<IEnumerable<double>, double> dominantFontHeightFunc)
        {
            if (pageWords == null)
            {
                throw new ArgumentNullException(nameof(pageWords));
            }

            // Materialise once: the sequence would otherwise be enumerated by both the
            // emptiness check and the root leaf construction.
            var allWords = pageWords.ToList();
            if (allWords.Count == 0)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            XYLeaf root = new XYLeaf(allWords); // Create a root node.
            XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);

            if (node.IsLeaf)
            {
                // No cut was made: the whole page is a single block.
                return new List<TextBlock> { new TextBlock(((XYLeaf)node).GetLines()) };
            }

            // One block per leaf; yields an empty list when the tree has no leaf
            // (e.g. the page only contained whitespace words).
            return node.GetLeafs().Select(l => new TextBlock(l.GetLines())).ToList();
        }

        /// <summary>
        /// Cut the leaf along the horizontal axis: words are projected on the X axis and the leaf
        /// is split wherever the gap between two consecutive projections exceeds the dominant font width.
        /// Each resulting part is then passed to <see cref="HorizontalCut"/>.
        /// </summary>
        /// <param name="leaf">The leaf to cut.</param>
        /// <param name="minimumWidth">The minimum width for a block.</param>
        /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
        /// <param name="dominantFontHeightFunc">The function that determines the dominant font height
        /// (only forwarded to the horizontal cut).</param>
        /// <param name="level">Recursion marker forwarded to the horizontal cut, which uses it to stop
        /// once a further cut no longer splits the leaf.</param>
        private XYNode VerticalCut(XYLeaf leaf, double minimumWidth,
            Func<IEnumerable<double>, double> dominantFontWidthFunc,
            Func<IEnumerable<double>, double> dominantFontHeightFunc, int level = 0)
        {
            // Order words left to right, ignoring whitespace-only words.
            var words = leaf.Words
                .Where(w => !string.IsNullOrWhiteSpace(w.Text))
                .OrderBy(w => w.BoundingBox.Left)
                .ToArray();

            if (words.Length == 0)
            {
                return new XYNode(null);
            }

            // Create new leaf with non-whitespace words.
            leaf = new XYLeaf(words);

            if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
            {
                // We stop cutting if
                //  - only one word remains
                //  - width is too small
                return leaf;
            }

            // Only the width drives a vertical cut, so only the width is computed here.
            double domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
                .Select(x => Math.Abs(x.GlyphRectangle.Width)));

            var projectionProfile = new List<double[]>();
            double[] currentProj = { words[0].BoundingBox.Left, words[0].BoundingBox.Right };
            int wordsCount = words.Length;

            for (int i = 1; i < wordsCount; i++)
            {
                var box = words[i].BoundingBox;
                bool leftInside = box.Left >= currentProj[0] && box.Left <= currentProj[1];
                bool rightInside = box.Right >= currentProj[0] && box.Right <= currentProj[1];

                if (leftInside || rightInside)
                {
                    // It is overlapping.
                    if (leftInside && box.Right > currentProj[1])
                    {
                        // |____|
                        //    |____|
                        // |_______|    <- updated
                        currentProj[1] = box.Right;
                    }

                    // The other overlapping cases are ignored: a word starting before the
                    // current projection is not possible because of the OrderBy, and a word
                    // fully contained in the projection does not change it.
                }
                else if (box.Left - currentProj[1] <= domFontWidth)
                {
                    // No overlap, but the gap is too small -> don't cut.
                    // |____| |____|
                    currentProj[1] = box.Right;
                }
                else if (currentProj[1] - currentProj[0] < minimumWidth)
                {
                    // The current projection is still too small -> keep growing it.
                    currentProj[1] = box.Right;
                }
                else if (i != wordsCount - 1)
                {
                    // Gap big enough -> cut!
                    // |____|   |   |____|
                    projectionProfile.Add(currentProj);
                    currentProj = new[] { box.Left, box.Right };
                }

                // Note: a trailing word separated by a big gap does not start a projection of
                // its own; it falls outside every projection and is re-attached below as a
                // single-word leaf.
            }

            // The last projection is always added.
            projectionProfile.Add(currentProj);

            // Materialise the word groups once; they are used both to build the children and
            // to find the words that were not captured by any projection.
            var groups = projectionProfile
                .Select(p => leaf.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1]).ToList())
                .ToList();

            var newNodes = groups
                .Where(g => g.Count > 0)
                .Select(g => HorizontalCut(new XYLeaf(g), minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level))
                .ToList();

            var lost = leaf.Words
                .Except(groups.SelectMany(g => g))
                .Where(x => !string.IsNullOrWhiteSpace(x.Text))
                .ToList();

            if (lost.Count > 0)
            {
                newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
            }

            return new XYNode(newNodes);
        }

        /// <summary>
        /// Cut the leaf along the vertical axis: words are projected on the Y axis and the leaf
        /// is split wherever the gap between two consecutive projections exceeds the dominant font height.
        /// Each resulting part is then passed to <see cref="VerticalCut"/>.
        /// </summary>
        /// <param name="leaf">The leaf to cut.</param>
        /// <param name="minimumWidth">The minimum width for a block (only forwarded to the vertical cut).</param>
        /// <param name="dominantFontWidthFunc">The function that determines the dominant font width
        /// (only forwarded to the vertical cut).</param>
        /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
        /// <param name="level">Recursion marker: when the leaf cannot be split and this is already
        /// at least 1, the leaf is returned as is; otherwise it is incremented before recursing.</param>
        private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth,
            Func<IEnumerable<double>, double> dominantFontWidthFunc,
            Func<IEnumerable<double>, double> dominantFontHeightFunc, int level = 0)
        {
            // Order words bottom to top, ignoring whitespace-only words.
            var words = leaf.Words
                .Where(w => !string.IsNullOrWhiteSpace(w.Text))
                .OrderBy(w => w.BoundingBox.Bottom)
                .ToArray();

            if (words.Length == 0)
            {
                return new XYNode(null);
            }

            // Create new leaf with non-whitespace words.
            leaf = new XYLeaf(words);

            if (leaf.CountWords() <= 1)
            {
                // We stop cutting if only one word remains.
                return leaf;
            }

            // Only the height drives a horizontal cut, so only the height is computed here.
            double domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters)
                .Select(x => Math.Abs(x.GlyphRectangle.Height)));

            var projectionProfile = new List<double[]>();
            double[] currentProj = { words[0].BoundingBox.Bottom, words[0].BoundingBox.Top };
            int wordsCount = words.Length;

            for (int i = 1; i < wordsCount; i++)
            {
                var box = words[i].BoundingBox;
                bool bottomInside = box.Bottom >= currentProj[0] && box.Bottom <= currentProj[1];
                bool topInside = box.Top >= currentProj[0] && box.Top <= currentProj[1];

                if (bottomInside || topInside)
                {
                    // It is overlapping: extend the projection if the word sticks out at the top.
                    if (bottomInside && box.Top > currentProj[1])
                    {
                        currentProj[1] = box.Top;
                    }
                }
                else if (box.Bottom - currentProj[1] <= domFontHeight)
                {
                    // No overlap, but the gap is too small -> don't cut.
                    // |____| |____|
                    currentProj[1] = box.Top;
                }
                else if (i != wordsCount - 1)
                {
                    // Gap big enough -> cut!
                    // |____|   |   |____|
                    projectionProfile.Add(currentProj);
                    currentProj = new[] { box.Bottom, box.Top };
                }

                // Note: a trailing word separated by a big gap does not start a projection of
                // its own; it falls outside every projection and is re-attached below as a
                // single-word leaf.
            }

            // The last projection is always added.
            projectionProfile.Add(currentProj);

            if (projectionProfile.Count == 1)
            {
                // The leaf could not be split in this direction.
                if (level >= 1)
                {
                    return leaf;
                }

                level++;
            }

            // Materialise the word groups once; they are used both to build the children and
            // to find the words that were not captured by any projection.
            var groups = projectionProfile
                .Select(p => leaf.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1]).ToList())
                .ToList();

            var newNodes = groups
                .Where(g => g.Count > 0)
                .Select(g => VerticalCut(new XYLeaf(g), minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level))
                .ToList();

            var lost = leaf.Words
                .Except(groups.SelectMany(g => g))
                .Where(x => !string.IsNullOrWhiteSpace(x.Text))
                .ToList();

            if (lost.Count > 0)
            {
                newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
            }

            return new XYNode(newNodes);
        }
    }
}