namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
{
    using Content;
    using Core;
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using UglyToad.PdfPig.Geometry;

    /// <inheritdoc />
    /// <summary>
    /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
    /// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
    /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
    /// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
    /// </summary>
    public class RecursiveXYCut : IPageSegmenter
    {
        /// <summary>
        /// Create an instance of Recursive X-Y Cut page segmenter, <see cref="RecursiveXYCut"/>.
        /// </summary>
        public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();

        /// <inheritdoc />
        /// <summary>
        /// Get the blocks using default options values.
        /// </summary>
        /// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
        /// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
        {
            return GetBlocks(words, new RecursiveXYCutOptions());
        }

        /// <inheritdoc />
        /// <summary>
        /// Get the blocks using options values.
        /// </summary>
        /// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
        /// <param name="options">The <see cref="RecursiveXYCutOptions"/> to use.</param>
        /// <returns>
        /// The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.
        /// An empty list is returned when <paramref name="words"/> is null or empty.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// Thrown when <paramref name="options"/> is not a <see cref="RecursiveXYCutOptions"/> (this includes null).
        /// </exception>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
        {
            // The options type is validated first, before looking at the words.
            if (!(options is RecursiveXYCutOptions ryxcOptions))
            {
                throw new ArgumentException("Options provided must be of type " + nameof(RecursiveXYCutOptions) + ".", nameof(options));
            }

            if (words?.Any() != true)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            return GetBlocks(words,
                ryxcOptions.MinimumWidth,
                ryxcOptions.DominantFontWidthFunc,
                ryxcOptions.DominantFontHeightFunc,
                ryxcOptions.WordSeparator,
                ryxcOptions.LineSeparator);
        }

        /// <summary>
        /// Get the blocks.
        /// </summary>
        /// <param name="words">The words in the page.</param>
        /// <param name="minimumWidth">The minimum width for a block.</param>
        /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
        /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
        /// <param name="wordSeparator">The separator handed to each leaf's <c>GetLines</c> call.</param>
        /// <param name="lineSeparator">The separator handed to the <see cref="TextBlock"/> constructor.</param>
        /// <returns>One <see cref="TextBlock"/> per leaf of the cut tree; empty if no non-whitespace word remains.</returns>
        private IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, double minimumWidth,
            Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
            Func<IEnumerable<Letter>, double> dominantFontHeightFunc,
            string wordSeparator, string lineSeparator)
        {
            // Filter out white spaces. Materialised once so the emptiness check and the
            // root leaf construction do not each re-run the filter over the source.
            words = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();

            if (!words.Any())
            {
                return EmptyArray<TextBlock>.Instance;
            }

            XYLeaf root = new XYLeaf(words); // Create a root node.
            XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);

            if (node.IsLeaf)
            {
                // Direct cast: a node reporting IsLeaf that is not an XYLeaf fails with a
                // clear InvalidCastException rather than a NullReferenceException.
                var singleLeaf = (XYLeaf)node;
                return new List<TextBlock> { new TextBlock(singleLeaf.GetLines(wordSeparator), lineSeparator) };
            }

            var leaves = node.GetLeaves();
            if (leaves.Count > 0)
            {
                return leaves.Select(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator)).ToList();
            }

            return new List<TextBlock>();
        }

        /// <summary>
        /// Cuts a leaf along the X axis: words are swept left to right, merged into horizontal
        /// projections, and each resulting group of words is passed to <see cref="HorizontalCut"/>.
        /// </summary>
        /// <param name="leaf">The leaf to cut.</param>
        /// <param name="minimumWidth">The minimum width for a block; also the minimum width of a projection before a cut is allowed.</param>
        /// <param name="dominantFontWidthFunc">The function that determines the dominant font width (the maximum gap that does not trigger a cut).</param>
        /// <param name="dominantFontHeightFunc">The function that determines the dominant font height; only forwarded to <see cref="HorizontalCut"/>.</param>
        /// <param name="level">Counter forwarded unchanged to <see cref="HorizontalCut"/>.</param>
        /// <returns>The leaf itself when cutting stops, otherwise a node holding the child nodes.</returns>
        private XYNode VerticalCut(XYLeaf leaf, double minimumWidth,
            Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
            Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
        {
            // Order words left to right
            var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();

            if (words.Length == 0)
            {
                return new XYNode(null);
            }

            // Rebuild the leaf from the sorted words.
            leaf = new XYLeaf(words);

            if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
            {
                // We stop cutting if
                // - only one word remains
                // - width is too small
                return leaf;
            }

            // Determine dominant font width
            double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters));

            List<Projection> projectionProfile = new List<Projection>();

            var firstWordBound = words[0].BoundingBox.Normalise();
            Projection currentProjection = new Projection(firstWordBound.Left, firstWordBound.Right);
            int wordsCount = words.Length;

            for (int i = 1; i < wordsCount; i++)
            {
                var currentWordBound = words[i].BoundingBox.Normalise();

                if (currentProjection.Contains(currentWordBound.Left) || currentProjection.Contains(currentWordBound.Right))
                {
                    // It is overlapping
                    if (currentProjection.Contains(currentWordBound.Left)
                        && currentWordBound.Right > currentProjection.UpperBound)
                    {
                        // |____|
                        //    |____|
                        // |_______|    <- updated
                        currentProjection.UpperBound = currentWordBound.Right;
                    }

                    // We ignore the following cases:
                    //    |____|
                    // |____|          (not possible because of OrderBy)
                    //
                    //    |____|
                    //|___________|    (not possible because of OrderBy)
                    //
                    //  |____|
                    //   |_|
                }
                else
                {
                    // No overlap
                    if (currentWordBound.Left - currentProjection.UpperBound <= dominantFontWidth)
                    {
                        // If gap too small -> don't cut
                        // |____| |____|
                        currentProjection.UpperBound = currentWordBound.Right;
                    }
                    else if (currentProjection.UpperBound - currentProjection.LowerBound < minimumWidth)
                    {
                        // Still too small
                        currentProjection.UpperBound = currentWordBound.Right;
                    }
                    else if (i != wordsCount - 1)
                    {
                        // If gap big enough -> cut!
                        // |____|   |   |____|
                        // The last word never opens a projection of its own: if it sits past
                        // a large gap it is picked up later as a 'lost' word.
                        projectionProfile.Add(currentProjection);
                        currentProjection = new Projection(currentWordBound.Left, currentWordBound.Right);
                    }
                }
            }

            // The projection still open after the last word is always added
            // (the loop only runs when there are at least two words).
            if (wordsCount > 1)
            {
                projectionProfile.Add(currentProjection);
            }

            return BuildChildNodes(leaf, projectionProfile,
                w =>
                {
                    var normalisedBB = w.BoundingBox.Normalise();
                    return new Projection(normalisedBB.Left, normalisedBB.Right);
                },
                l => HorizontalCut(l, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level));
        }

        /// <summary>
        /// Cuts a leaf along the Y axis: words are swept bottom to top, merged into vertical
        /// projections, and each resulting group of words is passed to <see cref="VerticalCut"/>.
        /// </summary>
        /// <param name="leaf">The leaf to cut.</param>
        /// <param name="minimumWidth">The minimum width for a block; only forwarded to <see cref="VerticalCut"/>.</param>
        /// <param name="dominantFontWidthFunc">The function that determines the dominant font width; only forwarded to <see cref="VerticalCut"/>.</param>
        /// <param name="dominantFontHeightFunc">The function that determines the dominant font height (the maximum gap that does not trigger a cut).</param>
        /// <param name="level">
        /// Incremented the first time a horizontal pass yields a single projection; when it is already
        /// 1 or more and a pass again yields a single projection, the leaf is returned uncut.
        /// </param>
        /// <returns>The leaf itself when cutting stops, otherwise a node holding the child nodes.</returns>
        private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth,
            Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
            Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
        {
            // Order words bottom to top
            var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();

            if (words.Length == 0)
            {
                return new XYNode(null);
            }

            // Rebuild the leaf from the sorted words.
            leaf = new XYLeaf(words);

            if (leaf.CountWords() <= 1)
            {
                // We stop cutting if
                // - only one word remains
                return leaf;
            }

            // Determine dominant font height
            double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters));

            List<Projection> projectionProfile = new List<Projection>();

            var firstWordBound = words[0].BoundingBox.Normalise();
            Projection currentProjection = new Projection(firstWordBound.Bottom, firstWordBound.Top);
            int wordsCount = words.Length;

            for (int i = 1; i < wordsCount; i++)
            {
                var currentWordBound = words[i].BoundingBox.Normalise();

                if (currentProjection.Contains(currentWordBound.Bottom) || currentProjection.Contains(currentWordBound.Top))
                {
                    // It is overlapping
                    if (currentProjection.Contains(currentWordBound.Bottom)
                        && currentWordBound.Top > currentProjection.UpperBound)
                    {
                        currentProjection.UpperBound = currentWordBound.Top;
                    }
                }
                else
                {
                    // No overlap
                    if (currentWordBound.Bottom - currentProjection.UpperBound <= dominantFontHeight)
                    {
                        // If gap too small -> don't cut
                        // |____| |____|
                        currentProjection.UpperBound = currentWordBound.Top;
                    }
                    else if (i != wordsCount - 1)
                    {
                        // If gap big enough -> cut!
                        // |____|   |   |____|
                        // The last word never opens a projection of its own: if it sits past
                        // a large gap it is picked up later as a 'lost' word.
                        projectionProfile.Add(currentProjection);
                        currentProjection = new Projection(currentWordBound.Bottom, currentWordBound.Top);
                    }
                }
            }

            // The projection still open after the last word is always added
            // (the loop only runs when there are at least two words).
            if (wordsCount > 1)
            {
                projectionProfile.Add(currentProjection);
            }

            if (projectionProfile.Count == 1)
            {
                if (level >= 1)
                {
                    return leaf;
                }

                level++;
            }

            // 'level' is read when the child lambda runs, i.e. after the increment above.
            return BuildChildNodes(leaf, projectionProfile,
                w =>
                {
                    var normalisedBB = w.BoundingBox.Normalise();
                    return new Projection(normalisedBB.Bottom, normalisedBB.Top);
                },
                l => VerticalCut(l, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level));
        }

        /// <summary>
        /// Distributes the words of <paramref name="leaf"/> over the projections, cuts every non-empty
        /// group with <paramref name="cutChild"/>, and appends each word that fits in no projection
        /// as a leaf of its own.
        /// </summary>
        /// <param name="leaf">The leaf whose words are distributed.</param>
        /// <param name="projectionProfile">The projections found along the cutting axis.</param>
        /// <param name="getExtent">Returns the extent of a word along the cutting axis.</param>
        /// <param name="cutChild">The cut to apply to each group of words.</param>
        /// <returns>A node holding the resulting child nodes.</returns>
        private static XYNode BuildChildNodes(XYLeaf leaf, List<Projection> projectionProfile,
            Func<Word, Projection> getExtent, Func<XYLeaf, XYNode> cutChild)
        {
            // Get words that are contained in each projection profiles.
            // Each group is materialised once; it is read again below for the emptiness
            // check, the leaf construction and the 'lost' words computation.
            var wordGroups = new List<IEnumerable<Word>>(projectionProfile.Count);
            foreach (var projection in projectionProfile)
            {
                wordGroups.Add(leaf.Words.Where(w =>
                {
                    var extent = getExtent(w);
                    return extent.LowerBound >= projection.LowerBound && extent.UpperBound <= projection.UpperBound;
                }).ToList());
            }

            var newNodes = wordGroups
                .Where(g => g.Any())
                .Select(g => cutChild(new XYLeaf(g)))
                .ToList();

            // Words that fall in no projection are kept, each as its own leaf.
            var lost = leaf.Words
                .Except(wordGroups.SelectMany(g => g))
                .Where(w => !string.IsNullOrWhiteSpace(w.Text))
                .ToList();

            if (lost.Count > 0)
            {
                newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
            }

            return new XYNode(newNodes);
        }

        /// <summary>
        /// A closed interval [LowerBound, UpperBound] along one axis.
        /// </summary>
        private struct Projection
        {
            public double UpperBound { get; set; }

            public double LowerBound { get; set; }

            public Projection(double lowerBound, double upperBound)
            {
                UpperBound = upperBound;
                LowerBound = lowerBound;
            }

            /// <summary>
            /// Returns true if the value is greater or equal to the lower bound and smaller or equal to the upper bound.
            /// </summary>
            /// <param name="value">The value to test.</param>
            public bool Contains(double value)
            {
                return value >= LowerBound && value <= UpperBound;
            }
        }

        /// <summary>
        /// Recursive X-Y cut page segmenter options.
        /// </summary>
        public class RecursiveXYCutOptions : PageSegmenterOptions
        {
            /// <summary>
            /// The minimum width for a block.
            /// <para>Default value is 1.</para>
            /// </summary>
            public double MinimumWidth { get; set; } = 1;

            /// <summary>
            /// The function that determines the dominant font width.
            /// <para>Default value is the mode of the block's letters width, where a letter's width is the
            /// larger of its width and its glyph rectangle width, both rounded to 3 decimals.
            /// If the mode is not available (NaN or 0), the average is used.</para>
            /// </summary>
            public Func<IEnumerable<Letter>, double> DominantFontWidthFunc { get; set; } = (letters) =>
            {
                // Materialised once: the values may be read twice (mode, then average).
                IEnumerable<double> widths = letters
                    .Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3)))
                    .ToList();

                var mode = widths.Mode();
                if (double.IsNaN(mode) || mode == 0)
                {
                    mode = widths.Average();
                }

                return mode;
            };

            /// <summary>
            /// The function that determines the dominant font height.
            /// <para>Default value is the mode of the block's letters glyph rectangle height
            /// (rounded to 3 decimals) times 1.5.
            /// If the mode is not available (NaN or 0), the average is used.</para>
            /// </summary>
            public Func<IEnumerable<Letter>, double> DominantFontHeightFunc { get; set; } = (letters) =>
            {
                // Materialised once: the values may be read twice (mode, then average).
                IEnumerable<double> heights = letters
                    .Select(x => Math.Round(x.GlyphRectangle.Height, 3))
                    .ToList();

                var mode = heights.Mode();
                if (double.IsNaN(mode) || mode == 0)
                {
                    mode = heights.Average();
                }

                return mode * 1.5;
            };
        }
    }
}