diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs index e393184e..c030c663 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs @@ -7,8 +7,9 @@ using System.Linq; using UglyToad.PdfPig.Geometry; + /// /// - /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document + /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document /// recursively into a set of rectangular blocks. This implementation leverages bounding boxes. /// https://en.wikipedia.org/wiki/Recursive_X-Y_cut /// See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips @@ -20,81 +21,73 @@ /// public static RecursiveXYCut Instance { get; } = new RecursiveXYCut(); + /// /// - /// Get the blocks. - /// Uses 'minimumWidth' = 0, 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height) + /// Get the blocks using default options values. /// - /// The words in the page. - /// - public IReadOnlyList GetBlocks(IEnumerable pageWords) + /// The page's words to segment into s. + /// The s generated by the Recursive X-Y cut method. + public IReadOnlyList GetBlocks(IEnumerable words) { - return GetBlocks(pageWords, 0); + return GetBlocks(words, new RecursiveXYCutOptions()); } + /// /// - /// Get the blocks. - /// Uses 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height) + /// Get the blocks using options values. /// - /// The words in the page. - /// The minimum width for a block. - public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth) + /// The page's words to segment into s. + /// The to use. + /// The s generated by the Recursive X-Y cut method. + public IReadOnlyList GetBlocks(IEnumerable words, DlaOptions options) { - return GetBlocks(pageWords, minimumWidth, - (letters) => + if (options is RecursiveXYCutOptions ryxcOptions) + { + if (words?.Any() != true) { - var widths = letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3))); - var mode = widths.Mode(); - if (double.IsNaN(mode) || mode == 0) - { - mode = widths.Average(); - } - return mode; - }, - (letters) => - { - var heights = letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3)); - var mode = heights.Mode(); - if (double.IsNaN(mode) || mode == 0) - { - mode = heights.Average(); - } - return mode * 1.5; + return EmptyArray.Instance; } - ); + + return GetBlocks(words, + ryxcOptions.MinimumWidth, + ryxcOptions.DominantFontWidthFunc, + ryxcOptions.DominantFontHeightFunc, + ryxcOptions.WordSeparator, + ryxcOptions.LineSeparator); + } + else + { + throw new ArgumentException("Options provided must be of type " + nameof(RecursiveXYCutOptions) + ".", nameof(options)); + } } /// /// Get the blocks. /// - /// The words in the page. - /// The minimum width for a block. - /// The dominant font width. - /// The dominant font height. - public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth, - double dominantFontWidth, double dominantFontHeight) - { - return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight); - } - - /// - /// Get the blocks. - /// - /// The words in the page. + /// The words in the page. /// The minimum width for a block. /// The function that determines the dominant font width. /// The function that determines the dominant font height. - public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth, + /// + /// + private IReadOnlyList GetBlocks(IEnumerable words, double minimumWidth, Func, double> dominantFontWidthFunc, - Func, double> dominantFontHeightFunc) + Func, double> dominantFontHeightFunc, + string wordSeparator, string lineSeparator) { - if (pageWords.Count() == 0) return EmptyArray.Instance; + // Filter out white spaces + words = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)); + if (!words.Any()) + { + return EmptyArray.Instance; + } - XYLeaf root = new XYLeaf(pageWords); // Create a root node. + XYLeaf root = new XYLeaf(words); // Create a root node. XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc); if (node.IsLeaf) { - return new List { new TextBlock((node as XYLeaf).GetLines()) }; + return new List { new TextBlock((node as XYLeaf).GetLines(wordSeparator), lineSeparator) }; } else { @@ -102,7 +95,7 @@ if (leaves.Count > 0) { - return leaves.Select(l => new TextBlock(l.GetLines())).ToList(); + return leaves.Select(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator)).ToList(); } } @@ -114,7 +107,7 @@ Func, double> dominantFontHeightFunc, int level = 0) { // Order words left to right - var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Left).ToArray(); + var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Left).ToArray(); if (words.Length == 0) { @@ -123,7 +116,7 @@ // Create new leaf with non-whitespace words. leaf = new XYLeaf(words); - + if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth) { // We stop cutting if @@ -195,7 +188,7 @@ } if (i == wordsCount - 1) projectionProfile.Add(currentProjection); } - + var newLeavesEnums = projectionProfile.Select(p => leaf.Words.Where(w => { // Get words that are contained in each projection profiles @@ -203,7 +196,7 @@ return normalisedBB.Left >= p.LowerBound && normalisedBB.Right <= p.UpperBound; })); - var newLeaves = newLeavesEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e)); + var newLeaves = newLeavesEnums.Where(e => e.Any()).Select(e => new XYLeaf(e)); var newNodes = newLeaves.Select(l => HorizontalCut(l, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList(); @@ -221,7 +214,7 @@ Func, double> dominantFontHeightFunc, int level = 0) { // Order words bottom to top - var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray(); + var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray(); if (words.Length == 0) { @@ -303,7 +296,7 @@ return normalisedBB.Bottom >= p.LowerBound && normalisedBB.Top <= p.UpperBound; })); - var newLeaves = newLeavesEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e)); + var newLeaves = newLeavesEnums.Where(e => e.Any()).Select(e => new XYLeaf(e)); var newNodes = newLeaves.Select(l => VerticalCut(l, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList(); @@ -335,5 +328,51 @@ return value >= LowerBound && value <= UpperBound; } } + + /// + /// Recursive X-Y cut page segmenter options. + /// + public class RecursiveXYCutOptions : PageSegmenterOptions + { + /// + /// The minimum width for a block. + /// Default value is 1. + /// + public double MinimumWidth { get; set; } = 1; + + /// + /// The function that determines the dominant font width. + /// Default value is the mode of the block's letters width. + /// If the mode is not available, the average is used. + /// + public Func, double> DominantFontWidthFunc { get; set; } = + (letters) => + { + var widths = letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3))); + var mode = widths.Mode(); + if (double.IsNaN(mode) || mode == 0) + { + mode = widths.Average(); + } + return mode; + }; + + /// + /// The function that determines the dominant font height. + /// Default value is the mode of the block's letters height times 1.5. + /// If the mode is not available, the average is used. + /// + public Func, double> DominantFontHeightFunc { get; set; } = + (letters) => + { + var heights = letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3)); + var mode = heights.Mode(); + if (double.IsNaN(mode) || mode == 0) + { + mode = heights.Average(); + } + return mode * 1.5; + }; + } } -} +} \ No newline at end of file