From 395c5a7fd902cb425bbce04657ccf9cbba429311 Mon Sep 17 00:00:00 2001 From: BobLd Date: Sun, 12 Apr 2020 19:49:30 +0100 Subject: [PATCH] Improve default RecursiveXYCut dominant font width and height functions --- .../MathExtensions.cs | 10 ++-- .../PageSegmenter/RecursiveXYCut.cs | 47 +++++++++++++------ 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs index 5da09106..de0dc77c 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs @@ -9,9 +9,10 @@ public static class MathExtensions { /// - /// Computes the mode of a sequence of float values. + /// Computes the mode of a sequence of values. /// - /// The array of floats. + /// The sequence of floats. + /// The mode of the sequence. Returns if the sequence has no mode or if it is not unique. public static float Mode(this IEnumerable array) { if (array == null || array.Count() == 0) return float.NaN; @@ -22,9 +23,10 @@ } /// - /// Computes the mode of a sequence of decimal values. + /// Computes the mode of a sequence of values. /// - /// The array of decimal. + /// The sequence of doubles. + /// The mode of the sequence. Returns if the sequence has no mode or if it is not unique. public static double Mode(this IEnumerable array) { if (array == null || array.Count() == 0) return double.NaN; diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs index 4ea35bcf..e393184e 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs @@ -39,7 +39,28 @@ /// The minimum width for a block. public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth) { - return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5, 3)); + return GetBlocks(pageWords, minimumWidth, + (letters) => + { + var widths = letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3))); + var mode = widths.Mode(); + if (double.IsNaN(mode) || mode == 0) + { + mode = widths.Average(); + } + return mode; + }, + (letters) => + { + var heights = letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3)); + var mode = heights.Mode(); + if (double.IsNaN(mode) || mode == 0) + { + mode = heights.Average(); + } + return mode * 1.5; + } + ); } /// @@ -63,8 +84,8 @@ /// The function that determines the dominant font width. /// The function that determines the dominant font height. public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth, - Func, double> dominantFontWidthFunc, - Func, double> dominantFontHeightFunc) + Func, double> dominantFontWidthFunc, + Func, double> dominantFontHeightFunc) { if (pageWords.Count() == 0) return EmptyArray.Instance; @@ -89,13 +110,13 @@ } private XYNode VerticalCut(XYLeaf leaf, double minimumWidth, - Func, double> dominantFontWidthFunc, - Func, double> dominantFontHeightFunc, int level = 0) + Func, double> dominantFontWidthFunc, + Func, double> dominantFontHeightFunc, int level = 0) { // Order words left to right var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Left).ToArray(); - if (!words.Any()) + if (words.Length == 0) { return new XYNode(null); } @@ -112,8 +133,7 @@ } // Determine dominant font width - double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) - .Select(x => x.GlyphRectangle.Normalise().Width)); + double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)); List projectionProfile = new List(); @@ -197,13 +217,13 @@ } private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth, - Func, double> dominantFontWidthFunc, - Func, double> dominantFontHeightFunc, int level = 0) + Func, double> dominantFontWidthFunc, + Func, double> dominantFontHeightFunc, int level = 0) { // Order words bottom to top var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray(); - if (!words.Any()) + if (words.Length == 0) { return new XYNode(null); } @@ -219,14 +239,13 @@ } // Determine dominant font height - double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters) - .Select(x => x.GlyphRectangle.Normalise().Height)); + double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters)); List projectionProfile = new List(); var firstWordBound = words[0].BoundingBox.Normalise(); Projection currentProjection = new Projection(firstWordBound.Bottom, firstWordBound.Top); - int wordsCount = words.Count(); + int wordsCount = words.Length; for (int i = 1; i < wordsCount; i++) {