From 30247ba774555f20bb70709b34a12c18b59f23cc Mon Sep 17 00:00:00 2001 From: Zhiguan Hu Date: Tue, 10 Dec 2019 15:03:05 -0600 Subject: [PATCH 1/2] Fix the bug that happens when all the words in the current leaf for VerticalCut/HorizontalCut are whitespace. --- .../DocumentLayoutAnalysis/RecursiveXYCut.cs | 43 +++++++++++++++---- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs index 8b3cbaab..8a8d05c4 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs @@ -70,11 +70,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis XYLeaf root = new XYLeaf(pageWords); // Create a root node. XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc); - var leafs = node.GetLeafs(); - - if (leafs.Count > 0) + if (node.IsLeaf) { - return leafs.Select(l => new TextBlock(l.GetLines())).ToList(); + return new List{ new TextBlock((node as XYLeaf).GetLines())}; + } + else + { + var leafs = node.GetLeafs(); + + if (leafs.Count > 0) + { + return leafs.Select(l => new TextBlock(l.GetLines())).ToList(); + } } return new List(); @@ -84,6 +91,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc, int level = 0) { + // order words left to right + var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray(); + + if(!words.Any()) + { + return new XYNode(null); + } + else + { + //Create new leaf with non-whitespace words. 
+ leaf = new XYLeaf(words); + } if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth) { // we stop cutting if @@ -92,9 +111,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return leaf; } - // order words left to right - var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray(); - // determine dominantFontWidth and dominantFontHeight decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) .Select(x => Math.Abs(x.GlyphRectangle.Width))); @@ -177,6 +193,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis Func, decimal> dominantFontWidthFunc, Func, decimal> dominantFontHeightFunc, int level = 0) { + var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top + + if (!words.Any()) + { + return new XYNode(null); + } + else + { + //Create new leaf with non-whitespace words. + leaf = new XYLeaf(words); + } + if (leaf.CountWords() <= 1) { // we stop cutting if @@ -184,7 +212,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return leaf; } - var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top // determine dominantFontWidth and dominantFontHeight decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) From 9baa8c3ca0f433c40a43a8735e066ddfef53980f Mon Sep 17 00:00:00 2001 From: Zhiguan Hu Date: Wed, 11 Dec 2019 10:05:17 -0600 Subject: [PATCH 2/2] Fix format as suggested. 
--- src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs index 8a8d05c4..1476452a 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/RecursiveXYCut.cs @@ -103,6 +103,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis //Create new leaf with non-whitespace words. leaf = new XYLeaf(words); } + if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth) { // we stop cutting if @@ -212,7 +213,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis return leaf; } - // determine dominantFontWidth and dominantFontHeight decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) .Select(x => Math.Abs(x.GlyphRectangle.Width)));