mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-20 11:37:57 +08:00
Fix the bug that occurs when every word in the current leaf passed to VerticalCut/HorizontalCut consists only of whitespace.
This commit is contained in:
@@ -70,11 +70,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
XYLeaf root = new XYLeaf(pageWords); // Create a root node.
|
XYLeaf root = new XYLeaf(pageWords); // Create a root node.
|
||||||
XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
|
XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
|
||||||
|
|
||||||
var leafs = node.GetLeafs();
|
if (node.IsLeaf)
|
||||||
|
|
||||||
if (leafs.Count > 0)
|
|
||||||
{
|
{
|
||||||
return leafs.Select(l => new TextBlock(l.GetLines())).ToList();
|
return new List<TextBlock>{ new TextBlock((node as XYLeaf).GetLines())};
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
var leafs = node.GetLeafs();
|
||||||
|
|
||||||
|
if (leafs.Count > 0)
|
||||||
|
{
|
||||||
|
return leafs.Select(l => new TextBlock(l.GetLines())).ToList();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new List<TextBlock>();
|
return new List<TextBlock>();
|
||||||
@@ -84,6 +91,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
||||||
{
|
{
|
||||||
|
// order words left to right
|
||||||
|
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray();
|
||||||
|
|
||||||
|
if(!words.Any())
|
||||||
|
{
|
||||||
|
return new XYNode(null);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//Create new leaf with non-whitespace words.
|
||||||
|
leaf = new XYLeaf(words);
|
||||||
|
}
|
||||||
if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
|
if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
|
||||||
{
|
{
|
||||||
// we stop cutting if
|
// we stop cutting if
|
||||||
@@ -92,9 +111,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
return leaf;
|
return leaf;
|
||||||
}
|
}
|
||||||
|
|
||||||
// order words left to right
|
|
||||||
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray();
|
|
||||||
|
|
||||||
// determine dominantFontWidth and dominantFontHeight
|
// determine dominantFontWidth and dominantFontHeight
|
||||||
decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
|
decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
|
||||||
.Select(x => Math.Abs(x.GlyphRectangle.Width)));
|
.Select(x => Math.Abs(x.GlyphRectangle.Width)));
|
||||||
@@ -177,6 +193,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
||||||
{
|
{
|
||||||
|
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top
|
||||||
|
|
||||||
|
if (!words.Any())
|
||||||
|
{
|
||||||
|
return new XYNode(null);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//Create new leaf with non-whitespace words.
|
||||||
|
leaf = new XYLeaf(words);
|
||||||
|
}
|
||||||
|
|
||||||
if (leaf.CountWords() <= 1)
|
if (leaf.CountWords() <= 1)
|
||||||
{
|
{
|
||||||
// we stop cutting if
|
// we stop cutting if
|
||||||
@@ -184,7 +212,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
return leaf;
|
return leaf;
|
||||||
}
|
}
|
||||||
|
|
||||||
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top
|
|
||||||
|
|
||||||
// determine dominantFontWidth and dominantFontHeight
|
// determine dominantFontWidth and dominantFontHeight
|
||||||
decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
|
decimal domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
|
||||||
|
Reference in New Issue
Block a user