Update with corrections - 2

This commit is contained in:
BobLd
2019-06-20 22:10:05 +01:00
parent 080354dc54
commit 00233fa5d0
4 changed files with 235 additions and 190 deletions

View File

@@ -75,12 +75,27 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
{
if (pageLetters == null || pageLetters.Count() == 0) return new List<Word>();
TextDirection textDirection = pageLetters.ElementAt(0).TextDirection;
if (pageLetters.Any(x => pageLetters.ElementAt(0).TextDirection != x.TextDirection))
if (pageLetters.Any(x => textDirection != x.TextDirection))
{
throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction.");
}
Func<IEnumerable<Letter>, IReadOnlyList<Letter>> orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();
if (textDirection == TextDirection.Rotate180)
{
orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Right).ToList();
}
else if (textDirection == TextDirection.Rotate90)
{
orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Top).ToList();
}
else if (textDirection == TextDirection.Rotate270)
{
orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Bottom).ToList();
}
Letter[] letters = pageLetters.ToArray();
int lettersCount = letters.Length;
List<PdfPoint> startBaseLines = letters.Select(x => x.StartBaseLine).ToList();
@@ -183,7 +198,8 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
groupedLetters.Add(letters[s]);
}
words.Add(new Word(groupedLetters));
words.Add(new Word(orderFunc(groupedLetters)));
}
List<int> indexesNotDone = Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)).ToList();

View File

@@ -2,7 +2,6 @@
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Geometry;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
@@ -14,6 +13,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// </summary>
public class RecursiveXYCut
{
/// <summary>
/// Get the blocks, using the default dominant font size estimators.
/// <para>The dominant font width estimator is the mode of the values it receives, rounded to 3 decimals;
/// the dominant font height estimator is 1.5 times the mode, rounded to 3 decimals.</para>
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="minimumWidth">The minimum width for a block.</param>
public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth = 0)
{
    // Default estimators, named so the forwarding call below reads like the target signature.
    Func<IEnumerable<decimal>, decimal> defaultFontWidthFunc = values => Math.Round(values.Mode(), 3);
    Func<IEnumerable<decimal>, decimal> defaultFontHeightFunc = values => Math.Round(values.Mode() * 1.5m, 3);

    return GetBlocks(pageWords, minimumWidth, defaultFontWidthFunc, defaultFontHeightFunc);
}
/// <summary>
/// Get the blocks.
/// </summary>
@@ -21,7 +30,6 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// <param name="minimumWidth">The minimum width for a block.</param>
/// <param name="dominantFontWidth">The dominant font width.</param>
/// <param name="dominantFontHeight">The dominant font height.</param>
/// <returns></returns>
public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
decimal dominantFontWidth, decimal dominantFontHeight)
{
@@ -35,12 +43,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// <param name="minimumWidth">The minimum width for a block.</param>
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
/// <returns></returns>
public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
{
var root = new XYLeaf(pageWords);
var root = new XYLeaf(pageWords); // Create a root node.
return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
}
@@ -221,188 +228,4 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
return new XYNode(newNodes);
}
}
/// <summary>
/// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
/// </summary>
public class XYNode
{
    /// <summary>
    /// Returns true if this node is a leaf, false otherwise.
    /// </summary>
    public virtual bool IsLeaf => false;

    /// <summary>
    /// The rectangle completely containing the node.
    /// </summary>
    public PdfRectangle BoundingBox { get; set; }

    /// <summary>
    /// The children of the node. Left null when the node is created without children.
    /// </summary>
    public XYNode[] Children { get; set; }

    /// <summary>
    /// Recursively counts the words included in this node.
    /// </summary>
    /// <returns>The total number of words held by the leafs below this node, 0 if there are no children.</returns>
    public virtual int CountWords()
    {
        return RecursiveCount(Children);
    }

    /// <summary>
    /// Recursively gets the leafs (last nodes) of this node.
    /// </summary>
    /// <returns>The leafs found below this node; an empty list if there are no children.</returns>
    public virtual List<XYLeaf> GetLeafs()
    {
        List<XYLeaf> leafs = new List<XYLeaf>();
        RecursiveGetLeafs(Children, leafs, 0);
        return leafs;
    }

    /// <summary>
    /// Create a new <see cref="XYNode"/>.
    /// </summary>
    /// <param name="children">The node's children. May be null or empty.</param>
    public XYNode(params XYNode[] children)
        : this(children?.ToList())
    {
    }

    /// <summary>
    /// Create a new <see cref="XYNode"/>.
    /// </summary>
    /// <param name="children">The node's children. May be null or empty, in which case
    /// <see cref="Children"/> and <see cref="BoundingBox"/> are not assigned.</param>
    public XYNode(IEnumerable<XYNode> children)
    {
        // Materialise once: the sequence may be a lazy query and must not be re-run for every aggregate.
        XYNode[] nodes = children?.ToArray();
        if (nodes == null || nodes.Length == 0)
        {
            return;
        }

        Children = nodes;

        // The bounding box is the smallest rectangle enclosing every child's bounding box.
        decimal left = nodes.Min(b => b.BoundingBox.Left);
        decimal right = nodes.Max(b => b.BoundingBox.Right);
        decimal bottom = nodes.Min(b => b.BoundingBox.Bottom);
        decimal top = nodes.Max(b => b.BoundingBox.Top);
        BoundingBox = new PdfRectangle(left, bottom, right, top);
    }

    /// <summary>
    /// Sums the word counts of the leafs found in <paramref name="children"/> and below.
    /// A null or empty array contributes 0, so childless non-leaf nodes are safe to traverse.
    /// </summary>
    private static int RecursiveCount(XYNode[] children)
    {
        if (children == null || children.Length == 0) return 0;

        int count = 0;
        foreach (XYNode node in children)
        {
            count += node.IsLeaf ? node.CountWords() : RecursiveCount(node.Children);
        }

        return count;
    }

    /// <summary>
    /// Appends to <paramref name="leafs"/> the leafs found in <paramref name="children"/> and below.
    /// Leafs of the current level are added first, then the non-leaf children are visited:
    /// ordered by left edge on even levels, by descending top edge on odd levels.
    /// </summary>
    private static void RecursiveGetLeafs(XYNode[] children, List<XYLeaf> leafs, int level)
    {
        if (children == null || children.Length == 0) return;

        bool isVerticalCut = level % 2 == 0;

        foreach (XYLeaf leaf in children.Where(x => x.IsLeaf))
        {
            leafs.Add(leaf);
        }

        IEnumerable<XYNode> notLeafs = children.Where(x => !x.IsLeaf);
        List<XYNode> ordered = isVerticalCut
            ? notLeafs.OrderBy(x => x.BoundingBox.Left).ToList()
            : notLeafs.OrderByDescending(x => x.BoundingBox.Top).ToList();

        foreach (XYNode node in ordered)
        {
            RecursiveGetLeafs(node.Children, leafs, level + 1);
        }
    }

    /// <summary>
    /// Returns "Leaf" if this node is a leaf, "Node" otherwise.
    /// </summary>
    public override string ToString()
    {
        return (IsLeaf ? "Leaf" : "Node");
    }
}
/// <summary>
/// A Leaf node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
/// </summary>
public class XYLeaf : XYNode
{
    /// <summary>
    /// Returns true if this node is a leaf, false otherwise.
    /// </summary>
    public override bool IsLeaf => true;

    /// <summary>
    /// The words in the leaf.
    /// </summary>
    public Word[] Words { get; set; }

    /// <summary>
    /// The number of words in the leaf.
    /// </summary>
    public override int CountWords() => Words == null ? 0 : Words.Length;

    /// <summary>
    /// Returns null as a leaf doesn't have leafs.
    /// </summary>
    public override List<XYLeaf> GetLeafs()
    {
        return null;
    }

    /// <summary>
    /// Gets the lines of the leaf.
    /// Words whose bounding boxes share exactly the same bottom are put in the same line;
    /// lines are returned by descending bottom.
    /// </summary>
    public TextLine[] GetLines()
    {
        return Words.GroupBy(x => x.BoundingBox.Bottom)
                    .OrderByDescending(x => x.Key)
                    .Select(x => new TextLine(x.ToList()))
                    .ToArray();
    }

    /// <summary>
    /// Create a new <see cref="XYLeaf"/>.
    /// </summary>
    /// <param name="words">The words contained in the leaf.</param>
    public XYLeaf(params Word[] words) : this(words == null ? null : words.ToList())
    {
    }

    /// <summary>
    /// Create a new <see cref="XYLeaf"/>.
    /// </summary>
    /// <param name="words">The words contained in the leaf. Cannot be null.</param>
    /// <exception cref="ArgumentNullException">If <paramref name="words"/> is null.</exception>
    public XYLeaf(IEnumerable<Word> words) : base(null)
    {
        if (words == null)
        {
            // Previously surfaced from inside LINQ with parameter name "source"; same exception type, clearer origin.
            throw new ArgumentNullException(nameof(words));
        }

        // Materialise once: the sequence may be a lazy query and must not be re-run for every aggregate.
        Word[] wordsArray = words.ToArray();

        // The bounding box is the smallest rectangle enclosing every word's bounding box.
        // An empty sequence makes Min/Max throw, as before.
        decimal left = wordsArray.Min(b => b.BoundingBox.Left);
        decimal right = wordsArray.Max(b => b.BoundingBox.Right);
        decimal bottom = wordsArray.Min(b => b.BoundingBox.Bottom);
        decimal top = wordsArray.Max(b => b.BoundingBox.Top);

        BoundingBox = new PdfRectangle(left, bottom, right, top);
        Words = wordsArray;
    }
}
}

View File

@@ -0,0 +1,76 @@
using System;
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Geometry;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// A Leaf node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
/// </summary>
public class XYLeaf : XYNode
{
    /// <summary>
    /// Returns true if this node is a leaf, false otherwise.
    /// </summary>
    public override bool IsLeaf => true;

    /// <summary>
    /// The words in the leaf.
    /// </summary>
    public IReadOnlyList<Word> Words { get; }

    /// <summary>
    /// The number of words in the leaf.
    /// </summary>
    public override int CountWords() => Words == null ? 0 : Words.Count;

    /// <summary>
    /// Returns null as a leaf doesn't have leafs.
    /// </summary>
    public override List<XYLeaf> GetLeafs()
    {
        return null;
    }

    /// <summary>
    /// Gets the lines of the leaf.
    /// Words whose bounding boxes share exactly the same bottom are put in the same line;
    /// lines are returned by descending bottom.
    /// </summary>
    public IReadOnlyList<TextLine> GetLines()
    {
        return Words.GroupBy(x => x.BoundingBox.Bottom)
                    .OrderByDescending(x => x.Key)
                    .Select(x => new TextLine(x.ToList()))
                    .ToArray();
    }

    /// <summary>
    /// Create a new <see cref="XYLeaf"/>.
    /// </summary>
    /// <param name="words">The words contained in the leaf.</param>
    public XYLeaf(params Word[] words) : this(words == null ? null : words.ToList())
    {
    }

    /// <summary>
    /// Create a new <see cref="XYLeaf"/>.
    /// </summary>
    /// <param name="words">The words contained in the leaf. Cannot be null.</param>
    /// <exception cref="ArgumentNullException">If <paramref name="words"/> is null.</exception>
    public XYLeaf(IEnumerable<Word> words) : base(null)
    {
        if (words == null)
        {
            // ArgumentNullException derives from ArgumentException, so existing catch blocks keep working.
            throw new ArgumentNullException(nameof(words), "XYLeaf(): The words contained in the leaf cannot be null.");
        }

        // Materialise once: the sequence may be a lazy query and must not be re-run for every aggregate.
        Word[] wordsArray = words.ToArray();

        // The bounding box is the smallest rectangle enclosing every word's bounding box.
        // An empty sequence makes Min/Max throw, as before.
        decimal left = wordsArray.Min(b => b.BoundingBox.Left);
        decimal right = wordsArray.Max(b => b.BoundingBox.Right);
        decimal bottom = wordsArray.Min(b => b.BoundingBox.Bottom);
        decimal top = wordsArray.Max(b => b.BoundingBox.Top);

        BoundingBox = new PdfRectangle(left, bottom, right, top);
        Words = wordsArray;
    }
}
}

View File

@@ -0,0 +1,130 @@
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Geometry;
using UglyToad.PdfPig.Util;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
/// </summary>
public class XYNode
{
    /// <summary>
    /// Returns true if this node is a leaf, false otherwise.
    /// </summary>
    public virtual bool IsLeaf => false;

    /// <summary>
    /// The rectangle completely containing the node.
    /// </summary>
    public PdfRectangle BoundingBox { get; set; }

    /// <summary>
    /// The children of the node. Set to an empty array when the node is created without children.
    /// </summary>
    public XYNode[] Children { get; set; }

    /// <summary>
    /// Recursively counts the words included in this node.
    /// </summary>
    /// <returns>The total number of words held by the leafs below this node, 0 if there are no children.</returns>
    public virtual int CountWords()
    {
        return RecursiveCount(Children);
    }

    /// <summary>
    /// Recursively gets the leafs (last nodes) of this node.
    /// </summary>
    /// <returns>The leafs found below this node; an empty list if there are no children.</returns>
    public virtual List<XYLeaf> GetLeafs()
    {
        List<XYLeaf> leafs = new List<XYLeaf>();
        RecursiveGetLeafs(Children, leafs, 0);
        return leafs;
    }

    /// <summary>
    /// Create a new <see cref="XYNode"/>.
    /// </summary>
    /// <param name="children">The node's children.</param>
    public XYNode(params XYNode[] children)
        : this(children?.ToList())
    {
    }

    /// <summary>
    /// Create a new <see cref="XYNode"/>.
    /// </summary>
    /// <param name="children">The node's children. May be null or empty, in which case
    /// <see cref="Children"/> is an empty array and <see cref="BoundingBox"/> is not assigned.</param>
    public XYNode(IEnumerable<XYNode> children)
    {
        // Materialise once: the sequence may be a lazy query and must not be re-run for every aggregate.
        XYNode[] nodes = children?.ToArray();
        if (nodes == null || nodes.Length == 0)
        {
            Children = EmptyArray<XYNode>.Instance;
            return;
        }

        Children = nodes;

        // The bounding box is the smallest rectangle enclosing every child's bounding box.
        decimal left = nodes.Min(b => b.BoundingBox.Left);
        decimal right = nodes.Max(b => b.BoundingBox.Right);
        decimal bottom = nodes.Min(b => b.BoundingBox.Bottom);
        decimal top = nodes.Max(b => b.BoundingBox.Top);
        BoundingBox = new PdfRectangle(left, bottom, right, top);
    }

    /// <summary>
    /// Sums the word counts of the leafs found in <paramref name="children"/> and below.
    /// A null array (possible through the public <see cref="Children"/> setter) contributes 0.
    /// </summary>
    private static int RecursiveCount(XYNode[] children)
    {
        if (children == null || children.Length == 0) return 0;

        int count = 0;
        foreach (XYNode node in children)
        {
            count += node.IsLeaf ? node.CountWords() : RecursiveCount(node.Children);
        }

        return count;
    }

    /// <summary>
    /// Appends to <paramref name="leafs"/> the leafs found in <paramref name="children"/> and below.
    /// Leafs of the current level are added first, then the non-leaf children are visited:
    /// ordered by left edge on even levels, by descending top edge on odd levels.
    /// </summary>
    private static void RecursiveGetLeafs(XYNode[] children, List<XYLeaf> leafs, int level)
    {
        if (children == null || children.Length == 0) return;

        bool isVerticalCut = level % 2 == 0;

        foreach (XYLeaf leaf in children.Where(x => x.IsLeaf))
        {
            leafs.Add(leaf);
        }

        IEnumerable<XYNode> notLeafs = children.Where(x => !x.IsLeaf);
        List<XYNode> ordered = isVerticalCut
            ? notLeafs.OrderBy(x => x.BoundingBox.Left).ToList()
            : notLeafs.OrderByDescending(x => x.BoundingBox.Top).ToList();

        foreach (XYNode node in ordered)
        {
            RecursiveGetLeafs(node.Children, leafs, level + 1);
        }
    }

    /// <summary>
    /// Returns "Leaf" if this node is a leaf, "Node" otherwise.
    /// </summary>
    public override string ToString()
    {
        return (IsLeaf ? "Leaf" : "Node");
    }
}
}