mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-24 05:03:39 +08:00
Document Layout Analysis - IPageSegmenter, Docstrum
- Create a TextBlock class
- Create IPageSegmenter
- Add other useful distances: angle, etc.
- Update RecursiveXYCut
- With IPageSegmenter and TextBlock
- Make XYNode and XYLeaf internal
- Optimise (faster) NearestNeighbourWordExtractor and isolate the clustering algorithms for use outside of this class
- Implement a Docstrum-inspired page segmentation algorithm
This commit is contained in:
68
src/UglyToad.PdfPig/Content/TextBlock.cs
Normal file
68
src/UglyToad.PdfPig/Content/TextBlock.cs
Normal file
@@ -0,0 +1,68 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.Geometry;
|
||||
|
||||
namespace UglyToad.PdfPig.Content
{
    /// <summary>
    /// A group of <see cref="TextLine"/>s treated as a single block of text.
    /// </summary>
    public class TextBlock
    {
        /// <summary>
        /// The text of the block: the text of every line, joined with a single space.
        /// </summary>
        public string Text { get; }

        /// <summary>
        /// The text direction of the block, taken from its first line.
        /// </summary>
        public TextDirection TextDirection { get; }

        /// <summary>
        /// The rectangle spanning the left-most, bottom-most, right-most and top-most
        /// edges of the bounding boxes of the block's lines.
        /// </summary>
        public PdfRectangle BoundingBox { get; }

        /// <summary>
        /// The text lines making up the block, exactly as supplied to the constructor.
        /// </summary>
        public IReadOnlyList<TextLine> TextLines { get; }

        /// <summary>
        /// Create a new <see cref="TextBlock"/> from the lines it contains.
        /// </summary>
        /// <param name="lines">The lines forming the block. Must contain at least one line.</param>
        /// <exception cref="ArgumentNullException"><paramref name="lines"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="lines"/> is empty.</exception>
        public TextBlock(IReadOnlyList<TextLine> lines)
        {
            if (lines == null)
            {
                throw new ArgumentNullException(nameof(lines));
            }

            if (lines.Count == 0)
            {
                throw new ArgumentException("Empty lines provided.", nameof(lines));
            }

            TextLines = lines;

            // Lines are separated by a single space in the block's text.
            var lineTexts = lines.Select(line => line.Text);
            Text = string.Join(" ", lineTexts);

            BoundingBox = Enclose(lines);

            // The block reports the direction of its first line only.
            var firstLine = lines[0];
            TextDirection = firstLine.TextDirection;
        }

        /// <summary>
        /// Build the rectangle enclosing the bounding boxes of all the given lines.
        /// </summary>
        private static PdfRectangle Enclose(IReadOnlyList<TextLine> lines)
        {
            var left = lines.Min(line => line.BoundingBox.Left);
            var bottom = lines.Min(line => line.BoundingBox.Bottom);
            var right = lines.Max(line => line.BoundingBox.Right);
            var top = lines.Max(line => line.BoundingBox.Top);

            return new PdfRectangle(left, bottom, right, top);
        }

        /// <inheritdoc />
        public override string ToString() => Text;
    }
}
|
Reference in New Issue
Block a user