mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-22 20:13:58 +08:00
Document Layout Analysis - IPageSegmenter, Docstrum
- Create a TextBlock class - Create IPageSegmenter - Add other useful distances: angle, etc. - Update RecursiveXYCut - With IPageSegmenter and TextBlock - Make XYNode and XYLeaf internal - Optimise NearestNeighbourWordExtractor (faster) and isolate the clustering algorithms for use outside of this class - Implement a Docstrum-inspired page segmentation algorithm
This commit is contained in:
19
src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs
Normal file
19
src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs
Normal file
@@ -0,0 +1,19 @@
|
||||
using System.Collections.Generic;
|
||||
using UglyToad.PdfPig.Content;
|
||||
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
    /// <summary>
    /// Contract for page segmentation: splitting a page into areas, where each area
    /// consists of a layout structure (blocks, lines, etc.).
    /// <para>For background, see 'Performance Comparison of Six Algorithms for Page Segmentation'
    /// by Faisal Shafait, Daniel Keysers, and Thomas M. Breuel.</para>
    /// </summary>
    public interface IPageSegmenter
    {
        /// <summary>
        /// Gets the text blocks generated from the supplied words.
        /// </summary>
        /// <param name="pageWords">The words for which text blocks are generated.</param>
        /// <returns>The list of text blocks produced by this segmentation approach.</returns>
        IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords);
    }
}
|
Reference in New Issue
Block a user