mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-22 20:13:58 +08:00

- Create a TextBlock class - Creates IPageSegmenter - Add other useful distances: angle, etc. - Update RecursiveXYCut - With IPageSegmenter and TextBlock - Make XYNode and XYLeaf internal - Optimise (faster) NearestNeighbourWordExtractor and isolate the clustering algorithms for use outside of this class - Implement a Docstrum inspired page segmentation algorithm
20 lines
776 B
C#
20 lines
776 B
C#
using System.Collections.Generic;
|
|
using UglyToad.PdfPig.Content;
|
|
|
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|
{
|
|
/// <summary>
|
|
/// Contract for page segmentation algorithms. Page segmentation divides a page into areas, each consisting of a layout structure (blocks, lines, etc.).
|
|
/// <para>See 'Performance Comparison of Six Algorithms for Page Segmentation' by Faisal Shafait, Daniel Keysers, and Thomas M. Breuel.</para>
|
|
/// </summary>
|
|
public interface IPageSegmenter
|
|
{
|
|
/// <summary>
|
|
/// Gets the text blocks that this segmentation approach produces for the supplied words.
|
|
/// </summary>
|
|
/// <param name="pageWords">The words to generate text blocks for (presumably the words of a single page, going by the parameter name).</param>
|
|
/// <returns>A read-only list of text blocks from this approach.</returns>
/// <remarks>
/// NOTE(review): this interface does not specify the ordering of the returned blocks, nor how a null or empty
/// <paramref name="pageWords"/> is handled; confirm against each implementation.
/// </remarks>
|
|
IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords);
|
|
}
|
|
}
|