mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-08-20 09:21:57 +08:00
Updated Document Layout Analysis (markdown)
parent
0d8e113f50
commit
03343b3239
@ -230,6 +230,95 @@ var recursiveXYCut = new RecursiveXYCut(new RecursiveXYCut.RecursiveXYCutOptions
|
||||
var blocks = recursiveXYCut.GetBlocks(words);
|
||||
```
|
||||
|
||||
A more advanced example that extracts the text of a two-column document in reading order (see issue [#721](https://github.com/UglyToad/PdfPig/issues/721)):
|
||||

|
||||
```cs
|
||||
static void testAdv()
{
    // Placeholder path - point this at the PDF you want to read.
    string pdfFilePath = @"...\2302.14502.pdf";

    // Collects the text of every page; printed once at the very end.
    var output = new StringBuilder();

    using (var document = UglyToad.PdfPig.PdfDocument.Open(pdfFilePath))
    {
        foreach (var page in document.GetPages())
        {
            // Step 0 - preprocessing: the page letters are used as they are.
            var letters = page.Letters;

            // Step 1 - word extraction.
            // (To use the stock extractor instead: NearestNeighbourWordExtractor.Instance)
            var extractorOptions = new NearestNeighbourWordExtractor.NearestNeighbourWordExtractorOptions()
            {
                // Decides whether 'candidate' may join the same word as 'pivot'.
                Filter = (pivot, candidate) =>
                {
                    // Default 'Filter' behaviour: a null or white space candidate
                    // never shares a word with the pivot. ('FilterPivot' already
                    // performs the equivalent check on the pivot by default.)
                    if (string.IsNullOrWhiteSpace(candidate.Value))
                    {
                        return false;
                    }

                    // Size rule: reject the pair when one letter is more than
                    // twice the point size of the other. A zero size skips the
                    // rule so the division below is never by zero.
                    var largerSize = Math.Max(pivot.PointSize, candidate.PointSize);
                    var smallerSize = Math.Min(pivot.PointSize, candidate.PointSize);
                    if (smallerSize != 0 && largerSize / smallerSize > 2.0)
                    {
                        return false;
                    }

                    // Colour rule: both letters must have identical RGB values.
                    return pivot.Color.ToRGBValues().Equals(candidate.Color.ToRGBValues());
                }
            };

            var words = new NearestNeighbourWordExtractor(extractorOptions).GetWords(letters);

            // Step 2 - page segmentation, with no option overridden.
            // (To use the stock segmenter instead: DocstrumBoundingBoxes.Instance)
            var segmenterOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions();
            var textBlocks = new DocstrumBoundingBoxes(segmenterOptions).GetBlocks(words);

            // Step 3 - postprocessing: put the blocks into reading order.
            var orderedTextBlocks = UnsupervisedReadingOrderDetector.Instance.Get(textBlocks);

            // Step 4 - text extraction: one NFKC-normalised block per line.
            foreach (var block in orderedTextBlocks)
            {
                output.AppendLine(block.Text.Normalize(NormalizationForm.FormKC));
            }

            // Blank line between pages.
            output.AppendLine();
        }
    }

    Console.WriteLine(output.ToString());
}
|
||||
```
|
||||
|
||||
### Results
|
||||

|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user