Updated Home (markdown)

davebrokit
2024-06-23 10:34:48 +01:00
parent 7f1e19d6e9
commit 6ac180f720

50
Home.md

@@ -97,10 +97,13 @@ using (var document = PdfDocument.Open(sourcePdfPath))
In this example, a more advanced document extraction is performed:
```cs
//using UglyToad.PdfPig;
//using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
//using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
//using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
//using UglyToad.PdfPig.Fonts.Standard14Fonts;
//using UglyToad.PdfPig.Writer;
var sourcePdfPath = "";
var outputPath = "";
@@ -112,32 +115,29 @@ using (var document = PdfDocument.Open(sourcePdfPath))
var pageBuilder = builder.AddPage(document, pageNumber);
pageBuilder.SetStrokeColor(0, 255, 0);
var page = document.GetPage(pageNumber);
foreach (var word in page.GetWords())
var letters = page.Letters; // no preprocessing
// 1. Extract words
var wordExtractor = NearestNeighbourWordExtractor.Instance;
var words = wordExtractor.GetWords(letters);
// 2. Segment page
var pageSegmenter = DocstrumBoundingBoxes.Instance;
var textBlocks = pageSegmenter.GetBlocks(words);
// 3. Postprocessing
var readingOrder = UnsupervisedReadingOrderDetector.Instance;
var orderedTextBlocks = readingOrder.Get(textBlocks);
// 4. Add debug info - Bounding boxes and reading order
foreach (var block in orderedTextBlocks)
{
var letters = page.Letters; // no preprocessing
// 1. Extract words
var wordExtractor = NearestNeighbourWordExtractor.Instance;
var words = wordExtractor.GetWords(letters);
// 2. Segment page
var pageSegmenter = DocstrumBoundingBoxes.Instance;
var textBlocks = pageSegmenter.GetBlocks(words);
// 3. Postprocessing
var readingOrder = UnsupervisedReadingOrderDetector.Instance;
var orderedTextBlocks = readingOrder.Get(textBlocks);
// 4. Add debug info - Bounding boxes and reading order
foreach (var block in orderedTextBlocks)
{
var bbox = block.BoundingBox;
pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height);
pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox, font);
}
var bbox = block.BoundingBox;
pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height);
pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox.TopLeft, font);
}
// 5. Write result to a file