Fix issues in Advanced Example in README.md

This commit is contained in:
davebrokit
2024-06-23 10:31:47 +01:00
committed by BobLd
parent d2cae7985c
commit dc933aede9

View File

@@ -84,52 +84,52 @@ In this example a more advanced document extraction is performed. PdfDocumentBui
```cs
// Advanced example: opens an existing PDF, adds one of its pages to a new
// document, runs word extraction -> page segmentation -> reading-order
// detection on that page, and draws each detected block's bounding box and
// reading-order index on the new page as debug output.
//
// Namespaces this snippet relies on (uncomment when pasting into a project):
//using UglyToad.PdfPig;
//using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
//using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
//using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
//using UglyToad.PdfPig.Fonts.Standard14Fonts;
//using UglyToad.PdfPig.Writer;

// Placeholders: set these to real paths before running.
var sourcePdfPath = "";
var outputPath = "";
var pageNumber = 1;

using (var document = PdfDocument.Open(sourcePdfPath))
{
    var builder = new PdfDocumentBuilder { };
    PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica);

    // Add the source page to the output document so the debug drawing below
    // is layered on top of it.
    var pageBuilder = builder.AddPage(document, pageNumber);
    pageBuilder.SetStrokeColor(0, 255, 0);

    var page = document.GetPage(pageNumber);

    // The layout analysis runs once for the whole page. It must not be wrapped
    // in a per-word loop: that would repeat the full pipeline and redraw every
    // box and label once for each word on the page.
    var letters = page.Letters; // no preprocessing

    // 1. Extract words
    var wordExtractor = NearestNeighbourWordExtractor.Instance;
    var words = wordExtractor.GetWords(letters);

    // 2. Segment page
    var pageSegmenter = DocstrumBoundingBoxes.Instance;
    var textBlocks = pageSegmenter.GetBlocks(words);

    // 3. Postprocessing
    var readingOrder = UnsupervisedReadingOrderDetector.Instance;
    var orderedTextBlocks = readingOrder.Get(textBlocks);

    // 4. Add debug info - Bounding boxes and reading order
    foreach (var block in orderedTextBlocks)
    {
        var bbox = block.BoundingBox;
        pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height);

        // AddText is given a position (a point), not the rectangle itself,
        // so the label is anchored at the block's top-left corner.
        pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox.TopLeft, font);
    }

    // 5. Write result to a file
    byte[] fileBytes = builder.Build();
    File.WriteAllBytes(outputPath, fileBytes); // save to file
}
```