mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-07 16:14:03 +08:00
Fix issues in Advanced Example in README.md
This commit is contained in:
56
README.md
56
README.md
@@ -84,52 +84,52 @@ In this example a more advanced document extraction is performed. PdfDocumentBui
|
|||||||
|
|
||||||
|
|
||||||
```cs
|
```cs
|
||||||
|
//using UglyToad.PdfPig;
|
||||||
//using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
//using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||||
//using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
|
//using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
|
||||||
//using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
//using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
||||||
//using UglyToad.PdfPig.Fonts.Standard14Fonts;
|
//using UglyToad.PdfPig.Fonts.Standard14Fonts;
|
||||||
|
//using UglyToad.PdfPig.Writer;
|
||||||
|
|
||||||
|
|
||||||
var sourcePdfPath = "";
|
var sourcePdfPath = "";
|
||||||
var outputPath = "";
|
var outputPath = "";
|
||||||
var pageNumber = 1;
|
var pageNumber = 1;
|
||||||
using (var document = PdfDocument.Open(sourcePdfPath))
|
using (var document = PdfDocument.Open(sourcePdfPath))
|
||||||
{
|
{
|
||||||
var builder = new PdfDocumentBuilder { };
|
var builder = new PdfDocumentBuilder { };
|
||||||
PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica);
|
PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica);
|
||||||
var pageBuilder = builder.AddPage(document, pageNumber);
|
var pageBuilder = builder.AddPage(document, pageNumber);
|
||||||
pageBuilder.SetStrokeColor(0, 255, 0);
|
pageBuilder.SetStrokeColor(0, 255, 0);
|
||||||
var page = document.GetPage(pageNumber);
|
var page = document.GetPage(pageNumber);
|
||||||
foreach (var word in page.GetWords())
|
|
||||||
{
|
|
||||||
|
|
||||||
var letters = page.Letters; // no preprocessing
|
var letters = page.Letters; // no preprocessing
|
||||||
|
|
||||||
// 1. Extract words
|
// 1. Extract words
|
||||||
var wordExtractor = NearestNeighbourWordExtractor.Instance;
|
var wordExtractor = NearestNeighbourWordExtractor.Instance;
|
||||||
|
|
||||||
var words = wordExtractor.GetWords(letters);
|
var words = wordExtractor.GetWords(letters);
|
||||||
|
|
||||||
// 2. Segment page
|
// 2. Segment page
|
||||||
var pageSegmenter = DocstrumBoundingBoxes.Instance;
|
var pageSegmenter = DocstrumBoundingBoxes.Instance;
|
||||||
|
|
||||||
var textBlocks = pageSegmenter.GetBlocks(words);
|
var textBlocks = pageSegmenter.GetBlocks(words);
|
||||||
|
|
||||||
// 3. Postprocessing
|
// 3. Postprocessing
|
||||||
var readingOrder = UnsupervisedReadingOrderDetector.Instance;
|
var readingOrder = UnsupervisedReadingOrderDetector.Instance;
|
||||||
var orderedTextBlocks = readingOrder.Get(textBlocks);
|
var orderedTextBlocks = readingOrder.Get(textBlocks);
|
||||||
|
|
||||||
// 4. Add debug info - Bounding boxes and reading order
|
// 4. Add debug info - Bounding boxes and reading order
|
||||||
foreach (var block in orderedTextBlocks)
|
foreach (var block in orderedTextBlocks)
|
||||||
{
|
{
|
||||||
var bbox = block.BoundingBox;
|
var bbox = block.BoundingBox;
|
||||||
pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height);
|
pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height);
|
||||||
pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox, font);
|
pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox.TopLeft, font);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// 5. Write result to a file
|
// 5. Write result to a file
|
||||||
byte[] fileBytes = builder.Build();
|
byte[] fileBytes = builder.Build();
|
||||||
File.WriteAllBytes(outputPath, fileBytes); // save to file
|
File.WriteAllBytes(outputPath, fileBytes); // save to file
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user