Updated Home (markdown)

davebrokit
2024-05-27 19:59:20 +01:00
parent 3915ffd09e
commit 7722a7f82e

60
Home.md

@@ -69,11 +69,14 @@ byte[] b = builder.Build();
The resulting bytes are a valid PDF document and can be saved to the file system, served from a web server, etc.
You can use document builder to visualise what pdf pig has done for document reading by copying the pdf and drawing rectangles using bounding boxes information.
You can use the document builder to visualise what PdfPig has done by copying the PDF and drawing rectangles around the words using their bounding box information.
```cs
using UglyToad.PdfPig;
using UglyToad.PdfPig.Writer;
//using UglyToad.PdfPig;
//using UglyToad.PdfPig.Writer;
var sourcePdfPath = "";
var outputPath = "";
using (var document = PdfDocument.Open(sourcePdfPath))
{
@@ -92,7 +95,56 @@ using (var document = PdfDocument.Open(sourcePdfPath))
}
```
View this gist that goes through some basic beginner examples: https://gist.github.com/cordasfilip/c6d2510b358323dc2f71c843460cbcdf
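In this example, a more advanced document extraction is performed: words are extracted, grouped into text blocks, put into reading order, and each block's bounding box and reading-order index are drawn onto the page.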
```cs
// Layout-analysis example: extract words from one page, segment them into text
// blocks, detect their reading order, then draw each block's bounding box and
// reading-order index onto that page in a new document.
//
// Namespaces used by this snippet (uncomment when pasting into a source file):
//using System.IO;
//using UglyToad.PdfPig;
//using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
//using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
//using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
//using UglyToad.PdfPig.Fonts.Standard14Fonts;
//using UglyToad.PdfPig.Writer;

// NOTE: IntegrationHelpers is not defined in this snippet; substitute the path to your own PDF.
var sourcePdfPath = IntegrationHelpers.GetDocumentPath("SPARC - v9 Architecture Manual");
var outputPath = "C:\\Dev\\Temp\\Test2.pdf";
var pageNumber = 2;

using (var document = PdfDocument.Open(sourcePdfPath))
{
    var builder = new PdfDocumentBuilder { };
    PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica);

    // Add the source page to the output document so the debug drawing is layered over it.
    var pageBuilder = builder.AddPage(document, pageNumber);
    pageBuilder.SetStrokeColor(0, 255, 0);

    var page = document.GetPage(pageNumber);

    // The analysis runs once for the whole page. It must not be wrapped in a
    // per-word loop: that would repeat the identical analysis and drawing for
    // every word on the page.
    var letters = page.Letters; // no preprocessing

    // 1. Extract words
    var wordExtractor = NearestNeighbourWordExtractor.Instance;
    var words = wordExtractor.GetWords(letters);

    // 2. Segment page
    var pageSegmenter = DocstrumBoundingBoxes.Instance;
    var textBlocks = pageSegmenter.GetBlocks(words);

    // 3. Postprocessing
    var readingOrder = UnsupervisedReadingOrderDetector.Instance;
    var orderedTextBlocks = readingOrder.Get(textBlocks);

    // 4. Add debug info - Bounding boxes and reading order
    foreach (var block in orderedTextBlocks)
    {
        var bbox = block.BoundingBox;
        pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height);

        // AddText is given a single point (the rectangle's top-left corner), not the rectangle itself.
        pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox.TopLeft, font);
    }

    // 5. Write result to a file
    byte[] fileBytes = builder.Build();
    File.WriteAllBytes(outputPath, fileBytes); // save to file
}
```