mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-07 16:14:03 +08:00
Updated Home (markdown)
60
Home.md
60
Home.md
@@ -69,11 +69,14 @@ byte[] b = builder.Build();
|
||||
|
||||
The resulting bytes are a valid PDF document and can be saved to the file system, served from a web server, etc.
|
||||
|
||||
You can use document builder to visualise what pdf pig has done for document reading by copying the pdf and drawing rectangles using bounding boxes information.
|
||||
You can use the document builder to visualise what PdfPig has done by copying the PDF and drawing rectangles around the words using their bounding box information.
|
||||
|
||||
```cs
|
||||
using UglyToad.PdfPig;
|
||||
using UglyToad.PdfPig.Writer;
|
||||
//using UglyToad.PdfPig;
|
||||
//using UglyToad.PdfPig.Writer;
|
||||
|
||||
var sourcePdfPath = "";
|
||||
var outputPath = "";
|
||||
|
||||
using (var document = PdfDocument.Open(sourcePdfPath))
|
||||
{
|
||||
@@ -92,7 +95,56 @@ using (var document = PdfDocument.Open(sourcePdfPath))
|
||||
}
|
||||
```
|
||||
|
||||
View this gist that goes through some basic beginner examples: https://gist.github.com/cordasfilip/c6d2510b358323dc2f71c843460cbcdf
|
||||
In this example, a more advanced document extraction is performed:
|
||||
```cs
|
||||
//using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||
//using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
|
||||
//using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
||||
//using UglyToad.PdfPig.Fonts.Standard14Fonts;
|
||||
|
||||
var sourcePdfPath = IntegrationHelpers.GetDocumentPath("SPARC - v9 Architecture Manual");
|
||||
var outputPath = "C:\\Dev\\Temp\\Test2.pdf";
|
||||
var pageNumber = 2;
|
||||
using (var document = PdfDocument.Open(sourcePdfPath))
|
||||
{
|
||||
var builder = new PdfDocumentBuilder { };
|
||||
PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica);
|
||||
var pageBuilder = builder.AddPage(document, pageNumber);
|
||||
pageBuilder.SetStrokeColor(0, 255, 0);
|
||||
var page = document.GetPage(pageNumber);
|
||||
foreach (var word in page.GetWords())
|
||||
{
|
||||
|
||||
var letters = page.Letters; // no preprocessing
|
||||
|
||||
// 1. Extract words
|
||||
var wordExtractor = NearestNeighbourWordExtractor.Instance;
|
||||
|
||||
var words = wordExtractor.GetWords(letters);
|
||||
|
||||
// 2. Segment page
|
||||
var pageSegmenter = DocstrumBoundingBoxes.Instance;
|
||||
|
||||
var textBlocks = pageSegmenter.GetBlocks(words);
|
||||
|
||||
// 3. Postprocessing
|
||||
var readingOrder = UnsupervisedReadingOrderDetector.Instance;
|
||||
var orderedTextBlocks = readingOrder.Get(textBlocks);
|
||||
|
||||
// 4. Add debug info - Bounding boxes and reading order
|
||||
foreach (var block in orderedTextBlocks)
|
||||
{
|
||||
var bbox = block.BoundingBox;
|
||||
pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height);
|
||||
pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox, font);
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Write result to a file
|
||||
byte[] fileBytes = builder.Build();
|
||||
File.WriteAllBytes(outputPath, fileBytes); // save to file
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user