Updated Home (markdown)

2025-10-07 16:14:03 +08:00 · 2024-05-27 19:59:20 +01:00
parent 3915ffd09e
commit 7722a7f82e
1 changed files with 56 additions and 4 deletions
--- a/Home.md
+++ b/Home.md
@@ -69,11 +69,14 @@ byte[] b = builder.Build();

 The resulting bytes are a valid PDF document and can be saved to the file system, served from a web server, etc.

-You can use document builder to visualise what pdf pig has done for document reading by copying the pdf and drawing rectangles using bounding boxes information.
+You can use the document builder to visualise what PdfPig has done by copying the PDF and drawing rectangles around the words using their bounding box information.

 ```cs
-using UglyToad.PdfPig;
-using UglyToad.PdfPig.Writer;
+//using UglyToad.PdfPig;
+//using UglyToad.PdfPig.Writer;
+
+var sourcePdfPath = "";
+var outputPath = "";

 using (var document = PdfDocument.Open(sourcePdfPath))
 {
@@ -92,7 +95,56 @@ using (var document = PdfDocument.Open(sourcePdfPath))
 }
 ```

-View this gist that goes through some basic beginner examples: https://gist.github.com/cordasfilip/c6d2510b358323dc2f71c843460cbcdf
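+In this example, a more advanced document extraction is performed: words are extracted, the page is segmented into text blocks, and each block is outlined and labelled with its reading order: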
+```cs
+//using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
+//using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
+//using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
+//using UglyToad.PdfPig.Fonts.Standard14Fonts;
+
+// Replace the sample input path, output path and page number below with your own values.
+var sourcePdfPath = IntegrationHelpers.GetDocumentPath("SPARC - v9 Architecture Manual");
+var outputPath = "C:\\Dev\\Temp\\Test2.pdf";
+var pageNumber = 2;
+using (var document = PdfDocument.Open(sourcePdfPath))
+{
+    var builder = new PdfDocumentBuilder { };
+    PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica);
+
+    // Copy the source page into the new document so the debug overlay is drawn on top of it.
+    var pageBuilder = builder.AddPage(document, pageNumber);
+    pageBuilder.SetStrokeColor(0, 255, 0);
+    var page = document.GetPage(pageNumber);
+
+    var letters = page.Letters; // no preprocessing
+
+    // 1. Extract words
+    var wordExtractor = NearestNeighbourWordExtractor.Instance;
+    var words = wordExtractor.GetWords(letters);
+
+    // 2. Segment page
+    var pageSegmenter = DocstrumBoundingBoxes.Instance;
+    var textBlocks = pageSegmenter.GetBlocks(words);
+
+    // 3. Postprocessing
+    var readingOrder = UnsupervisedReadingOrderDetector.Instance;
+    var orderedTextBlocks = readingOrder.Get(textBlocks);
+
+    // 4. Add debug info - Bounding boxes and reading order
+    // The layout analysis above runs once for the page; only the drawing is repeated per block.
+    foreach (var block in orderedTextBlocks)
+    {
+        var bbox = block.BoundingBox;
+        pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height);
+        // The label is positioned at a point, so anchor it at the block's top-left corner.
+        pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox.TopLeft, font);
+    }
+
+    // 5. Write result to a file
+    byte[] fileBytes = builder.Build();
+    File.WriteAllBytes(outputPath, fileBytes); // save to file
+}
+```