diff --git a/Home.md b/Home.md index 868e99e..7e5ab57 100644 --- a/Home.md +++ b/Home.md @@ -97,10 +97,13 @@ using (var document = PdfDocument.Open(sourcePdfPath)) In this example a more advanced document extraction is performed ```cs +//using UglyToad.PdfPig; //using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; //using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector; //using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor; //using UglyToad.PdfPig.Fonts.Standard14Fonts; +//using UglyToad.PdfPig.Writer; + var sourcePdfPath = ""; var outputPath = ""; @@ -112,32 +115,29 @@ using (var document = PdfDocument.Open(sourcePdfPath)) var pageBuilder = builder.AddPage(document, pageNumber); pageBuilder.SetStrokeColor(0, 255, 0); var page = document.GetPage(pageNumber); - foreach (var word in page.GetWords()) + + var letters = page.Letters; // no preprocessing + + // 1. Extract words + var wordExtractor = NearestNeighbourWordExtractor.Instance; + + var words = wordExtractor.GetWords(letters); + + // 2. Segment page + var pageSegmenter = DocstrumBoundingBoxes.Instance; + + var textBlocks = pageSegmenter.GetBlocks(words); + + // 3. Postprocessing + var readingOrder = UnsupervisedReadingOrderDetector.Instance; + var orderedTextBlocks = readingOrder.Get(textBlocks); + + // 4. Add debug info - Bounding boxes and reading order + foreach (var block in orderedTextBlocks) { - - var letters = page.Letters; // no preprocessing - - // 1. Extract words - var wordExtractor = NearestNeighbourWordExtractor.Instance; - - var words = wordExtractor.GetWords(letters); - - // 2. Segment page - var pageSegmenter = DocstrumBoundingBoxes.Instance; - - var textBlocks = pageSegmenter.GetBlocks(words); - - // 3. Postprocessing - var readingOrder = UnsupervisedReadingOrderDetector.Instance; - var orderedTextBlocks = readingOrder.Get(textBlocks); - - // 4. Add debug info - Bounding boxes and reading order - foreach (var block in orderedTextBlocks) - { - var bbox = block.BoundingBox; - pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height); - pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox, font); - } + var bbox = block.BoundingBox; + pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height); + pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox.TopLeft, font); } // 5. Write result to a file