From dc933aede9d4ddffed11fcbb1dd7d4de9793a4cd Mon Sep 17 00:00:00 2001 From: davebrokit <87085235+davebrokit@users.noreply.github.com> Date: Sun, 23 Jun 2024 10:31:47 +0100 Subject: [PATCH] Fix issues in Advanced Example in README.md --- README.md | 56 +++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 69a6e845..4ef2a379 100644 --- a/README.md +++ b/README.md @@ -84,52 +84,52 @@ In this example a more advanced document extraction is performed. PdfDocumentBui ```cs +//using UglyToad.PdfPig; //using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; //using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector; //using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor; //using UglyToad.PdfPig.Fonts.Standard14Fonts; +//using UglyToad.PdfPig.Writer; + var sourcePdfPath = ""; var outputPath = ""; var pageNumber = 1; using (var document = PdfDocument.Open(sourcePdfPath)) { - var builder = new PdfDocumentBuilder { }; - PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica); - var pageBuilder = builder.AddPage(document, pageNumber); - pageBuilder.SetStrokeColor(0, 255, 0); - var page = document.GetPage(pageNumber); - foreach (var word in page.GetWords()) - { + var builder = new PdfDocumentBuilder { }; + PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica); + var pageBuilder = builder.AddPage(document, pageNumber); + pageBuilder.SetStrokeColor(0, 255, 0); + var page = document.GetPage(pageNumber); - var letters = page.Letters; // no preprocessing + var letters = page.Letters; // no preprocessing - // 1. Extract words - var wordExtractor = NearestNeighbourWordExtractor.Instance; + // 1. Extract words + var wordExtractor = NearestNeighbourWordExtractor.Instance; - var words = wordExtractor.GetWords(letters); + var words = wordExtractor.GetWords(letters); - // 2. Segment page - var pageSegmenter = DocstrumBoundingBoxes.Instance; + // 2. Segment page + var pageSegmenter = DocstrumBoundingBoxes.Instance; - var textBlocks = pageSegmenter.GetBlocks(words); + var textBlocks = pageSegmenter.GetBlocks(words); - // 3. Postprocessing - var readingOrder = UnsupervisedReadingOrderDetector.Instance; - var orderedTextBlocks = readingOrder.Get(textBlocks); + // 3. Postprocessing + var readingOrder = UnsupervisedReadingOrderDetector.Instance; + var orderedTextBlocks = readingOrder.Get(textBlocks); - // 4. Add debug info - Bounding boxes and reading order - foreach (var block in orderedTextBlocks) - { - var bbox = block.BoundingBox; - pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height); - pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox, font); - } - } + // 4. Add debug info - Bounding boxes and reading order + foreach (var block in orderedTextBlocks) + { + var bbox = block.BoundingBox; + pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height); + pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox.TopLeft, font); + } - // 5. Write result to a file - byte[] fileBytes = builder.Build(); - File.WriteAllBytes(outputPath, fileBytes); // save to file + // 5. Write result to a file + byte[] fileBytes = builder.Build(); + File.WriteAllBytes(outputPath, fileBytes); // save to file } ```