Updated Document Layout Analysis (markdown)

2025-08-20 09:21:57 +08:00 · 2024-05-30 09:31:54 +01:00 · 2024-05-30 09:31:54 +01:00 · 03343b3239
commit 03343b3239
parent 0d8e113f50
1 changed files with 89 additions and 0 deletions
--- a/Document-Layout-Analysis.md
+++ b/Document-Layout-Analysis.md
@ -230,6 +230,95 @@ var recursiveXYCut = new RecursiveXYCut(new RecursiveXYCut.RecursiveXYCutOptions
 var blocks = recursiveXYCut.GetBlocks(words);
 ```

+A more advanced example that reads a two-column document in the correct order. See [#721](https://github.com/UglyToad/PdfPig/issues/721).
+![Reading two columns PDF correctly](https://github.com/UglyToad/PdfPig/assets/486781/7c503d6e-2769-4279-93d1-691e981e28cc)
+```cs
+ static void testAdv()
+ {
+     // Extracts the text of every page of the PDF, with the text blocks ordered
+     // by 'UnsupervisedReadingOrderDetector', and prints it to the console.
+     string pdfFilePath = @"...\2302.14502.pdf";
+     var output = new StringBuilder();
+
+     using (var document = UglyToad.PdfPig.PdfDocument.Open(pdfFilePath))
+     {
+         foreach (var page in document.GetPages())
+         {
+             // 0. Preprocessing: none, the letters of the page are used as they are
+             var letters = page.Letters;
+
+             // 1. Extract words
+             // Alternative: var wordExtractor = NearestNeighbourWordExtractor.Instance;
+             var wordExtractorOptions = new NearestNeighbourWordExtractor.NearestNeighbourWordExtractorOptions()
+             {
+                 // Returns true when the pivot and candidate letters can belong
+                 // to the same word, false otherwise.
+                 Filter = (pivot, candidate) =>
+                 {
+                     // White space check (default implementation of 'Filter'):
+                     // a null or white space candidate cannot belong to the same
+                     // word as the pivot. 'FilterPivot' already checks if the
+                     // pivot is null or white space by default.
+                     if (string.IsNullOrWhiteSpace(candidate.Value))
+                     {
+                         return false;
+                     }
+
+                     // Height check: the letters cannot belong to the same word
+                     // if one letter is more than twice the size of the other.
+                     // The check is skipped when the smaller size is 0.
+                     var largerSize = Math.Max(pivot.PointSize, candidate.PointSize);
+                     var smallerSize = Math.Min(pivot.PointSize, candidate.PointSize);
+                     if (smallerSize != 0 && largerSize / smallerSize > 2.0)
+                     {
+                         return false;
+                     }
+
+                     // Colour check: the letters can only belong to the same word
+                     // if the values returned by 'ToRGBValues()' are equal.
+                     var pivotColour = pivot.Color.ToRGBValues();
+                     var candidateColour = candidate.Color.ToRGBValues();
+                     return pivotColour.Equals(candidateColour);
+                 }
+             };
+
+             var wordExtractor = new NearestNeighbourWordExtractor(wordExtractorOptions);
+             var words = wordExtractor.GetWords(letters);
+
+             // 2. Segment page
+             // Alternative: var pageSegmenter = DocstrumBoundingBoxes.Instance;
+             var pageSegmenterOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions()
+             {
+                 // no option is overridden here; add property initialisers to customise
+             };
+
+             var pageSegmenter = new DocstrumBoundingBoxes(pageSegmenterOptions);
+             var textBlocks = pageSegmenter.GetBlocks(words);
+
+             // 3. Postprocessing: order the text blocks
+             var orderedTextBlocks = UnsupervisedReadingOrderDetector.Instance.Get(textBlocks);
+
+             // 4. Extract text, each block followed by a line break
+             foreach (var block in orderedTextBlocks)
+             {
+                 // normalise text (Unicode normalisation form KC)
+                 output.AppendLine(block.Text.Normalize(NormalizationForm.FormKC));
+             }
+
+             // Blank line after each page
+             output.AppendLine();
+         }
+     }
+
+     Console.WriteLine(output.ToString());
+ }
+```
+
 ### Results
 ![recursive xy cut example](https://github.com/UglyToad/PdfPig/blob/master/documentation/Document%20Layout%20Analysis/rxyc%20example.png)