add content order text extractor and example of use

2026-03-10 00:23:29 +08:00 · 2020-04-19 17:06:34 +01:00
parent f18bc0766a
commit 407ee5ca51
8 changed files with 202 additions and 3 deletions
--- a/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs
@@ -1,4 +1,6 @@
-namespace UglyToad.PdfPig.Tests.Integration
+using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
+
+namespace UglyToad.PdfPig.Tests.Integration
 {
    using System;
    using System.Collections.Generic;
@@ -159,6 +161,19 @@ used per estimate, we introduce a “complement class”
Naive Bayes is often us
                Assert.NotNull(svg);
            }
        }
+
+        [Fact]
+        public void CanExtractContentOrderText()
+        {
+            using (var document = PdfDocument.Open(GetFilename()))
+            {
+                foreach (var page in document.GetPages())
+                {
+                    var text = ContentOrderTextExtractor.GetText(page);
+                    Assert.NotNull(text);
+                }
+            }
+        }

        private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
        {