Merge pull request #517 from fnatzke/master

Fixes for Issue#512, 516 and 519
2026-03-10 00:23:29 +08:00 · 2022-12-09 09:39:55 -05:00
parent 060c7bc728 f5fe39b285
commit 2aed996319
9 changed files with 61 additions and 27 deletions
--- a/examples/AdvancedTextExtraction.cs
+++ b/examples/AdvancedTextExtraction.cs
@@ -10,7 +10,8 @@
    public static class AdvancedTextExtraction
    {
        public static void Run(string filePath)
-        {
+        {
+#if YET_TO_BE_DONE
            var sb = new StringBuilder();

            using (var document = PdfDocument.Open(filePath))
@@ -86,6 +87,7 @@
            }

            Console.WriteLine(sb.ToString());
+#endif
        }
    }
 }
--- a/examples/Program.cs
+++ b/examples/Program.cs
@@ -45,9 +45,14 @@
                },
                {7,
                    ("Advance text extraction using layout analysis algorithms",
-                    () => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
-                }
-            };
+                    () => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))                
+                },
+                {
+                8,
+                    ("Extract Words with newline detection (example with algorithm). Issue 512",
+                    () => OpenDocumentAndExtractWords.Run(Path.Combine(filesDirectory, "OPEN.RABBIT.ENGLISH.LOP.pdf")))
+                } 
+        };

            var choices = string.Join(Environment.NewLine, examples.Select(x => $"{x.Key}: {x.Value.name}"));

--- a/src/UglyToad.PdfPig.Tests/Integration/Documents/OPEN.RABBIT.ENGLISH.LOP.pdf
+++ b/src/UglyToad.PdfPig.Tests/Integration/Documents/OPEN.RABBIT.ENGLISH.LOP.pdf
--- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs
@@ -51,7 +51,7 @@
            var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);

            Assert.Equal(1.2m, result.Version);
-            Assert.Equal(TestEnvironment.IsUnixPlatform ? 7 : 9, result.OffsetInFile);
+            Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile);
        }

        [Fact]
@@ -66,38 +66,42 @@

        [Fact]
        public void HeaderPrecededByJunkNonLenientDoesNotThrow()
-        {
-            var scanner = StringBytesTestConverter.Scanner(@"one    
-    %PDF-1.2");
+        {
+            var input = @"one    
+    %PDF-1.2";
+            var scanner = StringBytesTestConverter.Scanner(input);

            var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);

            Assert.Equal(1.2m, result.Version);
-            Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
+            Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
        }

        [Fact]
        public void HeaderPrecededByJunkLenientReads()
-        {
-            var scanner = StringBytesTestConverter.Scanner(@"one    
-    %PDF-1.7");
+        {
+            var input = @"one    
+    %PDF-1.7";
+            var scanner = StringBytesTestConverter.Scanner(input);

            var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);

            Assert.Equal(1.7m, result.Version);
-            Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
+            Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
        }

        [Fact]
        public void HeaderPrecededByJunkDoesNotThrow()
-        {
-            var scanner = StringBytesTestConverter.Scanner(@"one two
-three %PDF-1.6");
+        {
+            var s = @"one two
+three %PDF-1.6";
+              
+            var scanner = StringBytesTestConverter.Scanner(s);

            var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);

            Assert.Equal(1.6m, result.Version);
-            Assert.Equal(TestEnvironment.IsUnixPlatform ? 14 : 15, result.OffsetInFile);
+            Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 14 : 15, result.OffsetInFile);
        }

        [Fact]
--- a/src/UglyToad.PdfPig.Tests/TestEnvironment.cs
+++ b/src/UglyToad.PdfPig.Tests/TestEnvironment.cs
@@ -4,6 +4,7 @@

    public static class TestEnvironment
    {
-        public static readonly bool IsUnixPlatform = Environment.NewLine.Length == 1;
+        public static bool IsSingleByteNewLine(string s) => s.IndexOf('\r') < 0;
+            
    }
 }
--- a/src/UglyToad.PdfPig/Content/Catalog.cs
+++ b/src/UglyToad.PdfPig/Content/Catalog.cs
@@ -29,7 +29,12 @@
        /// <summary>
        /// The page tree for this document containing all pages, page numbers and their dictionaries.
        /// </summary>
-        public PageTreeNode PageTree { get; }
+        public PageTreeNode PageTree { get; }
+
+        /// <summary>
+        /// Number of discovered pages.
+        /// </summary>
+        public int? NumberOfDiscoveredPages => pagesByNumber?.Count;

        /// <summary>
        /// Create a new <see cref="CatalogDictionary"/>.
--- a/src/UglyToad.PdfPig/Content/Pages.cs
+++ b/src/UglyToad.PdfPig/Content/Pages.cs
@@ -21,6 +21,13 @@
            this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));

            Count = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
+            // Prefer the number of pages reported by the catalog over the dictionary's Count entry when they differ.
+            var numberOfDiscoveredPages = catalog.NumberOfDiscoveredPages;
+            if (numberOfDiscoveredPages.HasValue && Count != numberOfDiscoveredPages.Value)
+            {
+                // TODO: log a warning that the dictionary page count differs from the discovered page count.
+                Count = numberOfDiscoveredPages.Value;
+            }
        }
        
        public Page GetPage(int pageNumber, InternalParsingOptions parsingOptions)
--- a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs
@@ -81,11 +81,13 @@
                pageNumber.Increment();

                return new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray<PageTreeNode>.Instance);
-            }
-            
-            
-
-            //If we got here, we have to iterate till we manage to exit
+            }
+
+
+
+            //If we got here, we have to iterate till we manage to exit
+
+            var visitedReferences = new HashSet<IndirectReference>(); // References already processed. The references themselves are stored (not their hash codes) so two different references with colliding hash codes are never confused.

            var toProcess =
                new Queue<(PageTreeNode thisPage, IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference,
@@ -102,8 +104,16 @@

            do
            {
-                var current = toProcess.Dequeue();
-
+                var current = toProcess.Dequeue();
+                // Skip any node whose reference has already been processed, so that a
+                // page tree which points back at itself cannot cause an infinite loop.
+                // Issue #512. HashSet.Add returns false when the reference is already
+                // in the set, so a single call both checks and records the visit.
+                if (!visitedReferences.Add(current.reference))
+                {
+                    continue;
+                }
+
                if (!current.nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
                {
                    if (!isLenientParsing)
--- a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
+++ b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
@@ -176,7 +176,7 @@

            const string searchTerm = "%%EOF";

-            var minimumEndOffset = bytes.Length - searchTerm.Length;
+            var minimumEndOffset = bytes.Length - searchTerm.Length + 1; // Issue #512 - Unable to open PDF: the brute-force scan started from the earlier of two EOF markers because the minimum end offset was off by one.

            bytes.Seek(minimumEndOffset);