diff --git a/examples/AdvancedTextExtraction.cs b/examples/AdvancedTextExtraction.cs index ff605fb5..036ed1aa 100644 --- a/examples/AdvancedTextExtraction.cs +++ b/examples/AdvancedTextExtraction.cs @@ -10,7 +10,8 @@ public static class AdvancedTextExtraction { public static void Run(string filePath) - { + { +#if YET_TO_BE_DONE var sb = new StringBuilder(); using (var document = PdfDocument.Open(filePath)) @@ -86,6 +87,7 @@ } Console.WriteLine(sb.ToString()); +#endif } } } diff --git a/examples/Program.cs b/examples/Program.cs index 5297a657..108128d6 100644 --- a/examples/Program.cs +++ b/examples/Program.cs @@ -45,9 +45,14 @@ }, {7, ("Advance text extraction using layout analysis algorithms", - () => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf"))) - } - }; + () => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf"))) + }, + { + 8, + ("Extract Words with newline detection (example with algorithm). Issue 512", + () => OpenDocumentAndExtractWords.Run(Path.Combine(filesDirectory, "OPEN.RABBIT.ENGLISH.LOP.pdf"))) + } + }; var choices = string.Join(Environment.NewLine, examples.Select(x => $"{x.Key}: {x.Value.name}")); diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/OPEN.RABBIT.ENGLISH.LOP.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/OPEN.RABBIT.ENGLISH.LOP.pdf new file mode 100644 index 00000000..0ca6c9a6 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/OPEN.RABBIT.ENGLISH.LOP.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs index 374b8396..6a50d0c6 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs @@ -51,7 +51,7 @@ var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); Assert.Equal(1.2m, result.Version); - 
Assert.Equal(TestEnvironment.IsUnixPlatform ? 7 : 9, result.OffsetInFile); + Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile); } [Fact] @@ -66,38 +66,42 @@ [Fact] public void HeaderPrecededByJunkNonLenientDoesNotThrow() - { - var scanner = StringBytesTestConverter.Scanner(@"one - %PDF-1.2"); + { + var input = @"one + %PDF-1.2"; + var scanner = StringBytesTestConverter.Scanner(input); var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log); Assert.Equal(1.2m, result.Version); - Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile); + Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile); } [Fact] public void HeaderPrecededByJunkLenientReads() - { - var scanner = StringBytesTestConverter.Scanner(@"one - %PDF-1.7"); + { + var input = @"one + %PDF-1.7"; + var scanner = StringBytesTestConverter.Scanner(input); var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); Assert.Equal(1.7m, result.Version); - Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile); + Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile); } [Fact] public void HeaderPrecededByJunkDoesNotThrow() - { - var scanner = StringBytesTestConverter.Scanner(@"one two -three %PDF-1.6"); + { + var s = @"one two +three %PDF-1.6"; + + var scanner = StringBytesTestConverter.Scanner(s); var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log); Assert.Equal(1.6m, result.Version); - Assert.Equal(TestEnvironment.IsUnixPlatform ? 14 : 15, result.OffsetInFile); + Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 
14 : 15, result.OffsetInFile); } [Fact] diff --git a/src/UglyToad.PdfPig.Tests/TestEnvironment.cs b/src/UglyToad.PdfPig.Tests/TestEnvironment.cs index abc44914..17545a27 100644 --- a/src/UglyToad.PdfPig.Tests/TestEnvironment.cs +++ b/src/UglyToad.PdfPig.Tests/TestEnvironment.cs @@ -4,6 +4,7 @@ public static class TestEnvironment { - public static readonly bool IsUnixPlatform = Environment.NewLine.Length == 1; + public static bool IsSingleByteNewLine(string s) => s.IndexOf('\r') < 0; + } } diff --git a/src/UglyToad.PdfPig/Content/Catalog.cs b/src/UglyToad.PdfPig/Content/Catalog.cs index 61906602..9bbdd71f 100644 --- a/src/UglyToad.PdfPig/Content/Catalog.cs +++ b/src/UglyToad.PdfPig/Content/Catalog.cs @@ -29,7 +29,12 @@ /// <summary> /// The page tree for this document containing all pages, page numbers and their dictionaries. /// </summary> - public PageTreeNode PageTree { get; } + public PageTreeNode PageTree { get; } + + /// <summary> + /// Number of discovered pages. + /// </summary> + public int? NumberOfDiscoveredPages => pagesByNumber?.Count; /// <summary> /// Create a new <see cref="Catalog"/>. diff --git a/src/UglyToad.PdfPig/Content/Pages.cs b/src/UglyToad.PdfPig/Content/Pages.cs index 93f60603..3bd55c09 100644 --- a/src/UglyToad.PdfPig/Content/Pages.cs +++ b/src/UglyToad.PdfPig/Content/Pages.cs @@ -21,6 +21,13 @@ this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner)); Count = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count); + var countOfPagesByPagesTree = catalog.PageTree.Children.Count; + var numberOfDiscoveredPages = catalog.NumberOfDiscoveredPages; + if (numberOfDiscoveredPages.HasValue && Count != numberOfDiscoveredPages) + { + //log.Warning($"Dictionary Page Count {Count} different to discovered pages {numberOfDiscoveredPages}. 
Using {numberOfDiscoveredPages}."); + Count = numberOfDiscoveredPages.Value; + } } public Page GetPage(int pageNumber, InternalParsingOptions parsingOptions) diff --git a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs index da2bb2cc..26095d8a 100644 --- a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs +++ b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs @@ -81,11 +81,13 @@ pageNumber.Increment(); return new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray<PageTreeNode>.Instance); - } - - - - //If we got here, we have to iterate till we manage to exit + } + + + + //If we got here, we have to iterate till we manage to exit + + HashSet<int> visitedTokens = new HashSet<int>(); // As we visit each token add to this set (the hashcode of the indirect reference) var toProcess = new Queue<(PageTreeNode thisPage, IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference, @@ -102,8 +104,16 @@ do { - var current = toProcess.Dequeue(); - + var current = toProcess.Dequeue(); + var currentReferenceHash = current.reference.GetHashCode(); + if (visitedTokens.Contains(currentReferenceHash)) + { + continue; // don't revisit token already processed. break infinite loop. 
Issue #512 + } + else + { + visitedTokens.Add(currentReferenceHash); + } if (!current.nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids)) { if (!isLenientParsing) diff --git a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs index 2dda4ecc..ae91268c 100644 --- a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs +++ b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs @@ -176,7 +176,7 @@ const string searchTerm = "%%EOF"; - var minimumEndOffset = bytes.Length - searchTerm.Length; + var minimumEndOffset = bytes.Length - searchTerm.Length + 1; // Issue #512 - Unable to open PDF - BruteForceScan starts from earlier of two EOF markers due to min end offset off by 1 bytes.Seek(minimumEndOffset);