diff --git a/examples/AdvancedTextExtraction.cs b/examples/AdvancedTextExtraction.cs
index ff605fb5..036ed1aa 100644
--- a/examples/AdvancedTextExtraction.cs
+++ b/examples/AdvancedTextExtraction.cs
@@ -10,7 +10,8 @@
public static class AdvancedTextExtraction
{
public static void Run(string filePath)
- {
+ {
+#if YET_TO_BE_DONE
var sb = new StringBuilder();
using (var document = PdfDocument.Open(filePath))
@@ -86,6 +87,7 @@
}
Console.WriteLine(sb.ToString());
+#endif
}
}
}
diff --git a/examples/Program.cs b/examples/Program.cs
index 5297a657..108128d6 100644
--- a/examples/Program.cs
+++ b/examples/Program.cs
@@ -45,9 +45,14 @@
},
{7,
("Advance text extraction using layout analysis algorithms",
- () => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
- }
- };
+ () => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
+ },
+ {
+ 8,
+ ("Extract Words with newline detection (example with algorithm). Issue 512",
+ () => OpenDocumentAndExtractWords.Run(Path.Combine(filesDirectory, "OPEN.RABBIT.ENGLISH.LOP.pdf")))
+ }
+ };
var choices = string.Join(Environment.NewLine, examples.Select(x => $"{x.Key}: {x.Value.name}"));
diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/OPEN.RABBIT.ENGLISH.LOP.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/OPEN.RABBIT.ENGLISH.LOP.pdf
new file mode 100644
index 00000000..0ca6c9a6
Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/OPEN.RABBIT.ENGLISH.LOP.pdf differ
diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs
index 374b8396..6a50d0c6 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs
@@ -51,7 +51,7 @@
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Equal(1.2m, result.Version);
- Assert.Equal(TestEnvironment.IsUnixPlatform ? 7 : 9, result.OffsetInFile);
+ Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile);
}
[Fact]
@@ -66,38 +66,42 @@
[Fact]
public void HeaderPrecededByJunkNonLenientDoesNotThrow()
- {
- var scanner = StringBytesTestConverter.Scanner(@"one
- %PDF-1.2");
+ {
+ var input = @"one
+ %PDF-1.2";
+ var scanner = StringBytesTestConverter.Scanner(input);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Equal(1.2m, result.Version);
- Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
+ Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
}
[Fact]
public void HeaderPrecededByJunkLenientReads()
- {
- var scanner = StringBytesTestConverter.Scanner(@"one
- %PDF-1.7");
+ {
+ var input = @"one
+ %PDF-1.7";
+ var scanner = StringBytesTestConverter.Scanner(input);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
Assert.Equal(1.7m, result.Version);
- Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
+ Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
}
[Fact]
public void HeaderPrecededByJunkDoesNotThrow()
- {
- var scanner = StringBytesTestConverter.Scanner(@"one two
-three %PDF-1.6");
+ {
+ var s = @"one two
+three %PDF-1.6";
+
+ var scanner = StringBytesTestConverter.Scanner(s);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
Assert.Equal(1.6m, result.Version);
- Assert.Equal(TestEnvironment.IsUnixPlatform ? 14 : 15, result.OffsetInFile);
+ Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 14 : 15, result.OffsetInFile);
}
[Fact]
diff --git a/src/UglyToad.PdfPig.Tests/TestEnvironment.cs b/src/UglyToad.PdfPig.Tests/TestEnvironment.cs
index abc44914..17545a27 100644
--- a/src/UglyToad.PdfPig.Tests/TestEnvironment.cs
+++ b/src/UglyToad.PdfPig.Tests/TestEnvironment.cs
@@ -4,6 +4,7 @@
public static class TestEnvironment
{
- public static readonly bool IsUnixPlatform = Environment.NewLine.Length == 1;
+ public static bool IsSingleByteNewLine(string s) => s.IndexOf('\r') < 0;
+
}
}
diff --git a/src/UglyToad.PdfPig/Content/Catalog.cs b/src/UglyToad.PdfPig/Content/Catalog.cs
index 61906602..9bbdd71f 100644
--- a/src/UglyToad.PdfPig/Content/Catalog.cs
+++ b/src/UglyToad.PdfPig/Content/Catalog.cs
@@ -29,7 +29,12 @@
///
/// The page tree for this document containing all pages, page numbers and their dictionaries.
///
- public PageTreeNode PageTree { get; }
+ public PageTreeNode PageTree { get; }
+
+ /// <summary>
+ /// Number of discovered pages.
+ /// </summary>
+ public int? NumberOfDiscoveredPages => pagesByNumber?.Count;
///
/// Create a new .
diff --git a/src/UglyToad.PdfPig/Content/Pages.cs b/src/UglyToad.PdfPig/Content/Pages.cs
index 93f60603..3bd55c09 100644
--- a/src/UglyToad.PdfPig/Content/Pages.cs
+++ b/src/UglyToad.PdfPig/Content/Pages.cs
@@ -21,6 +21,13 @@
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
Count = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
+ // Issue #512: the /Count entry of the Pages dictionary can disagree with the number of pages actually discovered while walking the page tree.
+ var numberOfDiscoveredPages = catalog.NumberOfDiscoveredPages;
+ if (numberOfDiscoveredPages.HasValue && Count != numberOfDiscoveredPages)
+ {
+ //log.Warning($"Dictionary Page Count {Count} different to discovered pages {numberOfDiscoveredPages}. Using {numberOfDiscoveredPages}.");
+ Count = numberOfDiscoveredPages.Value;
+ }
}
public Page GetPage(int pageNumber, InternalParsingOptions parsingOptions)
diff --git a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs
index da2bb2cc..26095d8a 100644
--- a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs
@@ -81,11 +81,13 @@
pageNumber.Increment();
return new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray.Instance);
- }
-
-
-
- //If we got here, we have to iterate till we manage to exit
+ }
+
+
+
+ //If we got here, we have to iterate till we manage to exit
+
+ HashSet<IndirectReference> visitedReferences = new HashSet<IndirectReference>(); // references already processed; keyed on the reference itself (not its hash code, which can collide) to break cycles in the page tree. Issue #512
var toProcess =
new Queue<(PageTreeNode thisPage, IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference,
@@ -102,8 +104,16 @@
do
{
- var current = toProcess.Dequeue();
-
+ var current = toProcess.Dequeue();
+
+ if (visitedReferences.Contains(current.reference))
+ {
+ continue; // don't revisit a node already processed; breaks the infinite loop on a cyclic page tree. Issue #512
+ }
+ else
+ {
+ visitedReferences.Add(current.reference);
+ }
if (!current.nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
{
if (!isLenientParsing)
diff --git a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
index 2dda4ecc..ae91268c 100644
--- a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
+++ b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
@@ -176,7 +176,7 @@
const string searchTerm = "%%EOF";
- var minimumEndOffset = bytes.Length - searchTerm.Length;
+ var minimumEndOffset = bytes.Length - searchTerm.Length + 1; // Issue #512 - Unable to open PDF - BruteForceScan starts from earlier of two EOF marker due to min end offset off by 1
bytes.Seek(minimumEndOffset);