Merge pull request #517 from fnatzke/master

Fixes for issues #512, #516 and #519
This commit is contained in:
Eliot Jones
2022-12-09 09:39:55 -05:00
committed by GitHub
9 changed files with 61 additions and 27 deletions

View File

@@ -10,7 +10,8 @@
public static class AdvancedTextExtraction
{
public static void Run(string filePath)
{
{
#if YET_TO_BE_DONE
var sb = new StringBuilder();
using (var document = PdfDocument.Open(filePath))
@@ -86,6 +87,7 @@
}
Console.WriteLine(sb.ToString());
#endif
}
}
}

View File

@@ -45,9 +45,14 @@
},
{7,
("Advance text extraction using layout analysis algorithms",
() => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
}
};
() => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
},
{
8,
("Extract Words with newline detection (example with algorithm). Issue 512",
() => OpenDocumentAndExtractWords.Run(Path.Combine(filesDirectory, "OPEN.RABBIT.ENGLISH.LOP.pdf")))
}
};
var choices = string.Join(Environment.NewLine, examples.Select(x => $"{x.Key}: {x.Value.name}"));

View File

@@ -51,7 +51,7 @@
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Equal(1.2m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 7 : 9, result.OffsetInFile);
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile);
}
[Fact]
@@ -66,38 +66,42 @@
// Parses a header that is preceded by a line of junk text, passing false as the third Parse argument
// (presumably the lenient-parsing flag, judging by the test name - confirm against FileHeaderParser.Parse).
[Fact]
public void HeaderPrecededByJunkNonLenientDoesNotThrow()
{
var scanner = StringBytesTestConverter.Scanner(@"one
%PDF-1.2");
{
// The literal is held in a local so the expected offset below can be derived from the
// line break it actually contains.
var input = @"one
%PDF-1.2";
var scanner = StringBytesTestConverter.Scanner(input);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Equal(1.2m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
// A verbatim literal carries whatever line break the source file was saved with, so the
// expectation is 12 when the input has no '\r' and 13 otherwise, regardless of Environment.NewLine.
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
}
// Parses a header that is preceded by a line of junk text, passing true as the third Parse argument
// (presumably the lenient-parsing flag, judging by the test name - confirm against FileHeaderParser.Parse).
[Fact]
public void HeaderPrecededByJunkLenientReads()
{
var scanner = StringBytesTestConverter.Scanner(@"one
%PDF-1.7");
{
// The literal is held in a local so the expected offset below can be derived from the
// line break it actually contains.
var input = @"one
%PDF-1.7";
var scanner = StringBytesTestConverter.Scanner(input);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
Assert.Equal(1.7m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
// A verbatim literal carries whatever line break the source file was saved with, so the
// expectation is 12 when the input has no '\r' and 13 otherwise, regardless of Environment.NewLine.
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
}
// Parses a header that follows junk text on two lines, with the header sharing the second line with junk;
// true is passed as the third Parse argument (presumably the lenient-parsing flag - confirm against FileHeaderParser.Parse).
[Fact]
public void HeaderPrecededByJunkDoesNotThrow()
{
var scanner = StringBytesTestConverter.Scanner(@"one two
three %PDF-1.6");
{
// The literal is held in a local so the expected offset below can be derived from the
// line break it actually contains.
var s = @"one two
three %PDF-1.6";
var scanner = StringBytesTestConverter.Scanner(s);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
Assert.Equal(1.6m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 14 : 15, result.OffsetInFile);
// A verbatim literal carries whatever line break the source file was saved with, so the
// expectation is 14 when the input has no '\r' and 15 otherwise, regardless of Environment.NewLine.
Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 14 : 15, result.OffsetInFile);
}
[Fact]

View File

@@ -4,6 +4,7 @@
/// <summary>
/// Helpers the tests use to pick expectations that depend on line-break width.
/// </summary>
public static class TestEnvironment
{
    /// <summary>
    /// <see langword="true"/> when <see cref="Environment.NewLine"/> is exactly one character long.
    /// </summary>
    public static readonly bool IsUnixPlatform = Environment.NewLine.Length == 1;

    /// <summary>
    /// Reports whether <paramref name="s"/> is free of carriage returns, i.e. any line break
    /// it contains is not a CR LF pair.
    /// </summary>
    /// <param name="s">The text to inspect.</param>
    /// <returns><see langword="true"/> when <paramref name="s"/> contains no <c>'\r'</c> character.</returns>
    public static bool IsSingleByteNewLine(string s)
    {
        return s.IndexOf('\r') == -1;
    }
}
}

View File

@@ -29,7 +29,12 @@
/// <summary>
/// The page tree for this document containing all pages, page numbers and their dictionaries.
/// </summary>
public PageTreeNode PageTree { get; }
public PageTreeNode PageTree { get; }
/// <summary>
/// Number of discovered pages: the element count of <c>pagesByNumber</c>,
/// or <see langword="null"/> when <c>pagesByNumber</c> is itself <see langword="null"/>.
/// </summary>
public int? NumberOfDiscoveredPages => pagesByNumber?.Count;
/// <summary>
/// Create a new <see cref="CatalogDictionary"/>.

View File

@@ -21,6 +21,13 @@
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
Count = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
var CountOfPagesByPagesTree = catalog.PageTree.Children.Count;
var numberOfDiscoveredPages = catalog.NumberOfDiscoveredPages;
if (numberOfDiscoveredPages is null == false && Count != numberOfDiscoveredPages)
{
//log.Warning($"Dictionary Page Count {Count} different to discovered pages {numberOfDiscoveredPages}. Using {numberOfDiscoveredPages}.");
Count = numberOfDiscoveredPages.Value;
}
}
public Page GetPage(int pageNumber, InternalParsingOptions parsingOptions)

View File

@@ -81,11 +81,13 @@
pageNumber.Increment();
return new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray<PageTreeNode>.Instance);
}
//If we got here, we have to iterate till we manage to exit
}
//If we got here, we have to iterate till we manage to exit
HashSet<int> visitedTokens = new HashSet<int>(); // As we visit each token add to this list (the hashcode of the indirect reference)
var toProcess =
new Queue<(PageTreeNode thisPage, IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference,
@@ -102,8 +104,16 @@
do
{
var current = toProcess.Dequeue();
var current = toProcess.Dequeue();
var currentReferenceHash = current.reference.GetHashCode();
if (visitedTokens.Contains(currentReferenceHash))
{
continue; // don't revisit token already processed. break infinite loop. Issue #512
}
else
{
visitedTokens.Add(currentReferenceHash);
}
if (!current.nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
{
if (!isLenientParsing)

View File

@@ -176,7 +176,7 @@
const string searchTerm = "%%EOF";
var minimumEndOffset = bytes.Length - searchTerm.Length;
var minimumEndOffset = bytes.Length - searchTerm.Length + 1; // Issue #512 - Unable to open PDF - BruteForceScan starts from earlier of two EOF marker due to min end offset off by 1
bytes.Seek(minimumEndOffset);