mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
Merge pull request #517 from fnatzke/master
Fixes for Issue#512, 516 and 519
This commit is contained in:
@@ -10,7 +10,8 @@
|
||||
public static class AdvancedTextExtraction
|
||||
{
|
||||
public static void Run(string filePath)
|
||||
{
|
||||
{
|
||||
#if YET_TO_BE_DONE
|
||||
var sb = new StringBuilder();
|
||||
|
||||
using (var document = PdfDocument.Open(filePath))
|
||||
@@ -86,6 +87,7 @@
|
||||
}
|
||||
|
||||
Console.WriteLine(sb.ToString());
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,9 +45,14 @@
|
||||
},
|
||||
{7,
|
||||
("Advance text extraction using layout analysis algorithms",
|
||||
() => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
|
||||
}
|
||||
};
|
||||
() => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
|
||||
},
|
||||
{
|
||||
8,
|
||||
("Extract Words with newline detection (example with algorithm). Issue 512",
|
||||
() => OpenDocumentAndExtractWords.Run(Path.Combine(filesDirectory, "OPEN.RABBIT.ENGLISH.LOP.pdf")))
|
||||
}
|
||||
};
|
||||
|
||||
var choices = string.Join(Environment.NewLine, examples.Select(x => $"{x.Key}: {x.Value.name}"));
|
||||
|
||||
|
||||
Binary file not shown.
@@ -51,7 +51,7 @@
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(1.2m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsUnixPlatform ? 7 : 9, result.OffsetInFile);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -66,38 +66,42 @@
|
||||
|
||||
[Fact]
|
||||
public void HeaderPrecededByJunkNonLenientDoesNotThrow()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner(@"one
|
||||
%PDF-1.2");
|
||||
{
|
||||
var input = @"one
|
||||
%PDF-1.2";
|
||||
var scanner = StringBytesTestConverter.Scanner(input);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
|
||||
|
||||
Assert.Equal(1.2m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HeaderPrecededByJunkLenientReads()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner(@"one
|
||||
%PDF-1.7");
|
||||
{
|
||||
var input = @"one
|
||||
%PDF-1.7";
|
||||
var scanner = StringBytesTestConverter.Scanner(input);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||
|
||||
Assert.Equal(1.7m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void HeaderPrecededByJunkDoesNotThrow()
|
||||
{
|
||||
var scanner = StringBytesTestConverter.Scanner(@"one two
|
||||
three %PDF-1.6");
|
||||
{
|
||||
var s = @"one two
|
||||
three %PDF-1.6";
|
||||
|
||||
var scanner = StringBytesTestConverter.Scanner(s);
|
||||
|
||||
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
|
||||
|
||||
Assert.Equal(1.6m, result.Version);
|
||||
Assert.Equal(TestEnvironment.IsUnixPlatform ? 14 : 15, result.OffsetInFile);
|
||||
Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 14 : 15, result.OffsetInFile);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
public static class TestEnvironment
|
||||
{
|
||||
public static readonly bool IsUnixPlatform = Environment.NewLine.Length == 1;
|
||||
public static bool IsSingleByteNewLine(string s) => s.IndexOf('\r') < 0;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,7 +29,12 @@
|
||||
/// <summary>
|
||||
/// The page tree for this document containing all pages, page numbers and their dictionaries.
|
||||
/// </summary>
|
||||
public PageTreeNode PageTree { get; }
|
||||
public PageTreeNode PageTree { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Number of discovered pages.
|
||||
/// </summary>
|
||||
public int? NumberOfDiscoveredPages => pagesByNumber?.Count;
|
||||
|
||||
/// <summary>
|
||||
/// Create a new <see cref="CatalogDictionary"/>.
|
||||
|
||||
@@ -21,6 +21,13 @@
|
||||
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
|
||||
|
||||
Count = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
|
||||
var CountOfPagesByPagesTree = catalog.PageTree.Children.Count;
|
||||
var numberOfDiscoveredPages = catalog.NumberOfDiscoveredPages;
|
||||
if (numberOfDiscoveredPages is null == false && Count != numberOfDiscoveredPages)
|
||||
{
|
||||
//log.Warning($"Dictionary Page Count {Count} different to discovered pages {numberOfDiscoveredPages}. Using {numberOfDiscoveredPages}.");
|
||||
Count = numberOfDiscoveredPages.Value;
|
||||
}
|
||||
}
|
||||
|
||||
public Page GetPage(int pageNumber, InternalParsingOptions parsingOptions)
|
||||
|
||||
@@ -81,11 +81,13 @@
|
||||
pageNumber.Increment();
|
||||
|
||||
return new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray<PageTreeNode>.Instance);
|
||||
}
|
||||
|
||||
|
||||
|
||||
//If we got here, we have to iterate till we manage to exit
|
||||
}
|
||||
|
||||
|
||||
|
||||
//If we got here, we have to iterate till we manage to exit
|
||||
|
||||
HashSet<int> visitedTokens = new HashSet<int>(); // As we visit each token add to this list (the hashcode of the indirect reference)
|
||||
|
||||
var toProcess =
|
||||
new Queue<(PageTreeNode thisPage, IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference,
|
||||
@@ -102,8 +104,16 @@
|
||||
|
||||
do
|
||||
{
|
||||
var current = toProcess.Dequeue();
|
||||
|
||||
var current = toProcess.Dequeue();
|
||||
var currentReferenceHash = current.reference.GetHashCode();
|
||||
if (visitedTokens.Contains(currentReferenceHash))
|
||||
{
|
||||
continue; // don't revisit token already processed. break infinite loop. Issue #512
|
||||
}
|
||||
else
|
||||
{
|
||||
visitedTokens.Add(currentReferenceHash);
|
||||
}
|
||||
if (!current.nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
|
||||
{
|
||||
if (!isLenientParsing)
|
||||
|
||||
@@ -176,7 +176,7 @@
|
||||
|
||||
const string searchTerm = "%%EOF";
|
||||
|
||||
var minimumEndOffset = bytes.Length - searchTerm.Length;
|
||||
var minimumEndOffset = bytes.Length - searchTerm.Length + 1; // Issue #512 - Unable to open PDF - BruteForceScan starts from earlier of two EOF marker due to min end offset off by 1
|
||||
|
||||
bytes.Seek(minimumEndOffset);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user