Fix for Issue #512: Unable to open PDF. In BruteForceSearcher::GetLastEndOfFileMarker(), minimumEndOffset was off by 1.

This commit is contained in:
Fred Natzke 2022-11-29 17:31:23 +10:00
parent 9c9c7c99ea
commit afe473e10e
4 changed files with 12 additions and 5 deletions

View File

@ -10,7 +10,8 @@
public static class AdvancedTextExtraction
{
public static void Run(string filePath)
{
{
#if YET_TO_BE_DONE
var sb = new StringBuilder();
using (var document = PdfDocument.Open(filePath))
@ -86,6 +87,7 @@
}
Console.WriteLine(sb.ToString());
#endif
}
}
}

View File

@ -45,9 +45,14 @@
},
{7,
("Advance text extraction using layout analysis algorithms",
() => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
}
};
() => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
},
{
8,
("Extract Words with newline detection (example with algorithm). Issue 512",
() => OpenDocumentAndExtractWords.Run(Path.Combine(filesDirectory, "OPEN.RABBIT.ENGLISH.LOP.pdf")))
}
};
var choices = string.Join(Environment.NewLine, examples.Select(x => $"{x.Key}: {x.Value.name}"));

View File

@ -176,7 +176,7 @@
const string searchTerm = "%%EOF";
var minimumEndOffset = bytes.Length - searchTerm.Length;
var minimumEndOffset = bytes.Length - searchTerm.Length + 1; // Issue #512 - Unable to open PDF - BruteForceScan starts from the earlier of two EOF markers due to the minimum end offset being off by 1
bytes.Seek(minimumEndOffset);