Fix for issue #512: unable to open PDF. In BruteForceSearcher::GetLastEndOfFileMarker(), minimumEndOffset was off by one.

This commit is contained in:
Fred Natzke 2022-11-29 17:31:23 +10:00
parent 9c9c7c99ea
commit afe473e10e
4 changed files with 12 additions and 5 deletions

View File

@ -10,7 +10,8 @@
public static class AdvancedTextExtraction public static class AdvancedTextExtraction
{ {
public static void Run(string filePath) public static void Run(string filePath)
{ {
#if YET_TO_BE_DONE
var sb = new StringBuilder(); var sb = new StringBuilder();
using (var document = PdfDocument.Open(filePath)) using (var document = PdfDocument.Open(filePath))
@ -86,6 +87,7 @@
} }
Console.WriteLine(sb.ToString()); Console.WriteLine(sb.ToString());
#endif
} }
} }
} }

View File

@ -45,9 +45,14 @@
}, },
{7, {7,
("Advance text extraction using layout analysis algorithms", ("Advance text extraction using layout analysis algorithms",
() => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf"))) () => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
} },
}; {
8,
("Extract Words with newline detection (example with algorithm). Issue 512",
() => OpenDocumentAndExtractWords.Run(Path.Combine(filesDirectory, "OPEN.RABBIT.ENGLISH.LOP.pdf")))
}
};
var choices = string.Join(Environment.NewLine, examples.Select(x => $"{x.Key}: {x.Value.name}")); var choices = string.Join(Environment.NewLine, examples.Select(x => $"{x.Key}: {x.Value.name}"));

View File

@ -176,7 +176,7 @@
const string searchTerm = "%%EOF"; const string searchTerm = "%%EOF";
var minimumEndOffset = bytes.Length - searchTerm.Length; var minimumEndOffset = bytes.Length - searchTerm.Length + 1; // Issue #512 (unable to open PDF): the brute-force scan started from the earlier of two EOF markers because the minimum end offset was off by one.
bytes.Seek(minimumEndOffset); bytes.Seek(minimumEndOffset);