Updated Document Layout Analysis (markdown)

BobLd
2020-06-20 16:34:34 +01:00
parent 8f4574d324
commit 71f4b6f40f

@@ -72,13 +72,16 @@ using (var document = PdfDocument.Open(@"document.pdf"))
{
// Ignore letters that are whitespace or belong to the 'punctuation' array.
// These letters will be put in a single word.
FilterPivot = letter => !string.IsNullOrWhiteSpace(letter.Value) && !punctuation.Contains(letter.Value),
FilterPivot = letter => !string.IsNullOrWhiteSpace(letter.Value) &&
!punctuation.Contains(letter.Value),
Filter = (pivot, candidate) =>
{
if (string.IsNullOrWhiteSpace(candidate.Value) || cannotEndWord.Contains(candidate.Value))
if (string.IsNullOrWhiteSpace(candidate.Value) ||
cannotEndWord.Contains(candidate.Value))
{
// Start a new word if the candidate neighbour is whitespace or belongs to the 'cannotEndWord' array
// Start a new word if the candidate neighbour is
// whitespace or belongs to the 'cannotEndWord' array
return false;
}
else if (cannotStartWord.Contains(pivot.Value))