From f67cce31b5ad34641f933175cc2cff96701e88a5 Mon Sep 17 00:00:00 2001 From: BobLd Date: Thu, 2 Jan 2020 00:34:32 +0000 Subject: [PATCH] Adding a 'minimumEditDistanceNormalised' parameter to allow for other edit distance implementations. --- .../DecorationTextBlockClassifier.cs | 66 +++++++++++++++---- .../DocumentLayoutAnalysis/Distances.cs | 14 ++-- 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs index 8c7f0703..134d90aa 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs @@ -39,6 +39,28 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis public static IReadOnlyList> Get(IReadOnlyList pages, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1) + { + return Get(pages, wordExtractor, pageSegmenter, Distances.MinimumEditDistanceNormalised, similarityThreshold, n, maxDegreeOfParallelism); + } + + /// + /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure. + /// Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc. + /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the + /// left or right edge of the page. + /// + /// The s in the document. All of them are needed for the algorithm to work. + /// + /// + /// Minimum edit distance normalised. A value of 0 means both strings are exactly equal. + /// Minimum similarity score to decide whether a block is labelled as decoration or not. + /// Number of blocks in a page to be considered when looking for decoration blocks. 
+ /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. + /// If it is -1, there is no limit on the number of concurrently running operations. + public static IReadOnlyList> Get(IReadOnlyList pages, + IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Func minimumEditDistanceNormalised, + double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1) { if (pages.Count < 2) { @@ -60,6 +82,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis }); return Get(pagesBlocks.OrderBy(x => x.Key).Select(x => x.Value).ToList(), + minimumEditDistanceNormalised, similarityThreshold, n, maxDegreeOfParallelism); @@ -79,6 +102,25 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// If it is -1, there is no limit on the number of concurrently running operations. public static IReadOnlyList> Get(IReadOnlyList> pagesTextBlocks, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1) + { + return Get(pagesTextBlocks, Distances.MinimumEditDistanceNormalised, similarityThreshold, n, maxDegreeOfParallelism); + } + + /// + /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure. + /// Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc. + /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the + /// left or right edge of the page. + /// + /// The s of every page in the document. All of them are needed for the algorithm to work. + /// Minimum edit distance normalised. A value of 0 means both strings are exactly equal. + /// Minimum similarity score to decide whether a block is labelled as decoration or not. + /// Number of blocks in a page to be considered when looking for decoration blocks. 
+ /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. + /// If it is -1, there is no limit on the number of concurrently running operations. + public static IReadOnlyList> Get(IReadOnlyList> pagesTextBlocks, + Func minimumEditDistanceNormalised, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1) { if (pagesTextBlocks.Count < 2) { @@ -119,7 +161,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis for (int i = 0; i < nCurrent; i++) { var current = currentPage[i]; - var score = Score(current, previousPage, nextPage, similarityThreshold, n); + var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n); if (score >= similarityThreshold) { if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current); @@ -134,7 +176,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis for (int i = 0; i < nCurrent; i++) { var current = currentPage[i]; - var score = Score(current, previousPage, nextPage, similarityThreshold, n); + var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n); if (score >= similarityThreshold) { if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current); @@ -149,7 +191,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis for (int i = 0; i < nCurrent; i++) { var current = currentPage[i]; - var score = Score(current, previousPage, nextPage, similarityThreshold, n); + var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n); if (score >= similarityThreshold) { if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current); @@ -164,7 +206,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis for (int i = 0; i < nCurrent; i++) { var current = currentPage[i]; - var score = Score(current, previousPage, nextPage, similarityThreshold, n); + var score = 
Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n); if (score >= similarityThreshold) { if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current); @@ -180,9 +222,9 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// distance between the two content strings, where digits are replaced with “@” chars. /// A content similarity of 1 is reached when both strings are exactly equal. /// - private static double ContentSimilarity(TextBlock b1, TextBlock b2) + private static double ContentSimilarity(TextBlock b1, TextBlock b2, Func minimumEditDistanceNormalised) { - double similarity = 1.0 - Distances.MinimumEditDistanceNormalised( + double similarity = 1.0 - minimumEditDistanceNormalised( numbersPattern.Replace(b1.Text, replacementChar), numbersPattern.Replace(b2.Text, replacementChar)); @@ -208,25 +250,25 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// This similarity score is a value in the range [0,1] and given /// by the product between the content and the geometric similarity. 
/// - private static double Similarity(TextBlock b1, TextBlock b2) + private static double Similarity(TextBlock b1, TextBlock b2, Func minimumEditDistanceNormalised) { - return ContentSimilarity(b1, b2) * GeomSimilarity(b1, b2); + return ContentSimilarity(b1, b2, minimumEditDistanceNormalised) * GeomSimilarity(b1, b2); } - private static double ScoreI(TextBlock current, TextBlock previous, TextBlock next) + private static double ScoreI(TextBlock current, TextBlock previous, TextBlock next, Func minimumEditDistanceNormalised) { - return 0.5 * (Similarity(current, next) + Similarity(current, previous)); + return 0.5 * (Similarity(current, next, minimumEditDistanceNormalised) + Similarity(current, previous, minimumEditDistanceNormalised)); } private static double Score(TextBlock current, IReadOnlyList previous, IReadOnlyList next, - double threshold, int n) + Func minimumEditDistanceNormalised, double threshold, int n) { n = Math.Min(n, Math.Min(previous.Count, next.Count)); double score = 0; for (int i = 0; i < n; i++) { - var s = ScoreI(current, previous[i], next[i]); + var s = ScoreI(current, previous[i], next[i], minimumEditDistanceNormalised); if (s > score) score = s; if (score >= threshold) return score; } diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs index 298609b1..ff5a05e0 100644 --- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs +++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs @@ -88,16 +88,16 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis /// public static int MinimumEditDistance(string string1, string string2) { - int[,] d = new int[string1.Length + 1, string2.Length + 1]; + ushort[,] d = new ushort[string1.Length + 1, string2.Length + 1]; for (int i = 1; i <= string1.Length; i++) { - d[i, 0] = i; + d[i, 0] = (ushort)i; } for (int j = 1; j <= string2.Length; j++) { - d[0, j] = j; + d[0, j] = (ushort)j; } for (int j = 1; j <= string2.Length; j++) 
@@ -105,9 +105,9 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis for (int i = 1; i <= string1.Length; i++) { d[i, j] = Math.Min(Math.Min( - d[i - 1, j] + 1, - d[i, j - 1] + 1), - d[i - 1, j - 1] + (string1[i - 1] == string2[j - 1] ? 0 : 1)); // substitution, set cost to 1 + (ushort)(d[i - 1, j] + 1), + (ushort)(d[i, j - 1] + 1)), + (ushort)(d[i - 1, j - 1] + (string1[i - 1] == string2[j - 1] ? 0 : 1))); // substitution, set cost to 1 } } return d[string1.Length, string2.Length]; @@ -123,7 +123,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { return MinimumEditDistance(string1, string2) / (double)Math.Max(string1.Length, string2.Length); } - + /// /// Find the index of the nearest point, excluding itself. ///