diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
new file mode 100644
index 00000000..3b6ef81a
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
@@ -0,0 +1,267 @@
+using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+using UglyToad.PdfPig.Util;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+ /// <summary>
+ /// Algorithm that retrieves the blocks that are labelled as decoration (e.g. headers, footers) for each page in the document, using a content and a geometric similarity measure.
+ /// Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
+ /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
+ /// left or right edge of the page.
+ /// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para>
+ /// </summary>
+ public static class DecorationTextBlockClassifier
+ {
+ private static readonly System.Text.RegularExpressions.Regex numbersPattern = new System.Text.RegularExpressions.Regex(@"\d"); // Matches one digit; every match is replaced with replacementChar before block texts are compared. TODO: add roman numbers pattern
+ private const string replacementChar = "@"; // Placeholder substituted for each digit matched by numbersPattern.
+
+ /// <summary>
+ /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
+ /// Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
+ /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the left or right edge of the page.
+ /// </summary>
+ /// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
+ /// <param name="wordExtractor">Word extractor handed to <c>GetWords</c> to get the words of each page.</param>
+ /// <param name="pageSegmenter">Page segmenter whose <c>GetBlocks</c> turns the words of each page into blocks.</param>
+ /// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
+ /// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
+ /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
+ /// A positive value limits the number of concurrent operations to the set value. If it is -1, there is no limit on the number of concurrently running operations.</param>
+ /// <returns>The result of the overload taking the blocks of every page, called with the blocks computed here (in page order).</returns>
+ /// <exception cref="ArgumentException"><paramref name="pages"/> is null (<see cref="ArgumentNullException"/>) or holds less than 2 pages.</exception>
+ public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages,
+     IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+     double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
+ {
+     if (pages == null)
+     {
+         throw new ArgumentNullException(nameof(pages));
+     }
+
+     if (pages.Count < 2)
+     {
+         throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pages));
+     }
+
+     // One slot per page: every iteration below writes only its own index, so the blocks
+     // stay in page order without any concurrent collection or re-sorting afterwards.
+     var pagesBlocks = new IReadOnlyList<TextBlock>[pages.Count];
+
+     ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
+
+     Parallel.For(0, pages.Count, parallelOptions, p =>
+     {
+         var words = pages[p].GetWords(wordExtractor);
+         pagesBlocks[p] = pageSegmenter.GetBlocks(words);
+     });
+
+     return Get(pagesBlocks, similarityThreshold, n, maxDegreeOfParallelism);
+ }
+
+ /// <summary>
+ /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
+ /// Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
+ /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
+ /// left or right edge of the page.
+ /// </summary>
+ /// <param name="pagesTextBlocks">The <see cref="TextBlock"/>s of every page in the document. All of them are needed for the algorithm to work.</param>
+ /// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
+ /// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
+ /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
+ /// <para>A positive property value limits the number of concurrent operations to the set value.
+ /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
+ /// <returns>For each page, in page order, the blocks of that page labelled as decoration (an empty list when there is none).</returns>
+ /// <exception cref="ArgumentNullException"><paramref name="pagesTextBlocks"/> is null.</exception>
+ /// <exception cref="ArgumentException">The document has less than 2 pages.</exception>
+ public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<IReadOnlyList<TextBlock>> pagesTextBlocks,
+     double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
+ {
+     if (pagesTextBlocks == null)
+     {
+         throw new ArgumentNullException(nameof(pagesTextBlocks));
+     }
+
+     if (pagesTextBlocks.Count < 2)
+     {
+         throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pagesTextBlocks));
+     }
+
+     int pagesCount = pagesTextBlocks.Count;
+
+     // One slot per page: every iteration below writes only its own index, so the results
+     // stay in page order without any concurrent collection or re-sorting afterwards.
+     var pageDecorations = new List<TextBlock>[pagesCount];
+
+     ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
+
+     Parallel.For(0, pagesCount, parallelOptions, p =>
+     {
+         // Neighbouring pages the current page is compared with.
+         var previousPage = pagesTextBlocks[GetPreviousPageNumber(p, pagesCount)];
+         var currentPage = pagesTextBlocks[p];
+         var nextPage = pagesTextBlocks[GetNextPageNumber(p, pagesCount)];
+
+         var decorations = new List<TextBlock>();
+
+         // For each page, we sort all blocks on the page in four different orders (see sortOrders)
+         // and, for each order, score the first n blocks of the current page.
+         // Every sort is applied to the output of the previous one: as OrderBy is a stable sort,
+         // this determines the relative order of blocks having identical sort keys.
+         foreach (var sort in sortOrders)
+         {
+             previousPage = sort(previousPage).ToList();
+             currentPage = sort(currentPage).ToList();
+             nextPage = sort(nextPage).ToList();
+
+             AddDecorations(currentPage, previousPage, nextPage, similarityThreshold, n, decorations);
+         }
+
+         pageDecorations[p] = decorations;
+     });
+
+     return pageDecorations;
+ }
+
+ /// <summary>
+ /// The four orders in which the blocks of a page are sorted before its first blocks are looked at.
+ /// The sort keys are the edges of the blocks' bounding boxes.
+ /// </summary>
+ private static readonly Func<IEnumerable<TextBlock>, IOrderedEnumerable<TextBlock>>[] sortOrders =
+ {
+     // From top to bottom (based on the minimum y coordinate)
+     blocks => blocks.OrderByDescending(b => b.BoundingBox.Bottom).ThenBy(b => b.BoundingBox.Left),
+
+     // From bottom to top (maximum y coordinate)
+     blocks => blocks.OrderBy(b => b.BoundingBox.Top).ThenBy(b => b.BoundingBox.Left),
+
+     // From left to right (minimum x coordinate)
+     blocks => blocks.OrderBy(b => b.BoundingBox.Left).ThenBy(b => b.BoundingBox.Top),
+
+     // From right to left (maximum x coordinate)
+     blocks => blocks.OrderByDescending(b => b.BoundingBox.Right).ThenBy(b => b.BoundingBox.Top)
+ };
+
+ /// <summary>
+ /// Adds to <paramref name="decorations"/> the blocks, among the first <paramref name="n"/> blocks of
+ /// <paramref name="currentPage"/>, whose score against the previous and next pages reaches the threshold.
+ /// </summary>
+ /// <param name="currentPage">Sorted blocks of the current page.</param>
+ /// <param name="previousPage">Blocks of the previous page, sorted in the same order.</param>
+ /// <param name="nextPage">Blocks of the next page, sorted in the same order.</param>
+ /// <param name="similarityThreshold">Minimum score for a block to be labelled as decoration.</param>
+ /// <param name="n">Number of blocks in a page to be considered.</param>
+ /// <param name="decorations">Decoration blocks found so far for the current page; a block is added at most once.</param>
+ private static void AddDecorations(IReadOnlyList<TextBlock> currentPage, IReadOnlyList<TextBlock> previousPage,
+     IReadOnlyList<TextBlock> nextPage, double similarityThreshold, int n, List<TextBlock> decorations)
+ {
+     // The current page may hold fewer than n blocks.
+     int nCurrent = Math.Min(n, currentPage.Count);
+
+     for (int i = 0; i < nCurrent; i++)
+     {
+         var current = currentPage[i];
+         var score = Score(current, previousPage, nextPage, similarityThreshold, n);
+         if (score >= similarityThreshold && !decorations.Contains(current))
+         {
+             decorations.Add(current);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Content similarity of two blocks: one minus the normalised minimum edit distance between their
+ /// texts, computed after every digit has been replaced with the “@” char.
+ /// A content similarity of 1 is reached when both resulting strings are exactly equal.
+ /// </summary>
+ private static double ContentSimilarity(TextBlock b1, TextBlock b2)
+ {
+     // Mask the digits first, so that two texts differing only in which digits they contain compare as equal.
+     string maskedText1 = numbersPattern.Replace(b1.Text, replacementChar);
+     string maskedText2 = numbersPattern.Replace(b2.Text, replacementChar);
+
+     return 1.0 - Distances.MinimumEditDistanceNormalised(maskedText1, maskedText2);
+ }
+
+ /// <summary>
+ /// Geometric similarity of two blocks: the area of the intersection of their bounding boxes divided by the area of the larger of the two bounding boxes, or 0 when the bounding boxes have no intersection.
+ /// </summary>
+ private static double GeomSimilarity(TextBlock b1, TextBlock b2)
+ {
+     var overlap = b1.BoundingBox.Intersect(b2.BoundingBox);
+     if (!overlap.HasValue)
+     {
+         return 0;
+     }
+
+     var largestArea = Math.Max(b1.BoundingBox.Area, b2.BoundingBox.Area);
+     return overlap.Value.Area / largestArea;
+ }
+
+ /// <summary>Similarity score of two blocks, given by the product between their content similarity and their geometric similarity.</summary>
+ /// <returns>The value of <c>ContentSimilarity</c> multiplied by the value of <c>GeomSimilarity</c> for the same two blocks.</returns>
+ private static double Similarity(TextBlock b1, TextBlock b2)
+ {
+     double contentScore = ContentSimilarity(b1, b2);
+     double geometricScore = GeomSimilarity(b1, b2);
+     return contentScore * geometricScore;
+ }
+
+ /// <summary>Score of a block against one block of the previous page and one block of the next page: the mean of its similarity with each of them.</summary>
+ /// <returns>Half of the sum of the two <c>Similarity</c> values.</returns>
+ private static double ScoreI(TextBlock current, TextBlock previous, TextBlock next)
+     => (Similarity(current, next) + Similarity(current, previous)) * 0.5;
+
+ /// <summary>Highest <c>ScoreI</c> of <paramref name="current"/> over the pairs made of the i-th block of <paramref name="previous"/> and the i-th block of <paramref name="next"/>, for the first <paramref name="n"/> positions that both pages have.</summary>
+ /// <returns>The highest score found, never less than 0. The search stops at the first position where it reaches <paramref name="threshold"/>.</returns>
+ private static double Score(TextBlock current, IReadOnlyList<TextBlock> previous, IReadOnlyList<TextBlock> next, double threshold, int n)
+ {
+     int positions = Math.Min(n, Math.Min(previous.Count, next.Count));
+     double best = 0;
+     for (int i = 0; i < positions; i++)
+     {
+         double candidate = ScoreI(current, previous[i], next[i]);
+         if (candidate > best) best = candidate;
+         if (best >= threshold) break; // good enough: the remaining positions are not looked at
+     }
+     return best;
+ }
+
+ /// <summary>
+ /// Page whose blocks are compared with the current page as its previous page: one page back, or two pages back if the document has more than three pages (to account for cases with a two-sided layout). Going below page 0 restarts from the last page.
+ /// </summary>
+ /// <param name="currentPage">Current page number.</param>
+ /// <param name="pagesCount">Total number of pages in the document.</param>
+ private static int GetPreviousPageNumber(int currentPage, int pagesCount)
+ {
+     int steps = pagesCount > 3 ? 2 : 1;
+     int page = currentPage;
+     for (int step = 0; step < steps; step++)
+     {
+         page = (page - 1 < 0) ? pagesCount - 1 : page - 1;
+     }
+     return page;
+ }
+
+ /// <summary>
+ /// Page whose blocks are compared with the current page as its next page: one page ahead, or two pages ahead if the document has more than three pages (to account for cases with a two-sided layout). Going past the last page restarts from page 0.
+ /// </summary>
+ /// <param name="currentPage">Current page number.</param>
+ /// <param name="pagesCount">Total number of pages in the document.</param>
+ private static int GetNextPageNumber(int currentPage, int pagesCount)
+ {
+     int steps = pagesCount > 3 ? 2 : 1;
+     int page = currentPage;
+     for (int step = 0; step < steps; step++)
+     {
+         page = (page + 1 >= pagesCount) ? 0 : page + 1;
+     }
+     return page;
+ }
+ }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
index b47e1f21..298609b1 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
@@ -80,6 +80,50 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
return Math.Abs(point2.X - point1.X);
}
+ /// <summary>
+ /// Get the minimum edit distance between two strings: the minimum number of single-character insertions, deletions and substitutions (each with a cost of 1) needed to turn one string into the other.
+ /// </summary>
+ /// <param name="string1">The first string.</param>
+ /// <param name="string2">The second string.</param>
+ /// <returns>The minimum edit distance; 0 when both strings are equal.</returns>
+ /// <exception cref="ArgumentNullException">One of the two strings is null.</exception>
+ public static int MinimumEditDistance(string string1, string string2)
+ {
+     if (string1 == null) throw new ArgumentNullException(nameof(string1));
+     if (string2 == null) throw new ArgumentNullException(nameof(string2));
+
+     // Each row of the distance matrix only depends on the row above it, so two rows are
+     // kept instead of the full matrix: memory grows with string2.Length only.
+     int[] previous = new int[string2.Length + 1];
+     int[] current = new int[string2.Length + 1];
+     for (int j = 0; j <= string2.Length; j++) previous[j] = j; // row of the empty prefix of string1
+
+     for (int i = 1; i <= string1.Length; i++)
+     {
+         current[0] = i; // column of the empty prefix of string2
+         for (int j = 1; j <= string2.Length; j++)
+         {
+             int substitution = previous[j - 1] + (string1[i - 1] == string2[j - 1] ? 0 : 1); // substitution, set cost to 1
+             current[j] = Math.Min(Math.Min(previous[j] + 1, current[j - 1] + 1), substitution);
+         }
+
+         int[] swap = previous; previous = current; current = swap; // the row just filled becomes the row above
+     }
+
+     return previous[string2.Length];
+ }
+
+ /// <summary>
+ /// Get the minimum edit distance between two strings, divided by the length of the longest of the two strings. Returned values are between 0 and 1 included. A value of 0 means that the two strings are identical (two empty strings included).
+ /// </summary>
+ /// <param name="string1">The first string.</param>
+ /// <param name="string2">The second string.</param>
+ public static double MinimumEditDistanceNormalised(string string1, string string2)
+ {
+     int maxLength = Math.Max(string1.Length, string2.Length); // 0 only when both strings are empty
+     return maxLength == 0 ? 0.0 : MinimumEditDistance(string1, string2) / (double)maxLength; // avoids 0 / 0 (NaN) for two empty strings
+ }
+
///
/// Find the index of the nearest point, excluding itself.
///