diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
new file mode 100644
index 00000000..3b6ef81a
--- /dev/null
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
@@ -0,0 +1,279 @@
+using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+using UglyToad.PdfPig.Content;
+using UglyToad.PdfPig.Geometry;
+using UglyToad.PdfPig.Util;
+
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    /// <summary>
+    /// Algorithm that retrieves the blocks labelled as decoration (e.g. headers, footers) for each page in the document,
+    /// using a content and a geometric similarity measure.
+    /// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
+    /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
+    /// left or right edge of the page.</para>
+    /// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para>
+    /// </summary>
+    public static class DecorationTextBlockClassifier
+    {
+        // TODO: add roman numbers pattern
+        private static readonly System.Text.RegularExpressions.Regex NumbersPattern = new System.Text.RegularExpressions.Regex(@"\d");
+
+        // Every digit is replaced by this placeholder before comparing texts, so that e.g. "Page 3" and "Page 4" compare as equal.
+        private const string ReplacementChar = "@";
+
+        /// <summary>
+        /// The four orderings in which the blocks of a page are examined, applied in this order:
+        /// descending bottom edge, ascending top edge, ascending left edge and descending right edge.
+        /// </summary>
+        private static readonly Func<IReadOnlyList<TextBlock>, IReadOnlyList<TextBlock>>[] Orderings =
+        {
+            blocks => blocks.OrderByDescending(b => b.BoundingBox.Bottom).ThenBy(b => b.BoundingBox.Left).ToList(),
+            blocks => blocks.OrderBy(b => b.BoundingBox.Top).ThenBy(b => b.BoundingBox.Left).ToList(),
+            blocks => blocks.OrderBy(b => b.BoundingBox.Left).ThenBy(b => b.BoundingBox.Top).ToList(),
+            blocks => blocks.OrderByDescending(b => b.BoundingBox.Right).ThenBy(b => b.BoundingBox.Top).ToList()
+        };
+
+        /// <summary>
+        /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
+        /// <para>The words of every page are obtained with <paramref name="wordExtractor"/> and grouped into blocks with
+        /// <paramref name="pageSegmenter"/>; the resulting blocks are then classified.</para>
+        /// </summary>
+        /// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
+        /// <param name="wordExtractor">Used to get the words of each page.</param>
+        /// <param name="pageSegmenter">Used to get the blocks of each page from its words.</param>
+        /// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
+        /// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
+        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
+        /// <para>A positive property value limits the number of concurrent operations to the set value.
+        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
+        /// <returns>For each page, in the order of <paramref name="pages"/>, the blocks labelled as decoration.</returns>
+        /// <exception cref="ArgumentNullException">If pages, wordExtractor or pageSegmenter is null.</exception>
+        /// <exception cref="ArgumentException">If the document has less than 2 pages.</exception>
+        public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages,
+            IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+            double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
+        {
+            if (pages == null)
+            {
+                throw new ArgumentNullException(nameof(pages));
+            }
+
+            if (wordExtractor == null)
+            {
+                throw new ArgumentNullException(nameof(wordExtractor));
+            }
+
+            if (pageSegmenter == null)
+            {
+                throw new ArgumentNullException(nameof(pageSegmenter));
+            }
+
+            if (pages.Count < 2)
+            {
+                throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pages));
+            }
+
+            // Each iteration writes only the slot of its own page, so the array needs no synchronisation.
+            var pagesBlocks = new IReadOnlyList<TextBlock>[pages.Count];
+            var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };
+
+            Parallel.For(0, pages.Count, parallelOptions, p =>
+            {
+                var words = pages[p].GetWords(wordExtractor);
+                pagesBlocks[p] = pageSegmenter.GetBlocks(words);
+            });
+
+            return Get(pagesBlocks, similarityThreshold, n, maxDegreeOfParallelism);
+        }
+
+        /// <summary>
+        /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
+        /// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
+        /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
+        /// left or right edge of the page.</para>
+        /// </summary>
+        /// <param name="pagesTextBlocks">The <see cref="TextBlock"/>s of every page in the document. All of them are needed for the algorithm to work.</param>
+        /// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
+        /// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
+        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
+        /// <para>A positive property value limits the number of concurrent operations to the set value.
+        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
+        /// <returns>For each page, in the order of <paramref name="pagesTextBlocks"/>, the blocks labelled as decoration.</returns>
+        /// <exception cref="ArgumentNullException">If <paramref name="pagesTextBlocks"/> is null.</exception>
+        /// <exception cref="ArgumentException">If the document has less than 2 pages.</exception>
+        public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<IReadOnlyList<TextBlock>> pagesTextBlocks,
+            double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
+        {
+            if (pagesTextBlocks == null)
+            {
+                throw new ArgumentNullException(nameof(pagesTextBlocks));
+            }
+
+            if (pagesTextBlocks.Count < 2)
+            {
+                throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pagesTextBlocks));
+            }
+
+            // Each iteration writes only the slot of its own page, so the array needs no synchronisation.
+            var pageDecorations = new IReadOnlyList<TextBlock>[pagesTextBlocks.Count];
+            var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };
+
+            Parallel.For(0, pagesTextBlocks.Count, parallelOptions, p =>
+            {
+                pageDecorations[p] = GetPageDecorations(
+                    pagesTextBlocks[p],
+                    pagesTextBlocks[GetPreviousPageNumber(p, pagesTextBlocks.Count)],
+                    pagesTextBlocks[GetNextPageNumber(p, pagesTextBlocks.Count)],
+                    similarityThreshold,
+                    n);
+            });
+
+            return pageDecorations;
+        }
+
+        /// <summary>
+        /// Get the decoration blocks of a single page by scoring its first n blocks, in each ordering, against the two reference pages.
+        /// </summary>
+        private static List<TextBlock> GetPageDecorations(IReadOnlyList<TextBlock> currentPage,
+            IReadOnlyList<TextBlock> previousPage, IReadOnlyList<TextBlock> nextPage,
+            double similarityThreshold, int n)
+        {
+            var decorations = new List<TextBlock>();
+            int nCurrent = Math.Min(n, currentPage.Count);
+
+            foreach (var order in Orderings)
+            {
+                // Each pass re-sorts the result of the preceding pass rather than the input lists:
+                // the sorts are stable, so blocks with equal keys keep the relative order of the preceding pass.
+                previousPage = order(previousPage);
+                currentPage = order(currentPage);
+                nextPage = order(nextPage);
+
+                for (int i = 0; i < nCurrent; i++)
+                {
+                    var current = currentPage[i];
+                    if (decorations.Contains(current))
+                    {
+                        continue;
+                    }
+
+                    if (Score(current, previousPage, nextPage, similarityThreshold, n) >= similarityThreshold)
+                    {
+                        decorations.Add(current);
+                    }
+                }
+            }
+
+            return decorations;
+        }
+
+        /// <summary>
+        /// The content similarity is calculated from the normalised edit
+        /// distance between the two content strings, where digits are replaced with "@" chars.
+        /// A content similarity of 1 is reached when both strings are exactly equal.
+        /// </summary>
+        private static double ContentSimilarity(TextBlock b1, TextBlock b2)
+        {
+            string text1 = NumbersPattern.Replace(b1.Text, ReplacementChar);
+            string text2 = NumbersPattern.Replace(b2.Text, ReplacementChar);
+
+            return 1.0 - Distances.MinimumEditDistanceNormalised(text1, text2);
+        }
+
+        /// <summary>
+        /// The geometric similarity is the area of the intersection between the two bounding box rectangles
+        /// divided by the larger of the two bounding box areas. It is 0 when the rectangles do not intersect.
+        /// </summary>
+        private static double GeomSimilarity(TextBlock b1, TextBlock b2)
+        {
+            var intersect = b1.BoundingBox.Intersect(b2.BoundingBox);
+            if (!intersect.HasValue)
+            {
+                return 0;
+            }
+
+            var largestArea = Math.Max(b1.BoundingBox.Area, b2.BoundingBox.Area);
+            if (largestArea == 0)
+            {
+                // Both bounding boxes have no area: report no overlap instead of dividing by zero.
+                return 0;
+            }
+
+            return intersect.Value.Area / largestArea;
+        }
+
+        /// <summary>
+        /// This similarity score is a value in the range [0,1] and given
+        /// by the product between the content and the geometric similarity.
+        /// </summary>
+        private static double Similarity(TextBlock b1, TextBlock b2)
+        {
+            return ContentSimilarity(b1, b2) * GeomSimilarity(b1, b2);
+        }
+
+        /// <summary>
+        /// Average of the similarity of the current block with a block of the next page and with a block of the previous page.
+        /// </summary>
+        private static double ScoreI(TextBlock current, TextBlock previous, TextBlock next)
+        {
+            return 0.5 * (Similarity(current, next) + Similarity(current, previous));
+        }
+
+        /// <summary>
+        /// Highest <see cref="ScoreI"/> of the current block against the i-th blocks of the previous and next pages,
+        /// looking at the first <paramref name="n"/> blocks at most. Returns as soon as the threshold is reached.
+        /// </summary>
+        private static double Score(TextBlock current, IReadOnlyList<TextBlock> previous, IReadOnlyList<TextBlock> next,
+            double threshold, int n)
+        {
+            n = Math.Min(n, Math.Min(previous.Count, next.Count));
+            double score = 0;
+
+            for (int i = 0; i < n; i++)
+            {
+                var s = ScoreI(current, previous[i], next[i]);
+                if (s > score)
+                {
+                    score = s;
+                }
+
+                if (score >= threshold)
+                {
+                    return score;
+                }
+            }
+
+            return score;
+        }
+
+        /// <summary>
+        /// If the document has more than three pages, we compare blocks on the next or previous page with an even or odd number,
+        /// depending on whether the current page number is even or odd, to account for cases with a two-sided layout.
+        /// The result wraps around to the end of the document when stepping back past the first page.
+        /// </summary>
+        /// <param name="currentPage">Zero-based index of the current page.</param>
+        /// <param name="pagesCount">Total number of pages in the document.</param>
+        private static int GetPreviousPageNumber(int currentPage, int pagesCount)
+        {
+            int step = pagesCount > 3 ? 2 : 1;
+            return (currentPage - step + pagesCount) % pagesCount;
+        }
+
+        /// <summary>
+        /// If the document has more than three pages, we compare blocks on the next or previous page with an even or odd number,
+        /// depending on whether the current page number is even or odd, to account for cases with a two-sided layout.
+        /// The result wraps around to the start of the document when stepping forward past the last page.
+        /// </summary>
+        /// <param name="currentPage">Zero-based index of the current page.</param>
+        /// <param name="pagesCount">Total number of pages in the document.</param>
+        private static int GetNextPageNumber(int currentPage, int pagesCount)
+        {
+            int step = pagesCount > 3 ? 2 : 1;
+            return (currentPage + step) % pagesCount;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
index b47e1f21..298609b1 100644
--- a/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
+++ b/src/UglyToad.PdfPig/DocumentLayoutAnalysis/Distances.cs
@@ -80,6 +80,84 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
             return Math.Abs(point2.X - point1.X);
         }
 
+        /// <summary>
+        /// Get the minimum edit distance between two strings: the minimum number of single-character
+        /// insertions, deletions and substitutions, each of cost 1, needed to turn one string into the other.
+        /// </summary>
+        /// <param name="string1">The first string.</param>
+        /// <param name="string2">The second string.</param>
+        /// <returns>The edit distance, between 0 and the length of the longest string.</returns>
+        /// <exception cref="ArgumentNullException">If either string is null.</exception>
+        public static int MinimumEditDistance(string string1, string string2)
+        {
+            if (string1 == null)
+            {
+                throw new ArgumentNullException(nameof(string1));
+            }
+
+            if (string2 == null)
+            {
+                throw new ArgumentNullException(nameof(string2));
+            }
+
+            // d[i, j] is the distance between the first i chars of string1 and the first j chars of string2.
+            int[,] d = new int[string1.Length + 1, string2.Length + 1];
+
+            for (int i = 1; i <= string1.Length; i++)
+            {
+                d[i, 0] = i; // i deletions to reach the empty string
+            }
+
+            for (int j = 1; j <= string2.Length; j++)
+            {
+                d[0, j] = j; // j insertions from the empty string
+            }
+
+            for (int j = 1; j <= string2.Length; j++)
+            {
+                for (int i = 1; i <= string1.Length; i++)
+                {
+                    int deletion = d[i - 1, j] + 1;
+                    int insertion = d[i, j - 1] + 1;
+                    int substitution = d[i - 1, j - 1] + (string1[i - 1] == string2[j - 1] ? 0 : 1);
+
+                    d[i, j] = Math.Min(Math.Min(deletion, insertion), substitution);
+                }
+            }
+
+            return d[string1.Length, string2.Length];
+        }
+
+        /// <summary>
+        /// Get the minimum edit distance between two strings, divided by the length of the longest string.
+        /// <para>Returned values are between 0 and 1 included. A value of 0 means that the two strings are identical.</para>
+        /// </summary>
+        /// <param name="string1">The first string.</param>
+        /// <param name="string2">The second string.</param>
+        /// <returns>The normalised edit distance; 0 when both strings are empty.</returns>
+        /// <exception cref="ArgumentNullException">If either string is null.</exception>
+        public static double MinimumEditDistanceNormalised(string string1, string string2)
+        {
+            if (string1 == null)
+            {
+                throw new ArgumentNullException(nameof(string1));
+            }
+
+            if (string2 == null)
+            {
+                throw new ArgumentNullException(nameof(string2));
+            }
+
+            int longest = Math.Max(string1.Length, string2.Length);
+            if (longest == 0)
+            {
+                // Two empty strings are identical; dividing here would give 0 / 0 = NaN.
+                return 0;
+            }
+
+            return MinimumEditDistance(string1, string2) / (double)longest;
+        }
+
         /// <summary>
         /// Find the index of the nearest point, excluding itself.
         /// </summary>