From 48ad309edee27488c547daa11e9b32f9fe933cc1 Mon Sep 17 00:00:00 2001 From: BobLd Date: Fri, 10 Apr 2020 14:38:59 +0100 Subject: [PATCH 1/4] Make DateFormatHelper public --- src/UglyToad.PdfPig/Util/DateFormatHelper.cs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig/Util/DateFormatHelper.cs b/src/UglyToad.PdfPig/Util/DateFormatHelper.cs index c0c089ec..9f4f01f2 100644 --- a/src/UglyToad.PdfPig/Util/DateFormatHelper.cs +++ b/src/UglyToad.PdfPig/Util/DateFormatHelper.cs @@ -2,8 +2,20 @@ { using System; - internal static class DateFormatHelper + /// + /// Helper class for dates. + /// + public static class DateFormatHelper { + /// + /// Try parsing a pdf formatted date string into a <see cref="DateTimeOffset"/>. + /// Date values used in a PDF shall conform to a standard date format, which closely + /// follows that of the international standard ASN.1, defined in ISO/IEC 8824. A date shall be a text string + /// of the form (D:YYYYMMDDHHmmSSOHH'mm). + /// + /// The pdf formatted date string, e.g. D:199812231952-08'00. + /// The parsed date. + /// True if parsed. 
public static bool TryParseDateTimeOffset(string s, out DateTimeOffset offset) { offset = DateTimeOffset.MinValue; From 416d980a5cb0547001f08cc7b3763b694f73a8cb Mon Sep 17 00:00:00 2001 From: BobLd Date: Fri, 10 Apr 2020 15:33:37 +0100 Subject: [PATCH 2/4] update PublicApiScannerTests --- src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 650063c1..586bc4bc 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -196,6 +196,7 @@ "UglyToad.PdfPig.Util.Adler32Checksum", "UglyToad.PdfPig.Util.IWordExtractor", "UglyToad.PdfPig.Util.DefaultWordExtractor", + "UglyToad.PdfPig.Util.DateFormatHelper", "UglyToad.PdfPig.Writer.PdfAStandard", "UglyToad.PdfPig.Writer.PdfDocumentBuilder", "UglyToad.PdfPig.Writer.PdfMerger", From 395c5a7fd902cb425bbce04657ccf9cbba429311 Mon Sep 17 00:00:00 2001 From: BobLd Date: Sun, 12 Apr 2020 19:49:30 +0100 Subject: [PATCH 3/4] Improve default RecursiveXYCut dominant font width and height functions --- .../MathExtensions.cs | 10 ++-- .../PageSegmenter/RecursiveXYCut.cs | 47 +++++++++++++------ 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs index 5da09106..de0dc77c 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs @@ -9,9 +9,10 @@ public static class MathExtensions { /// - /// Computes the mode of a sequence of float values. + /// Computes the mode of a sequence of values. /// - /// The array of floats. + /// The sequence of floats. + /// The mode of the sequence. Returns if the sequence has no mode or if it is not unique. 
public static float Mode(this IEnumerable array) { if (array == null || array.Count() == 0) return float.NaN; @@ -22,9 +23,10 @@ } /// - /// Computes the mode of a sequence of decimal values. + /// Computes the mode of a sequence of values. /// - /// The array of decimal. + /// The sequence of doubles. + /// The mode of the sequence. Returns if the sequence has no mode or if it is not unique. public static double Mode(this IEnumerable array) { if (array == null || array.Count() == 0) return double.NaN; diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs index 4ea35bcf..e393184e 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs @@ -39,7 +39,28 @@ /// The minimum width for a block. public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth) { - return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5, 3)); + return GetBlocks(pageWords, minimumWidth, + (letters) => + { + var widths = letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3))); + var mode = widths.Mode(); + if (double.IsNaN(mode) || mode == 0) + { + mode = widths.Average(); + } + return mode; + }, + (letters) => + { + var heights = letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3)); + var mode = heights.Mode(); + if (double.IsNaN(mode) || mode == 0) + { + mode = heights.Average(); + } + return mode * 1.5; + } + ); } /// @@ -63,8 +84,8 @@ /// The function that determines the dominant font width. /// The function that determines the dominant font height. 
public IReadOnlyList GetBlocks(IEnumerable pageWords, double minimumWidth, - Func, double> dominantFontWidthFunc, - Func, double> dominantFontHeightFunc) + Func, double> dominantFontWidthFunc, + Func, double> dominantFontHeightFunc) { if (pageWords.Count() == 0) return EmptyArray.Instance; @@ -89,13 +110,13 @@ } private XYNode VerticalCut(XYLeaf leaf, double minimumWidth, - Func, double> dominantFontWidthFunc, - Func, double> dominantFontHeightFunc, int level = 0) + Func, double> dominantFontWidthFunc, + Func, double> dominantFontHeightFunc, int level = 0) { // Order words left to right var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Left).ToArray(); - if (!words.Any()) + if (words.Length == 0) { return new XYNode(null); } @@ -112,8 +133,7 @@ } // Determine dominant font width - double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters) - .Select(x => x.GlyphRectangle.Normalise().Width)); + double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)); List projectionProfile = new List(); @@ -197,13 +217,13 @@ } private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth, - Func, double> dominantFontWidthFunc, - Func, double> dominantFontHeightFunc, int level = 0) + Func, double> dominantFontWidthFunc, + Func, double> dominantFontHeightFunc, int level = 0) { // Order words bottom to top var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray(); - if (!words.Any()) + if (words.Length == 0) { return new XYNode(null); } @@ -219,14 +239,13 @@ } // Determine dominant font height - double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters) - .Select(x => x.GlyphRectangle.Normalise().Height)); + double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters)); List projectionProfile = new List(); var firstWordBound = words[0].BoundingBox.Normalise(); 
Projection currentProjection = new Projection(firstWordBound.Bottom, firstWordBound.Top); - int wordsCount = words.Count(); + int wordsCount = words.Length; for (int i = 1; i < wordsCount; i++) { From 0a6ec3946b12f7b22633dfa46cb22c629cc18baf Mon Sep 17 00:00:00 2001 From: BobLd Date: Mon, 20 Apr 2020 13:09:35 +0100 Subject: [PATCH 4/4] NearestNeighbourWordExtractor: - Improve results by using PointSize - Make 'filterFunction' public for ad hoc GetWords() - Allow text in different direction Make Letter.PointSize public and add warning (needed for NNWordExtractor) Remove Page.GetPointSize(Letter letter) --- .../ContentOrderTextExtractor.cs | 4 +- .../NearestNeighbourWordExtractor.cs | 60 ++++++++++--------- src/UglyToad.PdfPig/Content/Letter.cs | 5 +- src/UglyToad.PdfPig/Content/Page.cs | 9 --- 4 files changed, 36 insertions(+), 42 deletions(-) diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs index 6371e143..0085307b 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs @@ -107,8 +107,8 @@ return false; } - var ptSizePrevious = (int)Math.Round(page.ExperimentalAccess.GetPointSize(previous)); - var ptSize = (int)Math.Round(page.ExperimentalAccess.GetPointSize(letter)); + var ptSizePrevious = (int)Math.Round(previous.PointSize); + var ptSize = (int)Math.Round(letter.PointSize); var minPtSize = ptSize < ptSizePrevious ? 
ptSize : ptSizePrevious; var gap = Math.Abs(previous.StartBaseLine.Y - letter.StartBaseLine.Y); diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs index 05989bfe..be3f87c6 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs @@ -8,7 +8,7 @@ using Util; /// - /// Nearest Neighbour Word Extractor, using the distance. + /// Nearest Neighbour Word Extractor. /// This implementation leverages bounding boxes. /// public class NearestNeighbourWordExtractor : IWordExtractor @@ -26,55 +26,61 @@ public int MaxDegreeOfParallelism { get; set; } = -1; /// - /// Gets the words. + /// Gets the words, using the distance. /// /// The letters in the page. public IEnumerable GetWords(IReadOnlyList letters) { - double baseMaxFunc(Letter l1, Letter l2) + double maxDistFunc(Letter l1, Letter l2) { - return Math.Max(Math.Max(Math.Max( + return Math.Max(Math.Max(Math.Max(Math.Max(Math.Max( Math.Abs(l1.GlyphRectangle.Width), Math.Abs(l2.GlyphRectangle.Width)), Math.Abs(l1.Width)), - Math.Abs(l2.Width)); + Math.Abs(l2.Width)), + l1.PointSize), l2.PointSize) * 0.2; + } + + bool filterFunc(Letter l1, Letter l2) + { + return !string.IsNullOrWhiteSpace(l2.Value); } List wordsH = GetWords( letters.Where(l => l.TextDirection == TextDirection.Horizontal).ToList(), - (l1, l2) => baseMaxFunc(l1, l2) * 0.2, - Distances.Manhattan, MaxDegreeOfParallelism) + (l1, l2) => maxDistFunc(l1, l2), + Distances.Manhattan, filterFunc, MaxDegreeOfParallelism) .OrderByDescending(x => x.BoundingBox.Bottom) .ThenBy(x => x.BoundingBox.Left).ToList(); var words270 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate270).ToList(), - (l1, l2) => baseMaxFunc(l1, l2) * 0.2, - Distances.Manhattan, MaxDegreeOfParallelism) 
+ (l1, l2) => maxDistFunc(l1, l2), + Distances.Manhattan, filterFunc, MaxDegreeOfParallelism) .OrderBy(x => x.BoundingBox.Right) .ThenByDescending(x => x.BoundingBox.Bottom); wordsH.AddRange(words270); var words180 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate180).ToList(), - (l1, l2) => baseMaxFunc(l1, l2) * 0.2, - Distances.Manhattan, MaxDegreeOfParallelism) + (l1, l2) => maxDistFunc(l1, l2), + Distances.Manhattan, filterFunc, MaxDegreeOfParallelism) .OrderBy(x => x.BoundingBox.Top) .ThenByDescending(x => x.BoundingBox.Right); wordsH.AddRange(words180); var words90 = GetWords( letters.Where(l => l.TextDirection == TextDirection.Rotate90).ToList(), - (l1, l2) => baseMaxFunc(l1, l2) * 0.2, - Distances.Manhattan, MaxDegreeOfParallelism) + (l1, l2) => maxDistFunc(l1, l2), + Distances.Manhattan, filterFunc, MaxDegreeOfParallelism) .OrderByDescending(x => x.BoundingBox.Left) .ThenBy(x => x.BoundingBox.Top); wordsH.AddRange(words90); var wordsU = GetWords( letters.Where(l => l.TextDirection == TextDirection.Other).ToList(), - (l1, l2) => baseMaxFunc(l1, l2) * 0.5, - Distances.Euclidean, MaxDegreeOfParallelism) + (l1, l2) => maxDistFunc(l1, l2) * 2.0, // allow twice the distance for oriented text + Distances.Euclidean, filterFunc, MaxDegreeOfParallelism) .OrderByDescending(x => x.BoundingBox.Bottom) .ThenBy(x => x.BoundingBox.Left); wordsH.AddRange(wordsU); @@ -83,34 +89,30 @@ } /// - /// Private method to get the words. + /// Gets the words. /// - /// The letters in the page, they must have - /// the same text directions. - /// The function that determines the maximum distance between two Letters, - /// e.g. Max(GlyphRectangle.Width) x 20%. - /// The distance measure between two start and end base line points, + /// The letters in the page. + /// The function that determines the maximum distance between two letters (start and end base line points), + /// e.g. Max(GlyphRectangle.Width) x 20%. 
+ /// If the distance between the two letters is greater, a new word will be created. + /// The distance measure between two letters (start and end base line points), /// e.g. the Manhattan distance. + /// Function used to filter out connection between letters, e.g. check if the letters have the same color. + /// If the function returns false, a new word will be created. /// Sets the maximum number of concurrent tasks enabled. /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. public List GetWords(IReadOnlyList pageLetters, Func maxDistanceFunction, Func distMeasure, - int maxDegreeOfParallelism) + Func filterFunction, int maxDegreeOfParallelism) { if (pageLetters == null || pageLetters.Count == 0) return new List(); - TextDirection textDirection = pageLetters[0].TextDirection; - - if (pageLetters.Any(x => textDirection != x.TextDirection)) - { - throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction."); - } var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(pageLetters, distMeasure, maxDistanceFunction, l => l.EndBaseLine, l => l.StartBaseLine, l => !string.IsNullOrWhiteSpace(l.Value), - (l1, l2) => !string.IsNullOrWhiteSpace(l2.Value), + filterFunction, maxDegreeOfParallelism).ToList(); List words = new List(); diff --git a/src/UglyToad.PdfPig/Content/Letter.cs b/src/UglyToad.PdfPig/Content/Letter.cs index 0213d978..a8f27d1b 100644 --- a/src/UglyToad.PdfPig/Content/Letter.cs +++ b/src/UglyToad.PdfPig/Content/Letter.cs @@ -61,9 +61,10 @@ public IColor Color { get; } /// - /// The size of the font in points. This is not ready for public consumption as the calculation is incorrect. + /// The size of the font in points. + /// This is considered experimental because the calculated value is incorrect for some documents at present. 
/// - internal double PointSize { get; } + public double PointSize { get; } /// /// Sequence number of the ShowText operation that printed this letter. diff --git a/src/UglyToad.PdfPig/Content/Page.cs b/src/UglyToad.PdfPig/Content/Page.cs index 0aaf17c5..faf297a9 100644 --- a/src/UglyToad.PdfPig/Content/Page.cs +++ b/src/UglyToad.PdfPig/Content/Page.cs @@ -191,15 +191,6 @@ { return annotationProvider.GetAnnotations(); } - - /// - /// Gets the calculated letter size in points. - /// This is considered experimental because the calculated value is incorrect for some documents at present. - /// - public double GetPointSize(Letter letter) - { - return letter.PointSize; - } } } }