diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs index 4c03b29a..bd11c7b5 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs @@ -23,8 +23,8 @@ /// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft. /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. - /// Sets the maximum number of concurrent tasks enabled. - /// A positive property value limits the number of concurrent operations to the set value. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. public static IEnumerable> NearestNeighbours(IReadOnlyList elements, Func distMeasure, @@ -91,8 +91,8 @@ /// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft. /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. - /// Sets the maximum number of concurrent tasks enabled. - /// A positive property value limits the number of concurrent operations to the set value. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. public static IEnumerable> NearestNeighbours(IReadOnlyList elements, int k, Func distMeasure, @@ -159,8 +159,8 @@ /// The candidates' line to use for pairing. /// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space. /// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font. - /// Sets the maximum number of concurrent tasks enabled. - /// A positive property value limits the number of concurrent operations to the set value. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. public static IEnumerable> NearestNeighbours(IReadOnlyList elements, Func distMeasure, diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs index a44bea5a..ba7e9d68 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs @@ -14,19 +14,19 @@ /// /// Algorithm that retrieve blocks that are labelled as decoration (e.g. headers, footers) for each page in the document, using a content and a geometric similarity measure. /// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc. - /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the + /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the /// left or right edge of the page. /// See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern. /// public static class DecorationTextBlockClassifier { private static readonly Regex NumbersPattern = new Regex(@"(\d+)|(\b([MDCLXVI]+)\b)", RegexOptions.IgnoreCase); - private static string replacementChar = "@"; + private const string replacementChar = "@"; /// /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure. /// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc. - /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the + /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the /// left or right edge of the page. /// /// The s in the document. All of them are needed for the algorithm to work. @@ -34,8 +34,8 @@ /// /// Minimum similarity score to decide wether a block is labelled as decoration or not. /// Number of blocks in a page to be considered when looking for decoration blocks. - /// Sets the maximum number of concurrent tasks enabled. - /// A positive property value limits the number of concurrent operations to the set value. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. public static IReadOnlyList> Get(IReadOnlyList pages, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, @@ -47,7 +47,7 @@ /// /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure. /// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc. - /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the + /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the /// left or right edge of the page. /// /// The s in the document. All of them are needed for the algorithm to work. @@ -56,8 +56,8 @@ /// Minimum edit distance normalised. A value of 0 means both strings are exactly equal. /// Minimum similarity score to decide wether a block is labelled as decoration or not. /// Number of blocks in a page to be considered when looking for decoration blocks. - /// Sets the maximum number of concurrent tasks enabled. - /// A positive property value limits the number of concurrent operations to the set value. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. public static IReadOnlyList> Get(IReadOnlyList pages, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Func minimumEditDistanceNormalised, @@ -92,14 +92,14 @@ /// /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure. /// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc. - /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the + /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the /// left or right edge of the page. /// /// The s of every pages in the document. All of them are needed for the algorithm to work. /// Minimum similarity score to decide wether a block is labelled as decoration or not. /// Number of blocks in a page to be considered when looking for decoration blocks. - /// Sets the maximum number of concurrent tasks enabled. - /// A positive property value limits the number of concurrent operations to the set value. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. public static IReadOnlyList> Get(IReadOnlyList> pagesTextBlocks, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1) @@ -110,15 +110,15 @@ /// /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure. /// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc. - /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the + /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the /// left or right edge of the page. /// /// The s of every pages in the document. All of them are needed for the algorithm to work. /// Minimum edit distance normalised. A value of 0 means both strings are exactly equal. /// Minimum similarity score to decide wether a block is labelled as decoration or not. /// Number of blocks in a page to be considered when looking for decoration blocks. - /// Sets the maximum number of concurrent tasks enabled. - /// A positive property value limits the number of concurrent operations to the set value. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. public static IReadOnlyList> Get(IReadOnlyList> pagesTextBlocks, Func minimumEditDistanceNormalised, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1) @@ -219,7 +219,7 @@ } /// - /// [The content similarity] is calculated from the normalized edit + /// [The content similarity] is calculated from the normalized edit /// distance between the two content strings, where digits are replaced with “@” chars. /// A content similarity of 1 is reached when both strings are exactly equal. /// @@ -248,7 +248,7 @@ } /// - /// This similarity score is a value in the range [0,1] and given + /// This similarity score is a value in the range [0,1] and given /// by the product between the content and the geometric similarity. /// private static double Similarity(TextBlock b1, TextBlock b2, Func minimumEditDistanceNormalised) diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs index 3c149ed8..0b151f75 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs @@ -15,7 +15,7 @@ /// The mode of the sequence. Returns if the sequence has no mode or if it is not unique. public static float Mode(this IEnumerable array) { - if (array == null || !array.Any()) return float.NaN; + if (array?.Any() != true) return float.NaN; var sorted = array.GroupBy(v => v).Select(v => (v.Count(), v.Key)).OrderByDescending(g => g.Item1); var mode = sorted.First(); if (sorted.Count() > 1 && mode.Item1 == sorted.ElementAt(1).Item1) return float.NaN; @@ -29,7 +29,7 @@ /// The mode of the sequence. Returns if the sequence has no mode or if it is not unique. public static double Mode(this IEnumerable array) { - if (array == null || !array.Any()) return double.NaN; + if (array?.Any() != true) return double.NaN; var sorted = array.GroupBy(v => v).Select(v => (v.Count(), v.Key)).OrderByDescending(g => g.Item1); var mode = sorted.First(); if (sorted.Count() > 1 && mode.Item1 == sorted.ElementAt(1).Item1) return double.NaN; diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs index 25a2ba96..2754b888 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs @@ -15,13 +15,13 @@ /// public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector(); - private double T; + private readonly double T; /// /// Algorithm that retrieve the blocks' reading order using both (spatial) Allen’s interval relations and rendering order. /// /// The tolerance parameter T. If two coordinates are closer than T they are considered equal. - /// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the + /// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the /// same column might not be exactly aligned. public UnsupervisedReadingOrderDetector(double T = 5) { @@ -38,10 +38,10 @@ var graph = BuildGraph(textBlocks, T); - while (graph.Any()) + while (graph.Count > 0) { var maxCount = graph.Max(kvp => kvp.Value.Count); - var current = graph.Where(kvp => kvp.Value.Count == maxCount).FirstOrDefault(); + var current = graph.FirstOrDefault(kvp => kvp.Value.Count == maxCount); graph.Remove(current.Key); int index = current.Key; @@ -105,19 +105,14 @@ IntervalRelations xRelation = GetIntervalRelationX(a, b, T); IntervalRelations yRelation = GetIntervalRelationY(a, b, T); - if (xRelation == IntervalRelations.Precedes || - yRelation == IntervalRelations.Precedes || - xRelation == IntervalRelations.Meets || - yRelation == IntervalRelations.Meets || - xRelation == IntervalRelations.Overlaps || - yRelation == IntervalRelations.Overlaps) - { - return true; - } - - return false; + return xRelation == IntervalRelations.Precedes || + yRelation == IntervalRelations.Precedes || + xRelation == IntervalRelations.Meets || + yRelation == IntervalRelations.Meets || + xRelation == IntervalRelations.Overlaps || + yRelation == IntervalRelations.Overlaps; } - + /// /// Column-wise: text-blocks are read in columns, from top-to-bottom and from left-to-right. /// @@ -130,7 +125,7 @@ IntervalRelations xRelation = GetIntervalRelationX(a, b, T); IntervalRelations yRelation = GetIntervalRelationY(a, b, T); - if (xRelation == IntervalRelations.Precedes || + return xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets || (xRelation == IntervalRelations.Overlaps && (yRelation == IntervalRelations.Precedes || yRelation == IntervalRelations.Meets || @@ -146,12 +141,7 @@ xRelation == IntervalRelations.DuringI || xRelation == IntervalRelations.Finishes || xRelation == IntervalRelations.StartsI || - xRelation == IntervalRelations.OverlapsI))) - { - return true; - } - - return false; + xRelation == IntervalRelations.OverlapsI)); } /// @@ -160,40 +150,34 @@ /// /// /// The tolerance parameter T. - /// private bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T) { IntervalRelations xRelation = GetIntervalRelationX(a, b, T); IntervalRelations yRelation = GetIntervalRelationY(a, b, T); - if (yRelation == IntervalRelations.Precedes || - yRelation == IntervalRelations.Meets || - (yRelation == IntervalRelations.Overlaps && (xRelation == IntervalRelations.Precedes || - xRelation == IntervalRelations.Meets || - xRelation == IntervalRelations.Overlaps)) || - ((xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets || xRelation == IntervalRelations.Overlaps) && - (yRelation == IntervalRelations.Precedes || - yRelation == IntervalRelations.Meets || - yRelation == IntervalRelations.Overlaps || - yRelation == IntervalRelations.Starts || - yRelation == IntervalRelations.FinishesI || - yRelation == IntervalRelations.Equals || - yRelation == IntervalRelations.During || - yRelation == IntervalRelations.DuringI || - yRelation == IntervalRelations.Finishes || - yRelation == IntervalRelations.StartsI || - yRelation == IntervalRelations.OverlapsI))) - { - return true; - } - - return false; + return yRelation == IntervalRelations.Precedes || + yRelation == IntervalRelations.Meets || + (yRelation == IntervalRelations.Overlaps && (xRelation == IntervalRelations.Precedes || + xRelation == IntervalRelations.Meets || + xRelation == IntervalRelations.Overlaps)) || + ((xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets || xRelation == IntervalRelations.Overlaps) && + (yRelation == IntervalRelations.Precedes || + yRelation == IntervalRelations.Meets || + yRelation == IntervalRelations.Overlaps || + yRelation == IntervalRelations.Starts || + yRelation == IntervalRelations.FinishesI || + yRelation == IntervalRelations.Equals || + yRelation == IntervalRelations.During || + yRelation == IntervalRelations.DuringI || + yRelation == IntervalRelations.Finishes || + yRelation == IntervalRelations.StartsI || + yRelation == IntervalRelations.OverlapsI)); } /// /// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate. - /// The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page. - /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed + /// The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page. + /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete. /// /// @@ -201,85 +185,83 @@ /// The tolerance parameter T. If two coordinates are closer than T they are considered equal. private IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T) { - IntervalRelations xRelation = IntervalRelations.Unknown; - if (a.BoundingBox.Right < b.BoundingBox.Left - T) { - xRelation = IntervalRelations.Precedes; + return IntervalRelations.Precedes; } else if (a.BoundingBox.Right >= b.BoundingBox.Left - T) { - xRelation = IntervalRelations.PrecedesI; + return IntervalRelations.PrecedesI; } else if (b.BoundingBox.Left - T <= a.BoundingBox.Right && a.BoundingBox.Right <= b.BoundingBox.Left + T) { - xRelation = IntervalRelations.Meets; + return IntervalRelations.Meets; } else if (b.BoundingBox.Left - T > a.BoundingBox.Right && a.BoundingBox.Right > b.BoundingBox.Left + T) { - xRelation = IntervalRelations.MeetsI; + return IntervalRelations.MeetsI; } else if (a.BoundingBox.Left < b.BoundingBox.Left - T && (b.BoundingBox.Left + T < a.BoundingBox.Right && a.BoundingBox.Right < b.BoundingBox.Right - T)) { - xRelation = IntervalRelations.Overlaps; + return IntervalRelations.Overlaps; } else if (a.BoundingBox.Left >= b.BoundingBox.Left - T && (b.BoundingBox.Left + T >= a.BoundingBox.Right && a.BoundingBox.Right >= b.BoundingBox.Right - T)) { - xRelation = IntervalRelations.OverlapsI; + return IntervalRelations.OverlapsI; } - else if ((b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T) + else if (b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T && a.BoundingBox.Right < b.BoundingBox.Right - T) { - xRelation = IntervalRelations.Starts; + return IntervalRelations.Starts; } - else if ((b.BoundingBox.Left - T > a.BoundingBox.Left && a.BoundingBox.Left > b.BoundingBox.Left + T) + else if (b.BoundingBox.Left - T > a.BoundingBox.Left && a.BoundingBox.Left > b.BoundingBox.Left + T && a.BoundingBox.Right >= b.BoundingBox.Right - T) { - xRelation = IntervalRelations.StartsI; + return IntervalRelations.StartsI; } else if (a.BoundingBox.Left > b.BoundingBox.Left + T && a.BoundingBox.Right < b.BoundingBox.Right - T) { - xRelation = IntervalRelations.During; + return IntervalRelations.During; } else if (a.BoundingBox.Left <= b.BoundingBox.Left + T && a.BoundingBox.Right >= b.BoundingBox.Right - T) { - xRelation = IntervalRelations.DuringI; + return IntervalRelations.DuringI; } else if (a.BoundingBox.Left > b.BoundingBox.Left + T && (b.BoundingBox.Right - T <= a.BoundingBox.Right && a.BoundingBox.Right <= b.BoundingBox.Right + T)) { - xRelation = IntervalRelations.Finishes; + return IntervalRelations.Finishes; } else if (a.BoundingBox.Left <= b.BoundingBox.Left + T && (b.BoundingBox.Right - T > a.BoundingBox.Right && a.BoundingBox.Right > b.BoundingBox.Right + T)) { - xRelation = IntervalRelations.FinishesI; + return IntervalRelations.FinishesI; } - else if ((b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T) + else if (b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T && (b.BoundingBox.Right - T <= a.BoundingBox.Right && a.BoundingBox.Right <= b.BoundingBox.Right + T)) { - xRelation = IntervalRelations.Equals; + return IntervalRelations.Equals; } - return xRelation; + return IntervalRelations.Unknown; } /// /// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate. - /// The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page. - /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed + /// The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page. + /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete. /// /// @@ -287,79 +269,77 @@ /// The tolerance parameter T. If two coordinates are closer than T they are considered equal. private IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T) { - IntervalRelations yRelation = IntervalRelations.Unknown; - if (a.BoundingBox.Bottom < b.BoundingBox.Top - T) { - yRelation = IntervalRelations.PrecedesI; + return IntervalRelations.PrecedesI; } else if (a.BoundingBox.Bottom >= b.BoundingBox.Top - T) { - yRelation = IntervalRelations.Precedes; + return IntervalRelations.Precedes; } else if (b.BoundingBox.Top - T <= a.BoundingBox.Bottom && a.BoundingBox.Bottom <= b.BoundingBox.Top + T) { - yRelation = IntervalRelations.MeetsI; + return IntervalRelations.MeetsI; } else if (b.BoundingBox.Top - T > a.BoundingBox.Bottom && a.BoundingBox.Bottom > b.BoundingBox.Top + T) { - yRelation = IntervalRelations.Meets; + return IntervalRelations.Meets; } else if (a.BoundingBox.Top < b.BoundingBox.Top - T && (b.BoundingBox.Top + T < a.BoundingBox.Bottom && a.BoundingBox.Bottom < b.BoundingBox.Bottom - T)) { - yRelation = IntervalRelations.OverlapsI; + return IntervalRelations.OverlapsI; } else if (a.BoundingBox.Top >= b.BoundingBox.Top - T && (b.BoundingBox.Top + T >= a.BoundingBox.Bottom && a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T)) { - yRelation = IntervalRelations.Overlaps; + return IntervalRelations.Overlaps; } - else if ((b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T) + else if (b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T && a.BoundingBox.Bottom < b.BoundingBox.Bottom - T) { - yRelation = IntervalRelations.StartsI; + return IntervalRelations.StartsI; } - else if ((b.BoundingBox.Top - T > a.BoundingBox.Top && a.BoundingBox.Top > b.BoundingBox.Top + T) + else if (b.BoundingBox.Top - T > a.BoundingBox.Top && a.BoundingBox.Top > b.BoundingBox.Top + T && a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T) { - yRelation = IntervalRelations.Starts; + return IntervalRelations.Starts; } else if (a.BoundingBox.Top > b.BoundingBox.Top + T && a.BoundingBox.Bottom < b.BoundingBox.Bottom - T) { - yRelation = IntervalRelations.DuringI; + return IntervalRelations.DuringI; } else if (a.BoundingBox.Top <= b.BoundingBox.Top + T && a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T) { - yRelation = IntervalRelations.During; + return IntervalRelations.During; } else if (a.BoundingBox.Top > b.BoundingBox.Top + T && (b.BoundingBox.Bottom - T <= a.BoundingBox.Bottom && a.BoundingBox.Bottom <= b.BoundingBox.Bottom + T)) { - yRelation = IntervalRelations.FinishesI; + return IntervalRelations.FinishesI; } else if (a.BoundingBox.Top <= b.BoundingBox.Top + T && (b.BoundingBox.Bottom - T > a.BoundingBox.Bottom && a.BoundingBox.Bottom > b.BoundingBox.Bottom + T)) { - yRelation = IntervalRelations.Finishes; + return IntervalRelations.Finishes; } else if ((b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T) && (b.BoundingBox.Bottom - T <= a.BoundingBox.Bottom && a.BoundingBox.Bottom <= b.BoundingBox.Bottom + T)) { - yRelation = IntervalRelations.Equals; + return IntervalRelations.Equals; } - return yRelation; + return IntervalRelations.Unknown; } /// diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs index c0979f26..104aab19 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs @@ -29,15 +29,15 @@ /// /// The words in the page. /// The minimum number of elements to define a text edge. - /// Sets the maximum number of concurrent tasks enabled. - /// A positive property value limits the number of concurrent operations to the set value. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. public static IReadOnlyDictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4, int maxDegreeOfParallelism = -1) { if (minimumElements < 0) { - throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", "minimumElements"); + throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", nameof(minimumElements)); } var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text.Trim())); @@ -46,10 +46,7 @@ ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; - Parallel.ForEach(edgesFuncs, parallelOptions, f => - { - dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements)); - }); + Parallel.ForEach(edgesFuncs, parallelOptions, f => dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements))); return dictionary.ToDictionary(x => x.Key, x => x.Value); } diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs index 0085307b..f7485e7c 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs @@ -68,7 +68,7 @@ else if (previous.Value != " ") { var gap = letter.StartBaseLine.X - previous.EndBaseLine.X; - + if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous)) { sb.Append(" "); diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs index 1788639a..fde77b90 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs @@ -39,7 +39,7 @@ /// Lower bounds for the width of rectangles. /// Lower bounds for the height of rectangles. /// The maximum number of rectangles to find. - /// Constant value to allow candidate whitespace rectangle to overlap the + /// Constant value to allow candidate whitespace rectangle to overlap the /// surrounding obstacles by some percent. Default value is 15%. /// The maximum size of the queue used in the algorithm. /// The identified whitespace rectangles. @@ -49,7 +49,7 @@ var bboxes = words.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0) .Select(o => o.BoundingBox).ToList(); - if (images != null && images.Count() > 0) + if (images?.Any() == true) { bboxes.AddRange(images.Where(w => w.Bounds.Width > 0 && w.Bounds.Height > 0).Select(o => o.Bounds)); } @@ -69,14 +69,14 @@ /// Lower bounds for the width of rectangles. /// Lower bounds for the height of rectangles. /// The maximum number of rectangles to find. - /// Constant value to allow candidate whitespace rectangle to overlap the + /// Constant value to allow candidate whitespace rectangle to overlap the /// surrounding obstacles by some percent. Default value is 15%. /// The maximum size of the queue used in the algorithm. /// The identified whitespace rectangles. public static IReadOnlyList GetWhitespaces(IEnumerable boundingboxes, double minWidth, double minHeight, int maxRectangleCount = 40, double whitespaceFuzziness = 0.15, int maxBoundQueueSize = 0) { - if (boundingboxes.Count() == 0) return EmptyArray.Instance; + if (!boundingboxes.Any()) return EmptyArray.Instance; var obstacles = new HashSet(boundingboxes); var pageBound = GetBound(obstacles); @@ -195,51 +195,32 @@ return false; } - if (rectangle1.Left == rectangle2.Right || - rectangle1.Right == rectangle2.Left || - rectangle1.Bottom == rectangle2.Top || - rectangle1.Top == rectangle2.Bottom) - { - return true; - } - return false; + return rectangle1.Left == rectangle2.Right || + rectangle1.Right == rectangle2.Left || + rectangle1.Bottom == rectangle2.Top || + rectangle1.Top == rectangle2.Bottom; } private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle) { - if (rectangle.Bottom == pageBound.Bottom || - rectangle.Top == pageBound.Top || - rectangle.Left == pageBound.Left || - rectangle.Right == pageBound.Right) - { - return true; - } - - return false; + return rectangle.Bottom == pageBound.Bottom || + rectangle.Top == pageBound.Top || + rectangle.Left == pageBound.Left || + rectangle.Right == pageBound.Right; } private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2) { - if (rectangle1.Left >= rectangle2.Right || - rectangle2.Left >= rectangle1.Right || - rectangle1.Top <= rectangle2.Bottom || - rectangle2.Top <= rectangle1.Bottom) - { - return false; - } - - return true; + return rectangle1.Left < rectangle2.Right && + rectangle2.Left < rectangle1.Right && + rectangle1.Top > rectangle2.Bottom && + rectangle2.Top > rectangle1.Bottom; } private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2) { - if (rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left && - rectangle2.Top <= rectangle1.Top && rectangle2.Bottom >= rectangle1.Bottom) - { - return true; - } - - return false; + return rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left && + rectangle2.Top <= rectangle1.Top && rectangle2.Bottom >= rectangle1.Bottom; } private static PdfRectangle GetBound(IEnumerable obstacles) @@ -254,7 +235,7 @@ #region Sorted Queue private class QueueEntries : SortedSet { - readonly int bound; + private readonly int bound; public QueueEntries(int maximumBound) { @@ -306,7 +287,7 @@ public bool IsEmptyEnough() { - return !Obstacles.Any(); + return Obstacles.Count == 0; } public bool IsEmptyEnough(IEnumerable pageObstacles) @@ -349,12 +330,11 @@ { if (obj is QueueEntry entry) { - if (Bound.Left != entry.Bound.Left || - Bound.Right != entry.Bound.Right || - Bound.Top != entry.Bound.Top || - Bound.Bottom != entry.Bound.Bottom || - Obstacles != entry.Obstacles) return false; - return true; + return Bound.Left == entry.Bound.Left && + Bound.Right == entry.Bound.Right && + Bound.Top == entry.Bound.Top && + Bound.Bottom == entry.Bound.Bottom && + Obstacles == entry.Obstacles; } return false; } @@ -383,16 +363,6 @@ // solution. return rectangle.Area * (rectangle.Height / 4.0); } - - private static double OverlappingArea(PdfRectangle rectangle1, PdfRectangle rectangle2) - { - var intersect = rectangle1.Intersect(rectangle2); - if (intersect.HasValue) - { - return intersect.Value.Area; - } - return 0; - } } #endregion } diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs index cb43e826..2c181231 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs @@ -99,8 +99,8 @@ /// /// Function used to filter out connection between letters, e.g. check if the letters have the same color. /// If the function returns false, a new word will be created. - /// Sets the maximum number of concurrent tasks enabled. - /// A positive property value limits the number of concurrent operations to the set value. + /// Sets the maximum number of concurrent tasks enabled. + /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. private List GetWords(IReadOnlyList letters, Func maxDistanceFunction, Func distMeasure,