diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs
index 4c03b29a..bd11c7b5 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs
@@ -23,8 +23,8 @@
/// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.
/// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.
/// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.
- /// Sets the maximum number of concurrent tasks enabled.
- /// A positive property value limits the number of concurrent operations to the set value.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
public static IEnumerable> NearestNeighbours(IReadOnlyList elements,
Func distMeasure,
@@ -91,8 +91,8 @@
/// The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.
/// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.
/// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.
- /// Sets the maximum number of concurrent tasks enabled.
- /// A positive property value limits the number of concurrent operations to the set value.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
public static IEnumerable> NearestNeighbours(IReadOnlyList elements, int k,
Func distMeasure,
@@ -159,8 +159,8 @@
/// The candidates' line to use for pairing.
/// Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.
/// Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.
- /// Sets the maximum number of concurrent tasks enabled.
- /// A positive property value limits the number of concurrent operations to the set value.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
public static IEnumerable> NearestNeighbours(IReadOnlyList elements,
Func distMeasure,
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
index a44bea5a..ba7e9d68 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
@@ -14,19 +14,19 @@
///
/// Algorithm that retrieve blocks that are labelled as decoration (e.g. headers, footers) for each page in the document, using a content and a geometric similarity measure.
/// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
- /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
+ /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.
/// See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.
///
public static class DecorationTextBlockClassifier
{
private static readonly Regex NumbersPattern = new Regex(@"(\d+)|(\b([MDCLXVI]+)\b)", RegexOptions.IgnoreCase);
- private static string replacementChar = "@";
+ private const string replacementChar = "@";
///
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
- /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
+ /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.
///
/// The s in the document. All of them are needed for the algorithm to work.
@@ -34,8 +34,8 @@
///
/// Minimum similarity score to decide wether a block is labelled as decoration or not.
/// Number of blocks in a page to be considered when looking for decoration blocks.
- /// Sets the maximum number of concurrent tasks enabled.
- /// A positive property value limits the number of concurrent operations to the set value.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
public static IReadOnlyList> Get(IReadOnlyList pages,
IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
@@ -47,7 +47,7 @@
///
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
- /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
+ /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.
///
/// The s in the document. All of them are needed for the algorithm to work.
@@ -56,8 +56,8 @@
/// Minimum edit distance normalised. A value of 0 means both strings are exactly equal.
/// Minimum similarity score to decide wether a block is labelled as decoration or not.
/// Number of blocks in a page to be considered when looking for decoration blocks.
- /// Sets the maximum number of concurrent tasks enabled.
- /// A positive property value limits the number of concurrent operations to the set value.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
public static IReadOnlyList> Get(IReadOnlyList pages,
IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Func minimumEditDistanceNormalised,
@@ -92,14 +92,14 @@
///
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
- /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
+ /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.
///
/// The s of every pages in the document. All of them are needed for the algorithm to work.
/// Minimum similarity score to decide wether a block is labelled as decoration or not.
/// Number of blocks in a page to be considered when looking for decoration blocks.
- /// Sets the maximum number of concurrent tasks enabled.
- /// A positive property value limits the number of concurrent operations to the set value.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
public static IReadOnlyList> Get(IReadOnlyList> pagesTextBlocks,
double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
@@ -110,15 +110,15 @@
///
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
- /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
+ /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.
///
/// The s of every pages in the document. All of them are needed for the algorithm to work.
/// Minimum edit distance normalised. A value of 0 means both strings are exactly equal.
/// Minimum similarity score to decide wether a block is labelled as decoration or not.
/// Number of blocks in a page to be considered when looking for decoration blocks.
- /// Sets the maximum number of concurrent tasks enabled.
- /// A positive property value limits the number of concurrent operations to the set value.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
public static IReadOnlyList> Get(IReadOnlyList> pagesTextBlocks,
Func minimumEditDistanceNormalised, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
@@ -219,7 +219,7 @@
}
///
- /// [The content similarity] is calculated from the normalized edit
+ /// [The content similarity] is calculated from the normalized edit
/// distance between the two content strings, where digits are replaced with “@” chars.
/// A content similarity of 1 is reached when both strings are exactly equal.
///
@@ -248,7 +248,7 @@
}
///
- /// This similarity score is a value in the range [0,1] and given
+ /// This similarity score is a value in the range [0,1] and is given
/// by the product between the content and the geometric similarity.
///
private static double Similarity(TextBlock b1, TextBlock b2, Func minimumEditDistanceNormalised)
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs
index 3c149ed8..0b151f75 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/MathExtensions.cs
@@ -15,7 +15,7 @@
/// The mode of the sequence. Returns if the sequence has no mode or if it is not unique.
public static float Mode(this IEnumerable array)
{
- if (array == null || !array.Any()) return float.NaN;
+ if (array?.Any() != true) return float.NaN;
var sorted = array.GroupBy(v => v).Select(v => (v.Count(), v.Key)).OrderByDescending(g => g.Item1);
var mode = sorted.First();
if (sorted.Count() > 1 && mode.Item1 == sorted.ElementAt(1).Item1) return float.NaN;
@@ -29,7 +29,7 @@
/// The mode of the sequence. Returns if the sequence has no mode or if it is not unique.
public static double Mode(this IEnumerable array)
{
- if (array == null || !array.Any()) return double.NaN;
+ if (array?.Any() != true) return double.NaN;
var sorted = array.GroupBy(v => v).Select(v => (v.Count(), v.Key)).OrderByDescending(g => g.Item1);
var mode = sorted.First();
if (sorted.Count() > 1 && mode.Item1 == sorted.ElementAt(1).Item1) return double.NaN;
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs
index 25a2ba96..2754b888 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs
@@ -15,13 +15,13 @@
///
public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector();
- private double T;
+ private readonly double T;
///
/// Algorithm that retrieve the blocks' reading order using both (spatial) Allen’s interval relations and rendering order.
///
/// The tolerance parameter T. If two coordinates are closer than T they are considered equal.
- /// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the
+ /// This flexibility is necessary because, due to the inherent noise in the PDF extraction, text blocks in the
/// same column might not be exactly aligned.
public UnsupervisedReadingOrderDetector(double T = 5)
{
@@ -38,10 +38,10 @@
var graph = BuildGraph(textBlocks, T);
- while (graph.Any())
+ while (graph.Count > 0)
{
var maxCount = graph.Max(kvp => kvp.Value.Count);
- var current = graph.Where(kvp => kvp.Value.Count == maxCount).FirstOrDefault();
+ var current = graph.FirstOrDefault(kvp => kvp.Value.Count == maxCount);
graph.Remove(current.Key);
int index = current.Key;
@@ -105,19 +105,14 @@
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
- if (xRelation == IntervalRelations.Precedes ||
- yRelation == IntervalRelations.Precedes ||
- xRelation == IntervalRelations.Meets ||
- yRelation == IntervalRelations.Meets ||
- xRelation == IntervalRelations.Overlaps ||
- yRelation == IntervalRelations.Overlaps)
- {
- return true;
- }
-
- return false;
+ return xRelation == IntervalRelations.Precedes ||
+ yRelation == IntervalRelations.Precedes ||
+ xRelation == IntervalRelations.Meets ||
+ yRelation == IntervalRelations.Meets ||
+ xRelation == IntervalRelations.Overlaps ||
+ yRelation == IntervalRelations.Overlaps;
}
-
+
///
/// Column-wise: text-blocks are read in columns, from top-to-bottom and from left-to-right.
///
@@ -130,7 +125,7 @@
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
- if (xRelation == IntervalRelations.Precedes ||
+ return xRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
(xRelation == IntervalRelations.Overlaps && (yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
@@ -146,12 +141,7 @@
xRelation == IntervalRelations.DuringI ||
xRelation == IntervalRelations.Finishes ||
xRelation == IntervalRelations.StartsI ||
- xRelation == IntervalRelations.OverlapsI)))
- {
- return true;
- }
-
- return false;
+ xRelation == IntervalRelations.OverlapsI));
}
///
@@ -160,40 +150,34 @@
///
///
/// The tolerance parameter T.
- ///
private bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
{
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
- if (yRelation == IntervalRelations.Precedes ||
- yRelation == IntervalRelations.Meets ||
- (yRelation == IntervalRelations.Overlaps && (xRelation == IntervalRelations.Precedes ||
- xRelation == IntervalRelations.Meets ||
- xRelation == IntervalRelations.Overlaps)) ||
- ((xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets || xRelation == IntervalRelations.Overlaps) &&
- (yRelation == IntervalRelations.Precedes ||
- yRelation == IntervalRelations.Meets ||
- yRelation == IntervalRelations.Overlaps ||
- yRelation == IntervalRelations.Starts ||
- yRelation == IntervalRelations.FinishesI ||
- yRelation == IntervalRelations.Equals ||
- yRelation == IntervalRelations.During ||
- yRelation == IntervalRelations.DuringI ||
- yRelation == IntervalRelations.Finishes ||
- yRelation == IntervalRelations.StartsI ||
- yRelation == IntervalRelations.OverlapsI)))
- {
- return true;
- }
-
- return false;
+ return yRelation == IntervalRelations.Precedes ||
+ yRelation == IntervalRelations.Meets ||
+ (yRelation == IntervalRelations.Overlaps && (xRelation == IntervalRelations.Precedes ||
+ xRelation == IntervalRelations.Meets ||
+ xRelation == IntervalRelations.Overlaps)) ||
+ ((xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets || xRelation == IntervalRelations.Overlaps) &&
+ (yRelation == IntervalRelations.Precedes ||
+ yRelation == IntervalRelations.Meets ||
+ yRelation == IntervalRelations.Overlaps ||
+ yRelation == IntervalRelations.Starts ||
+ yRelation == IntervalRelations.FinishesI ||
+ yRelation == IntervalRelations.Equals ||
+ yRelation == IntervalRelations.During ||
+ yRelation == IntervalRelations.DuringI ||
+ yRelation == IntervalRelations.Finishes ||
+ yRelation == IntervalRelations.StartsI ||
+ yRelation == IntervalRelations.OverlapsI));
}
///
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate.
- /// The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
- /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
+ /// The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
+ /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.
///
///
@@ -201,85 +185,83 @@
/// The tolerance parameter T. If two coordinates are closer than T they are considered equal.
private IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
{
- IntervalRelations xRelation = IntervalRelations.Unknown;
-
if (a.BoundingBox.Right < b.BoundingBox.Left - T)
{
- xRelation = IntervalRelations.Precedes;
+ return IntervalRelations.Precedes;
}
else if (a.BoundingBox.Right >= b.BoundingBox.Left - T)
{
- xRelation = IntervalRelations.PrecedesI;
+ return IntervalRelations.PrecedesI;
}
else if (b.BoundingBox.Left - T <= a.BoundingBox.Right
&& a.BoundingBox.Right <= b.BoundingBox.Left + T)
{
- xRelation = IntervalRelations.Meets;
+ return IntervalRelations.Meets;
}
else if (b.BoundingBox.Left - T > a.BoundingBox.Right
&& a.BoundingBox.Right > b.BoundingBox.Left + T)
{
- xRelation = IntervalRelations.MeetsI;
+ return IntervalRelations.MeetsI;
}
else if (a.BoundingBox.Left < b.BoundingBox.Left - T
&& (b.BoundingBox.Left + T < a.BoundingBox.Right && a.BoundingBox.Right < b.BoundingBox.Right - T))
{
- xRelation = IntervalRelations.Overlaps;
+ return IntervalRelations.Overlaps;
}
else if (a.BoundingBox.Left >= b.BoundingBox.Left - T
&& (b.BoundingBox.Left + T >= a.BoundingBox.Right && a.BoundingBox.Right >= b.BoundingBox.Right - T))
{
- xRelation = IntervalRelations.OverlapsI;
+ return IntervalRelations.OverlapsI;
}
- else if ((b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T)
+ else if (b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T
&& a.BoundingBox.Right < b.BoundingBox.Right - T)
{
- xRelation = IntervalRelations.Starts;
+ return IntervalRelations.Starts;
}
- else if ((b.BoundingBox.Left - T > a.BoundingBox.Left && a.BoundingBox.Left > b.BoundingBox.Left + T)
+ else if (b.BoundingBox.Left - T > a.BoundingBox.Left && a.BoundingBox.Left > b.BoundingBox.Left + T
&& a.BoundingBox.Right >= b.BoundingBox.Right - T)
{
- xRelation = IntervalRelations.StartsI;
+ return IntervalRelations.StartsI;
}
else if (a.BoundingBox.Left > b.BoundingBox.Left + T
&& a.BoundingBox.Right < b.BoundingBox.Right - T)
{
- xRelation = IntervalRelations.During;
+ return IntervalRelations.During;
}
else if (a.BoundingBox.Left <= b.BoundingBox.Left + T
&& a.BoundingBox.Right >= b.BoundingBox.Right - T)
{
- xRelation = IntervalRelations.DuringI;
+ return IntervalRelations.DuringI;
}
else if (a.BoundingBox.Left > b.BoundingBox.Left + T
&& (b.BoundingBox.Right - T <= a.BoundingBox.Right && a.BoundingBox.Right <= b.BoundingBox.Right + T))
{
- xRelation = IntervalRelations.Finishes;
+ return IntervalRelations.Finishes;
}
else if (a.BoundingBox.Left <= b.BoundingBox.Left + T
&& (b.BoundingBox.Right - T > a.BoundingBox.Right && a.BoundingBox.Right > b.BoundingBox.Right + T))
{
- xRelation = IntervalRelations.FinishesI;
+ return IntervalRelations.FinishesI;
}
- else if ((b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T)
+ else if (b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T
&& (b.BoundingBox.Right - T <= a.BoundingBox.Right && a.BoundingBox.Right <= b.BoundingBox.Right + T))
{
- xRelation = IntervalRelations.Equals;
+ return IntervalRelations.Equals;
}
- return xRelation;
+ return IntervalRelations.Unknown;
}
///
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate.
- /// The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
- /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
+ /// The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
+ /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.
///
///
@@ -287,79 +269,77 @@
/// The tolerance parameter T. If two coordinates are closer than T they are considered equal.
private IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
{
- IntervalRelations yRelation = IntervalRelations.Unknown;
-
if (a.BoundingBox.Bottom < b.BoundingBox.Top - T)
{
- yRelation = IntervalRelations.PrecedesI;
+ return IntervalRelations.PrecedesI;
}
else if (a.BoundingBox.Bottom >= b.BoundingBox.Top - T)
{
- yRelation = IntervalRelations.Precedes;
+ return IntervalRelations.Precedes;
}
else if (b.BoundingBox.Top - T <= a.BoundingBox.Bottom
&& a.BoundingBox.Bottom <= b.BoundingBox.Top + T)
{
- yRelation = IntervalRelations.MeetsI;
+ return IntervalRelations.MeetsI;
}
else if (b.BoundingBox.Top - T > a.BoundingBox.Bottom
&& a.BoundingBox.Bottom > b.BoundingBox.Top + T)
{
- yRelation = IntervalRelations.Meets;
+ return IntervalRelations.Meets;
}
else if (a.BoundingBox.Top < b.BoundingBox.Top - T
&& (b.BoundingBox.Top + T < a.BoundingBox.Bottom && a.BoundingBox.Bottom < b.BoundingBox.Bottom - T))
{
- yRelation = IntervalRelations.OverlapsI;
+ return IntervalRelations.OverlapsI;
}
else if (a.BoundingBox.Top >= b.BoundingBox.Top - T
&& (b.BoundingBox.Top + T >= a.BoundingBox.Bottom && a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T))
{
- yRelation = IntervalRelations.Overlaps;
+ return IntervalRelations.Overlaps;
}
- else if ((b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T)
+ else if (b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T
&& a.BoundingBox.Bottom < b.BoundingBox.Bottom - T)
{
- yRelation = IntervalRelations.StartsI;
+ return IntervalRelations.StartsI;
}
- else if ((b.BoundingBox.Top - T > a.BoundingBox.Top && a.BoundingBox.Top > b.BoundingBox.Top + T)
+ else if (b.BoundingBox.Top - T > a.BoundingBox.Top && a.BoundingBox.Top > b.BoundingBox.Top + T
&& a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T)
{
- yRelation = IntervalRelations.Starts;
+ return IntervalRelations.Starts;
}
else if (a.BoundingBox.Top > b.BoundingBox.Top + T
&& a.BoundingBox.Bottom < b.BoundingBox.Bottom - T)
{
- yRelation = IntervalRelations.DuringI;
+ return IntervalRelations.DuringI;
}
else if (a.BoundingBox.Top <= b.BoundingBox.Top + T
&& a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T)
{
- yRelation = IntervalRelations.During;
+ return IntervalRelations.During;
}
else if (a.BoundingBox.Top > b.BoundingBox.Top + T
&& (b.BoundingBox.Bottom - T <= a.BoundingBox.Bottom && a.BoundingBox.Bottom <= b.BoundingBox.Bottom + T))
{
- yRelation = IntervalRelations.FinishesI;
+ return IntervalRelations.FinishesI;
}
else if (a.BoundingBox.Top <= b.BoundingBox.Top + T
&& (b.BoundingBox.Bottom - T > a.BoundingBox.Bottom && a.BoundingBox.Bottom > b.BoundingBox.Bottom + T))
{
- yRelation = IntervalRelations.Finishes;
+ return IntervalRelations.Finishes;
}
else if ((b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T)
&& (b.BoundingBox.Bottom - T <= a.BoundingBox.Bottom && a.BoundingBox.Bottom <= b.BoundingBox.Bottom + T))
{
- yRelation = IntervalRelations.Equals;
+ return IntervalRelations.Equals;
}
- return yRelation;
+ return IntervalRelations.Unknown;
}
///
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs
index c0979f26..104aab19 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextEdgesExtractor.cs
@@ -29,15 +29,15 @@
///
/// The words in the page.
/// The minimum number of elements to define a text edge.
- /// Sets the maximum number of concurrent tasks enabled.
- /// A positive property value limits the number of concurrent operations to the set value.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
public static IReadOnlyDictionary> GetEdges(IEnumerable pageWords, int minimumElements = 4,
int maxDegreeOfParallelism = -1)
{
if (minimumElements < 0)
{
- throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", "minimumElements");
+ throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", nameof(minimumElements));
}
var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text.Trim()));
@@ -46,10 +46,7 @@
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
- Parallel.ForEach(edgesFuncs, parallelOptions, f =>
- {
- dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
- });
+ Parallel.ForEach(edgesFuncs, parallelOptions, f => dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements)));
return dictionary.ToDictionary(x => x.Key, x => x.Value);
}
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
index 0085307b..f7485e7c 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
@@ -68,7 +68,7 @@
else if (previous.Value != " ")
{
var gap = letter.StartBaseLine.X - previous.EndBaseLine.X;
-
+
if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous))
{
sb.Append(" ");
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs
index 1788639a..fde77b90 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WhitespaceCoverExtractor.cs
@@ -39,7 +39,7 @@
/// Lower bounds for the width of rectangles.
/// Lower bounds for the height of rectangles.
/// The maximum number of rectangles to find.
- /// Constant value to allow candidate whitespace rectangle to overlap the
+ /// Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.
/// The maximum size of the queue used in the algorithm.
/// The identified whitespace rectangles.
@@ -49,7 +49,7 @@
var bboxes = words.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0)
.Select(o => o.BoundingBox).ToList();
- if (images != null && images.Count() > 0)
+ if (images?.Any() == true)
{
bboxes.AddRange(images.Where(w => w.Bounds.Width > 0 && w.Bounds.Height > 0).Select(o => o.Bounds));
}
@@ -69,14 +69,14 @@
/// Lower bounds for the width of rectangles.
/// Lower bounds for the height of rectangles.
/// The maximum number of rectangles to find.
- /// Constant value to allow candidate whitespace rectangle to overlap the
+ /// Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.
/// The maximum size of the queue used in the algorithm.
/// The identified whitespace rectangles.
public static IReadOnlyList GetWhitespaces(IEnumerable boundingboxes,
double minWidth, double minHeight, int maxRectangleCount = 40, double whitespaceFuzziness = 0.15, int maxBoundQueueSize = 0)
{
- if (boundingboxes.Count() == 0) return EmptyArray.Instance;
+ if (!boundingboxes.Any()) return EmptyArray.Instance;
var obstacles = new HashSet(boundingboxes);
var pageBound = GetBound(obstacles);
@@ -195,51 +195,32 @@
return false;
}
- if (rectangle1.Left == rectangle2.Right ||
- rectangle1.Right == rectangle2.Left ||
- rectangle1.Bottom == rectangle2.Top ||
- rectangle1.Top == rectangle2.Bottom)
- {
- return true;
- }
- return false;
+ return rectangle1.Left == rectangle2.Right ||
+ rectangle1.Right == rectangle2.Left ||
+ rectangle1.Bottom == rectangle2.Top ||
+ rectangle1.Top == rectangle2.Bottom;
}
private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle)
{
- if (rectangle.Bottom == pageBound.Bottom ||
- rectangle.Top == pageBound.Top ||
- rectangle.Left == pageBound.Left ||
- rectangle.Right == pageBound.Right)
- {
- return true;
- }
-
- return false;
+ return rectangle.Bottom == pageBound.Bottom ||
+ rectangle.Top == pageBound.Top ||
+ rectangle.Left == pageBound.Left ||
+ rectangle.Right == pageBound.Right;
}
private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
- if (rectangle1.Left >= rectangle2.Right ||
- rectangle2.Left >= rectangle1.Right ||
- rectangle1.Top <= rectangle2.Bottom ||
- rectangle2.Top <= rectangle1.Bottom)
- {
- return false;
- }
-
- return true;
+ return rectangle1.Left < rectangle2.Right &&
+ rectangle2.Left < rectangle1.Right &&
+ rectangle1.Top > rectangle2.Bottom &&
+ rectangle2.Top > rectangle1.Bottom;
}
private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
- if (rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left &&
- rectangle2.Top <= rectangle1.Top && rectangle2.Bottom >= rectangle1.Bottom)
- {
- return true;
- }
-
- return false;
+ return rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left &&
+ rectangle2.Top <= rectangle1.Top && rectangle2.Bottom >= rectangle1.Bottom;
}
private static PdfRectangle GetBound(IEnumerable obstacles)
@@ -254,7 +235,7 @@
#region Sorted Queue
private class QueueEntries : SortedSet
{
- readonly int bound;
+ private readonly int bound;
public QueueEntries(int maximumBound)
{
@@ -306,7 +287,7 @@
public bool IsEmptyEnough()
{
- return !Obstacles.Any();
+ return Obstacles.Count == 0;
}
public bool IsEmptyEnough(IEnumerable pageObstacles)
@@ -349,12 +330,11 @@
{
if (obj is QueueEntry entry)
{
- if (Bound.Left != entry.Bound.Left ||
- Bound.Right != entry.Bound.Right ||
- Bound.Top != entry.Bound.Top ||
- Bound.Bottom != entry.Bound.Bottom ||
- Obstacles != entry.Obstacles) return false;
- return true;
+ return Bound.Left == entry.Bound.Left &&
+ Bound.Right == entry.Bound.Right &&
+ Bound.Top == entry.Bound.Top &&
+ Bound.Bottom == entry.Bound.Bottom &&
+ Obstacles == entry.Obstacles;
}
return false;
}
@@ -383,16 +363,6 @@
// solution.
return rectangle.Area * (rectangle.Height / 4.0);
}
-
- private static double OverlappingArea(PdfRectangle rectangle1, PdfRectangle rectangle2)
- {
- var intersect = rectangle1.Intersect(rectangle2);
- if (intersect.HasValue)
- {
- return intersect.Value.Area;
- }
- return 0;
- }
}
#endregion
}
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
index cb43e826..2c181231 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs
@@ -99,8 +99,8 @@
///
/// Function used to filter out connection between letters, e.g. check if the letters have the same color.
/// If the function returns false, a new word will be created.
- /// Sets the maximum number of concurrent tasks enabled.
- /// A positive property value limits the number of concurrent operations to the set value.
+ /// Sets the maximum number of concurrent tasks enabled.
+ /// A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.
private List GetWords(IReadOnlyList letters,
Func maxDistanceFunction, Func distMeasure,