This commit is contained in:
BobLd
2020-06-01 13:54:01 +01:00
47 changed files with 2162 additions and 801 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 237 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 119 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 74 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 237 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 MiB

View File

@@ -23,10 +23,10 @@
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IEnumerable<HashSet<int>> NearestNeighbours<T>(IReadOnlyList<T> elements,
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements,
Func<PdfPoint, PdfPoint, double> distMeasure,
Func<T, T, double> maxDistanceFunction,
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
@@ -64,18 +64,18 @@
{
var paired = kdTree.FindNearestNeighbour(pivot, pivotPoint, distMeasure, out int index, out double dist);
if (index != -1)
if (index != -1 && filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
{
if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
{
indexes[e] = index;
}
indexes[e] = index;
}
}
});
// 2. Group indexes
return GroupIndexes(indexes);
foreach (var group in GroupIndexes(indexes))
{
yield return group.Select(i => elements[i]).ToList();
}
}
/// <summary>
@@ -91,10 +91,10 @@
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IEnumerable<HashSet<int>> NearestNeighbours<T>(IReadOnlyList<T> elements, int k,
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements, int k,
Func<PdfPoint, PdfPoint, double> distMeasure,
Func<T, T, double> maxDistanceFunction,
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
@@ -130,9 +130,7 @@
if (filterPivot(pivot))
{
var paired = kdTree.FindNearestNeighbours(pivot, k, pivotPoint, distMeasure);
foreach (var c in paired)
foreach (var c in kdTree.FindNearestNeighbours(pivot, k, pivotPoint, distMeasure))
{
if (filterFinal(pivot, c.Item1) && c.Item3 < maxDistanceFunction(pivot, c.Item1))
{
@@ -144,7 +142,10 @@
});
// 2. Group indexes
return GroupIndexes(indexes);
foreach (var group in GroupIndexes(indexes))
{
yield return group.Select(i => elements[i]).ToList();
}
}
/// <summary>
@@ -158,10 +159,10 @@
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IEnumerable<HashSet<int>> NearestNeighbours<T>(IReadOnlyList<T> elements,
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements,
Func<PdfLine, PdfLine, double> distMeasure,
Func<T, T, double> maxDistanceFunction,
Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidatesLine,
@@ -196,7 +197,7 @@
if (filterPivot(pivot))
{
int index = Distances.FindIndexNearest(pivot, elements, candidatesLine, pivotLine, distMeasure, out double dist);
int index = Distances.FindIndexNearest(pivot, elements, pivotLine, candidatesLine, distMeasure, out double dist);
if (index != -1)
{
@@ -210,7 +211,10 @@
});
// 2. Group indexes
return GroupIndexes(indexes);
foreach (var group in GroupIndexes(indexes))
{
yield return group.Select(i => elements[i]).ToList();
}
}
/// <summary>
@@ -218,7 +222,7 @@
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="edges">The graph. edges[i] = j indicates that there is an edge between i and j.</param>
/// <returns>A List of HashSets containing containing the grouped indexes.</returns>
/// <returns>A List of HashSets containing the grouped indexes.</returns>
internal static List<HashSet<int>> GroupIndexes(int[] edges)
{
int[][] adjacency = new int[edges.Length][];
@@ -249,7 +253,7 @@
/// <para>https://en.wikipedia.org/wiki/Depth-first_search</para>
/// </summary>
/// <param name="edges">The graph. edges[i] = [j, k, l, ...] indicates that there is an edge between i and each element j, k, l, ...</param>
/// <returns>A List of HashSets containing containing the grouped indexes.</returns>
/// <returns>A List of HashSets containing the grouped indexes.</returns>
internal static List<HashSet<int>> GroupIndexes(int[][] edges)
{
int[][] adjacency = new int[edges.Length][];
@@ -292,7 +296,7 @@
Stack<int> S = new Stack<int>();
S.Push(s);
while (S.Any())
while (S.Count > 0)
{
var u = S.Pop();
if (!isDone[u])

View File

@@ -14,19 +14,19 @@
/// <summary>
/// Algorithm that retrieves blocks that are labelled as decoration (e.g. headers, footers) for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// <para>See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para>
/// </summary>
public static class DecorationTextBlockClassifier
{
private static readonly Regex NumbersPattern = new Regex(@"(\d+)|(\b([MDCLXVI]+)\b)", RegexOptions.IgnoreCase);
private static string replacementChar = "@";
private const string replacementChar = "@";
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
@@ -34,8 +34,8 @@
/// <param name="pageSegmenter"></param>
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages,
IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
@@ -47,7 +47,7 @@
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
@@ -56,8 +56,8 @@
/// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages,
IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Func<string, string, double> minimumEditDistanceNormalised,
@@ -92,14 +92,14 @@
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pagesTextBlocks">The <see cref="TextBlock"/>s of every page in the document. All of them are needed for the algorithm to work.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<IReadOnlyList<TextBlock>> pagesTextBlocks,
double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
@@ -110,15 +110,15 @@
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pagesTextBlocks">The <see cref="TextBlock"/>s of every page in the document. All of them are needed for the algorithm to work.</param>
/// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<IReadOnlyList<TextBlock>> pagesTextBlocks,
Func<string, string, double> minimumEditDistanceNormalised, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
@@ -219,7 +219,7 @@
}
/// <summary>
/// [The content similarity] is calculated from the normalized edit
/// [The content similarity] is calculated from the normalized edit
/// distance between the two content strings, where digits are replaced with “@” chars.
/// A content similarity of 1 is reached when both strings are exactly equal.
/// </summary>
@@ -248,7 +248,7 @@
}
/// <summary>
/// This similarity score is a value in the range [0,1] and given
/// This similarity score is a value in the range [0,1] and given
/// by the product between the content and the geometric similarity.
/// </summary>
private static double Similarity(TextBlock b1, TextBlock b2, Func<string, string, double> minimumEditDistanceNormalised)

View File

@@ -78,6 +78,28 @@
return Math.Abs(point2.X - point1.X);
}
/// <summary>
/// Wrap an angle, in degrees, into the range -180 ≤ θ &lt; 180.
/// <para>The range is nominally half-open: an input of exactly 180 wraps to -180
/// (floating-point rounding aside).</para>
/// </summary>
/// <param name="angle">The angle to bound.</param>
/// <returns>The equivalent angle after wrapping.</returns>
public static double BoundAngle180(double angle)
{
    // Shift by half a turn so the wrap point sits at zero, reduce modulo a full turn,
    // then undo the shift. The C# remainder keeps the sign of the dividend, hence the correction.
    double shifted = (angle + 180) % 360;
    return (shifted < 0 ? shifted + 360 : shifted) - 180;
}
/// <summary>
/// Wrap an angle, in degrees, into the range 0 ≤ θ &lt; 360.
/// <para>The range is nominally half-open: an input of exactly 360 wraps to 0
/// (floating-point rounding aside).</para>
/// </summary>
/// <param name="angle">The angle to bound.</param>
/// <returns>The equivalent angle after wrapping.</returns>
public static double BoundAngle0to360(double angle)
{
    // The C# remainder keeps the sign of the dividend, so negative inputs
    // need one extra full turn to land in the positive range.
    double wrapped = angle % 360;
    if (wrapped < 0)
    {
        wrapped += 360;
    }

    return wrapped;
}
/// <summary>
/// Get the minimum edit distance between two strings.
/// </summary>
@@ -127,32 +149,32 @@
/// <typeparam name="T"></typeparam>
/// <param name="element">The reference point, for which to find the nearest neighbour.</param>
/// <param name="candidates">The list of neighbours candidates.</param>
/// <param name="candidatesPoint"></param>
/// <param name="pivotPoint"></param>
/// <param name="candidatePoint"></param>
/// <param name="distanceMeasure">The distance measure to use.</param>
/// <param name="distance">The distance between reference point, and its nearest neighbour.</param>
/// <param name="distance">The distance between the reference element and its nearest neighbour.</param>
public static int FindIndexNearest<T>(T element, IReadOnlyList<T> candidates,
Func<T, PdfPoint> candidatesPoint, Func<T, PdfPoint> pivotPoint,
Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatePoint,
Func<PdfPoint, PdfPoint, double> distanceMeasure, out double distance)
{
if (candidates == null || candidates.Count == 0)
{
throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", "points");
throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", nameof(candidates));
}
if (distanceMeasure == null)
{
throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", "distanceMeasure");
throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", nameof(distanceMeasure));
}
distance = double.MaxValue;
int closestPointIndex = -1;
var candidatesPoints = candidates.Select(candidatesPoint).ToList();
var candidatesPoints = candidates.Select(candidatePoint).ToList();
var pivot = pivotPoint(element);
for (var i = 0; i < candidates.Count; i++)
{
double currentDistance = distanceMeasure(candidatesPoints[i], pivot);
double currentDistance = distanceMeasure(pivot, candidatesPoints[i]);
if (currentDistance < distance && !candidates[i].Equals(element))
{
distance = currentDistance;
@@ -169,32 +191,32 @@
/// <typeparam name="T"></typeparam>
/// <param name="element">The reference line, for which to find the nearest neighbour.</param>
/// <param name="candidates">The list of neighbours candidates.</param>
/// <param name="candidatesLine"></param>
/// <param name="pivotLine"></param>
/// <param name="candidateLine"></param>
/// <param name="distanceMeasure">The distance measure between two lines to use.</param>
/// <param name="distance">The distance between reference line, and its nearest neighbour.</param>
/// <param name="distance">The distance between the reference element and its nearest neighbour.</param>
public static int FindIndexNearest<T>(T element, IReadOnlyList<T> candidates,
Func<T, PdfLine> candidatesLine, Func<T, PdfLine> pivotLine,
Func<T, PdfLine> pivotLine, Func<T, PdfLine> candidateLine,
Func<PdfLine, PdfLine, double> distanceMeasure, out double distance)
{
if (candidates == null || candidates.Count == 0)
{
throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", "lines");
throw new ArgumentException("Distances.FindIndexNearest(): The list of neighbours candidates is either null or empty.", nameof(candidates));
}
if (distanceMeasure == null)
{
throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", "distanceMeasure");
throw new ArgumentException("Distances.FindIndexNearest(): The distance measure must not be null.", nameof(distanceMeasure));
}
distance = double.MaxValue;
int closestLineIndex = -1;
var candidatesLines = candidates.Select(candidatesLine).ToList();
var candidatesLines = candidates.Select(candidateLine).ToList();
var pivot = pivotLine(element);
for (var i = 0; i < candidates.Count; i++)
{
double currentDistance = distanceMeasure(candidatesLines[i], pivot);
double currentDistance = distanceMeasure(pivot, candidatesLines[i]);
if (currentDistance < distance && !candidates[i].Equals(element))
{
distance = currentDistance;

View File

@@ -0,0 +1,16 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// Abstract base class for the option objects that configure the document layout analysis algorithms.
/// <para>It declares no abstract members; it only carries the settings shared by every
/// options type, currently <see cref="MaxDegreeOfParallelism"/>.</para>
/// </summary>
public abstract class DlaOptions
{
/// <summary>
/// Gets or sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para>
/// <para>Default value is -1.</para>
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
}
}

View File

@@ -16,7 +16,7 @@
/// K-D tree data structure of <see cref="PdfPoint"/>.
/// </summary>
/// <param name="points">The points used to build the tree.</param>
public KdTree(PdfPoint[] points) : base(points, p => p)
public KdTree(IReadOnlyList<PdfPoint> points) : base(points, p => p)
{ }
/// <summary>

View File

@@ -15,7 +15,7 @@
/// <returns>The mode of the sequence. Returns <see cref="float.NaN"/> if the sequence has no mode or if it is not unique.</returns>
public static float Mode(this IEnumerable<float> array)
{
if (array == null || array.Count() == 0) return float.NaN;
if (array?.Any() != true) return float.NaN;
var sorted = array.GroupBy(v => v).Select(v => (v.Count(), v.Key)).OrderByDescending(g => g.Item1);
var mode = sorted.First();
if (sorted.Count() > 1 && mode.Item1 == sorted.ElementAt(1).Item1) return float.NaN;
@@ -29,11 +29,32 @@
/// <returns>The mode of the sequence. Returns <see cref="double.NaN"/> if the sequence has no mode or if it is not unique.</returns>
public static double Mode(this IEnumerable<double> array)
{
    // A null sequence has no mode.
    if (array == null)
    {
        return double.NaN;
    }

    // Group and count in a single pass over the input, then keep only the two most
    // frequent groups. Materialising the result avoids re-running the deferred
    // GroupBy/OrderBy pipeline for every later access and avoids enumerating
    // 'array' more than once.
    var top = array.GroupBy(v => v)
                   .Select(g => (Count: g.Count(), Key: g.Key))
                   .OrderByDescending(g => g.Count)
                   .Take(2)
                   .ToList();

    // An empty sequence has no mode.
    if (top.Count == 0)
    {
        return double.NaN;
    }

    // If the runner-up is as frequent as the leader, the mode is not unique.
    if (top.Count > 1 && top[0].Count == top[1].Count)
    {
        return double.NaN;
    }

    return top[0].Key;
}
/// <summary>
/// Check whether a number lies strictly within <paramref name="epsilon"/> of zero.
/// </summary>
/// <param name="number">The value to test.</param>
/// <param name="epsilon">The tolerance; the test is strict, so a magnitude equal to the tolerance does not pass.</param>
/// <returns>True if -epsilon &lt; number &lt; epsilon, false otherwise (including when either value is NaN).</returns>
public static bool AlmostEqualsToZero(this double number, double epsilon = 1e-5)
    => System.Math.Abs(number) < epsilon;
/// <summary>
/// Check whether two numbers differ by strictly less than <paramref name="epsilon"/>.
/// </summary>
/// <param name="number">The first value.</param>
/// <param name="other">The value to compare against.</param>
/// <param name="epsilon">The tolerance applied to the difference of the two values.</param>
/// <returns>True if the difference <c>number - other</c> is almost equal to zero.</returns>
public static bool AlmostEquals(this double number, double other, double epsilon = 1e-5)
    => (number - other).AlmostEqualsToZero(epsilon);
}
}

View File

@@ -2,9 +2,11 @@
{
using Content;
using Core;
using System;
using System.Collections.Generic;
using System.Linq;
/// <inheritdoc />
/// <summary>
/// Default Page Segmenter. All words are included in one block.
/// </summary>
@@ -15,15 +17,43 @@
/// </summary>
public static DefaultPageSegmenter Instance { get; } = new DefaultPageSegmenter();
/// <inheritdoc />
/// <summary>
/// Get the blocks.
/// Get the blocks using default options values.
/// </summary>
/// <param name="pageWords">The words in the page.</param>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
/// <param name="words">The page's words to generate text blocks for.</param>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{
if (pageWords.Count() == 0) return EmptyArray<TextBlock>.Instance;
return new List<TextBlock>() { new TextBlock(new XYLeaf(pageWords).GetLines()) };
return GetBlocks(words, new DefaultPageSegmenterOptions());
}
/// <summary>
/// Get the text blocks using the supplied options.
/// </summary>
/// <param name="words">The page's words to generate text blocks for.</param>
/// <param name="options">The <see cref="DefaultPageSegmenterOptions"/> to use.</param>
/// <returns><c>EmptyArray&lt;TextBlock&gt;.Instance</c> when <paramref name="words"/> is null or empty;
/// otherwise a list holding the single <see cref="TextBlock"/> generated by the default method.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="options"/> is not a
/// <see cref="DefaultPageSegmenterOptions"/> (this includes null).</exception>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
{
    // The options type is validated before the words are inspected, so an
    // unsupported options object is rejected even for an empty page.
    if (!(options is DefaultPageSegmenterOptions dOptions))
    {
        throw new ArgumentException("Options provided must be of type " + nameof(DefaultPageSegmenterOptions) + ".", nameof(options));
    }

    if (words?.Any() != true)
    {
        return EmptyArray<TextBlock>.Instance;
    }

    // Every word goes into one leaf, whose lines form the single block that is returned.
    var lines = new XYLeaf(words).GetLines(dOptions.WordSeparator);
    var block = new TextBlock(lines, dOptions.LineSeparator);
    return new List<TextBlock>() { block };
}
/// <summary>
/// Default page segmenter options.
/// <para>Declares no settings of its own; everything is inherited from <see cref="PageSegmenterOptions"/>.</para>
/// </summary>
public class DefaultPageSegmenterOptions : PageSegmenterOptions
{ }
}
}
}

View File

@@ -2,17 +2,17 @@
{
using Content;
using Core;
using Geometry;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
/// <inheritdoc />
/// <summary>
/// The Document Spectrum (Docstrum) algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
/// clustering of connected components extracted from the document.
/// The Document Spectrum (Docstrum) algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
/// clustering of connected components extracted from the document.
/// This implementation leverages bounding boxes and does not exactly replicate the original algorithm.
/// <para>See 'The document spectrum for page layout analysis.' by L. O'Gorman.</para>
/// </summary>
@@ -25,274 +25,499 @@
/// <inheritdoc />
/// <summary>
/// Get the blocks.
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
/// Get the blocks using default options values.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{
return GetBlocks(words, -1);
return GetBlocks(words, new DocstrumBoundingBoxesOptions());
}
/// <inheritdoc />
/// <summary>
/// Get the blocks.
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
/// Get the blocks using options values.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="options">The <see cref="DocstrumBoundingBoxesOptions"/> to use.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, int maxDegreeOfParallelism)
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
{
return GetBlocks(words, new AngleBounds(-30, 30), new AngleBounds(-135, -45), 1.3, maxDegreeOfParallelism);
}
/// <summary>
/// Get the blocks. See original paper for more information.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
/// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
/// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
/// text lines for blocking. Maximum distance will be this number times the between-line
/// distance found by the analysis.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
AngleBounds betweenLine, double betweenLineMultiplier)
{
return GetBlocks(words, withinLine, betweenLine, betweenLineMultiplier, -1);
}
/// <summary>
/// Get the blocks. See original paper for more information.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
/// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
/// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
/// text lines for blocking. Maximum distance will be this number times the between-line
/// distance found by the analysis.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
AngleBounds betweenLine, double betweenLineMultiplier, int maxDegreeOfParallelism)
{
if (words == null)
if (options is DocstrumBoundingBoxesOptions dbbOptions)
{
return EmptyArray<TextBlock>.Instance;
}
var wordsList = new List<Word>();
foreach (var word in words)
{
if (string.IsNullOrWhiteSpace(word.Text))
if (words?.Any() != true)
{
continue;
return EmptyArray<TextBlock>.Instance;
}
wordsList.Add(word);
return GetBlocks(words.ToList(),
dbbOptions.WithinLineBounds, dbbOptions.WithinLineMultiplier, dbbOptions.WithinLineBinSize,
dbbOptions.BetweenLineBounds, dbbOptions.BetweenLineMultiplier, dbbOptions.BetweenLineBinSize,
dbbOptions.AngularDifferenceBounds,
dbbOptions.Epsilon,
dbbOptions.WordSeparator, dbbOptions.LineSeparator,
dbbOptions.MaxDegreeOfParallelism);
}
else
{
throw new ArgumentException("Options provided must be of type " + nameof(DocstrumBoundingBoxesOptions) + ".", nameof(options));
}
}
if (wordsList.Count == 0)
/// <summary>
/// Get the blocks. See original paper for more information.
/// <para>Runs three steps in order: spacing estimation, text line determination and structural block determination.</para>
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s. Words whose text is null or white space are ignored.</param>
/// <param name="wlBounds">Angle bounds for words to be considered as neighbours on the same line.</param>
/// <param name="wlMultiplier">Multiplier that gives the maximum Euclidean distance between words for building lines.
/// Maximum distance will be this number times the within-line distance found by the analysis.</param>
/// <param name="wlBinSize">The bin size used when building the within-line distances distribution.</param>
/// <param name="blBounds">Angle bounds for words to be considered as neighbours on separate lines.</param>
/// <param name="blMultiplier">Multiplier that gives the maximum perpendicular distance between
/// text lines for blocking. Maximum distance will be this number times the between-line
/// distance found by the analysis.</param>
/// <param name="blBinSize">The bin size used when building the between-line distances distribution.</param>
/// <param name="angularDifferenceBounds">The angular difference bounds between two lines to be considered in the same block. This defines if two lines are parallel enough.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <param name="wordSeparator">Forwarded to the line-building step. Presumably the separator placed between the words of a line - TODO confirm against GetLines.</param>
/// <param name="lineSeparator">Forwarded to the block-building step. Presumably the separator placed between the lines of a block - TODO confirm against GetStructuralBlocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method, or an empty array when no non-white-space word remains.</returns>
private IReadOnlyList<TextBlock> GetBlocks(IReadOnlyList<Word> words,
AngleBounds wlBounds, double wlMultiplier, int wlBinSize,
AngleBounds blBounds, double blMultiplier, int blBinSize,
AngleBounds angularDifferenceBounds,
double epsilon,
string wordSeparator, string lineSeparator,
int maxDegreeOfParallelism)
{
// Filter out white spaces
words = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();
if (words.Count == 0)
{
return EmptyArray<TextBlock>.Instance;
}
// 1. Estimate within line and between line spacing
// The estimation reports failure when either distance is NaN; a NaN distance is
// replaced by 0 so that the maximum distances computed below are not NaN.
if (!GetSpacingEstimation(words, wlBounds, wlBinSize, blBounds, blBinSize,
maxDegreeOfParallelism,
out double withinLineDistance, out double betweenLineDistance))
{
if (double.IsNaN(withinLineDistance)) withinLineDistance = 0;
if (double.IsNaN(betweenLineDistance)) betweenLineDistance = 0;
}
// 2. Determination of Text Lines
double maxWithinLineDistance = wlMultiplier * withinLineDistance; //Math.Min(3 * withinLineDistance.Value, 1.4142 * betweenLineDistance.Value);
var lines = GetLines(words, maxWithinLineDistance, wlBounds, wordSeparator, maxDegreeOfParallelism).ToArray();
// 3. Structural Block Determination
double maxBetweenLineDistance = blMultiplier * betweenLineDistance;
return GetStructuralBlocks(lines, maxBetweenLineDistance, angularDifferenceBounds, epsilon, lineSeparator, maxDegreeOfParallelism).ToList();
}
#region Spacing Estimation
/// <summary>
/// Estimation of within-line and between-line spacing.
/// </summary>
/// <returns>False if either 'withinLineDistance' or 'betweenLineDistance' is NaN.</returns>
private static bool GetSpacingEstimation(IReadOnlyList<Word> words,
AngleBounds wlBounds, int wlBinSize,
AngleBounds blBounds, int blBinSize,
int maxDegreeOfParallelism,
out double withinLineDistance, out double betweenLineDistance)
{
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
var withinLineDistList = new ConcurrentBag<double>();
var betweenLineDistList = new ConcurrentBag<double>();
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
// 1. Estimate within line and between line spacing
KdTree<Word> kdTreeWL = new KdTree<Word>(wordsList, w => w.BoundingBox.BottomLeft);
KdTree<Word> kdTreeBL = new KdTree<Word>(wordsList, w => w.BoundingBox.TopLeft);
KdTree<Word> kdTreeBottomLeft = new KdTree<Word>(words, w => w.BoundingBox.BottomLeft);
Parallel.For(0, wordsList.Count, parallelOptions, i =>
Parallel.For(0, words.Count, parallelOptions, i =>
{
var word = wordsList[i];
var word = words[i];
// Within-line distance
var neighbourWL = kdTreeWL.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomRight, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 0.5));
foreach (var n in neighbourWL)
// 1.1.1 Find the 2 closest neighbours words to the candidate, using euclidean distance.
foreach (var n in kdTreeBottomLeft.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomRight, Distances.Euclidean))
{
if (withinLine.Contains(Distances.Angle(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft)))
// 1.1.2 Check if the neighbour word is within the angle of the candidate
if (wlBounds.Contains(AngleWL(word, n.Item1)))
{
withinLineDistList.Add(Distances.Horizontal(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft));
withinLineDistList.Add(Distances.Euclidean(word.BoundingBox.BottomRight, n.Item1.BoundingBox.BottomLeft));
}
}
// Between-line distance
var neighbourBL = kdTreeBL.FindNearestNeighbours(word, 2, w => w.BoundingBox.BottomLeft, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 50));
foreach (var n in neighbourBL)
// 1.2.1 Find the 2 closest neighbours words to the candidate, using euclidean distance.
foreach (var n in kdTreeBottomLeft.FindNearestNeighbours(word, 2, w => w.BoundingBox.TopLeft, Distances.Euclidean))
{
if (betweenLine.Contains(Distances.Angle(word.BoundingBox.Centroid, n.Item1.BoundingBox.Centroid)))
// 1.2.2 Check if the candidate words is within the angle
var angle = AngleBL(word, n.Item1);
if (blBounds.Contains(angle))
{
betweenLineDistList.Add(Distances.Vertical(word.BoundingBox.BottomLeft, n.Item1.BoundingBox.TopLeft));
// 1.2.3 Compute the vertical (between-line) distance between the candidate
// and the neighbour and add it to the between-line distances list
double hypotenuse = Distances.Euclidean(word.BoundingBox.Centroid, n.Item1.BoundingBox.Centroid);
// Angle is kept within [-90, 90]
if (angle > 90)
{
angle -= 180;
}
var dist = Math.Abs(hypotenuse * Math.Cos((90 - angle) * Math.PI / 180))
- word.BoundingBox.Height / 2.0 - n.Item1.BoundingBox.Height / 2.0;
// The perpendicular distance can be negative because of the subtractions.
// Could occur when words are overlapping, we ignore that.
if (dist >= 0) betweenLineDistList.Add(dist);
}
}
});
double? withinLineDistance = GetPeakAverageDistance(withinLineDistList);
double? betweenLineDistance = GetPeakAverageDistance(betweenLineDistList);
// Compute average peak value of distribution
double? withinLinePeak = GetPeakAverageDistance(withinLineDistList, wlBinSize);
double? betweenLinePeak = GetPeakAverageDistance(betweenLineDistList, blBinSize);
if (!withinLineDistance.HasValue || !betweenLineDistance.HasValue)
{
return new[] { new TextBlock(new[] { new TextLine(wordsList) }) };
}
withinLineDistance = withinLinePeak ?? double.NaN;
betweenLineDistance = betweenLinePeak ?? double.NaN;
// 2. Find lines of text
double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine, maxDegreeOfParallelism).ToArray();
// 3. Find blocks of text
double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
var blocks = GetLinesGroups(lines, maxDistanceBetweenLine, maxDegreeOfParallelism).ToList();
// 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
for (var b = 0; b < blocks.Count; b++)
{
if (blocks[b] == null)
{
continue;
}
// Merge all lines (words)
blocks[b] = new TextBlock(GetLines(blocks[b].TextLines.SelectMany(l => l.Words).ToList(),
double.MaxValue, withinLine, maxDegreeOfParallelism).ToList());
for (var c = 0; c < blocks.Count; c++)
{
if (b == c || blocks[c] == null)
{
continue;
}
if (blocks[b].BoundingBox.IntersectsWith(blocks[c].BoundingBox))
{
// Merge
// 1. Merge all words
var mergedWords = new List<Word>(blocks[b].TextLines.SelectMany(l => l.Words));
mergedWords.AddRange(blocks[c].TextLines.SelectMany(l => l.Words));
// 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
// same block. Filtering will still be done based on angle.
// Merge all lines (words) sharing same bottom (baseline)
var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList();
blocks[b] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList());
// Remove
blocks[c] = null;
}
}
}
return blocks.Where(b => b != null).ToList();
}
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism)
{
TextDirection textDirection = words[0].TextDirection;
var groupedIndexes = Clustering.NearestNeighbours(words, 2, Distances.Euclidean,
(pivot, candidate) => maxDist,
pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
pivot => true,
(pivot, candidate) => withinLine.Contains(Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft)),
maxDegreeOfParallelism).ToList();
Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
if (textDirection == TextDirection.Rotate180)
{
orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
}
else if (textDirection == TextDirection.Rotate90)
{
orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
}
else if (textDirection == TextDirection.Rotate270)
{
orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
}
for (var a = 0; a < groupedIndexes.Count; a++)
{
yield return new TextLine(orderFunc(groupedIndexes[a].Select(i => words[i])));
}
}
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism)
{
/**************************************************************************************************
* We want to measure the distance between two lines using the following method:
* We check if two lines are overlapping horizontally.
* If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area.
* We finally compute the Euclidean distance between these two middle points.
* If the two lines are not overlapping, the distance is set to the max distance.
**************************************************************************************************/
double euclidianOverlappingMiddleDistance(PdfLine l1, PdfLine l2)
{
var left = Math.Max(l1.Point1.X, l2.Point1.X);
var d = (Math.Min(l1.Point2.X, l2.Point2.X) - left);
if (d < 0) return double.MaxValue; // not overlapping -> max distance
return Distances.Euclidean(
new PdfPoint(left + d / 2, l1.Point1.Y),
new PdfPoint(left + d / 2, l2.Point1.Y));
}
var groupedIndexes = Clustering.NearestNeighbours(lines,
euclidianOverlappingMiddleDistance,
(pivot, candidate) => maxDist,
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
pivot => true, (pivot, candidate) => true,
maxDegreeOfParallelism).ToList();
for (int a = 0; a < groupedIndexes.Count; a++)
{
yield return new TextBlock(groupedIndexes[a].Select(i => lines[i]).ToList());
}
return withinLinePeak.HasValue && betweenLinePeak.HasValue;
}
/// <summary>
/// Get the average distance value of the peak bucket of the histogram.
/// </summary>
/// <param name="distances">The set of distances to average.</param>
private static double? GetPeakAverageDistance(IEnumerable<double> distances)
/// <param name="binLength"></param>
private static double? GetPeakAverageDistance(IEnumerable<double> distances, int binLength = 1)
{
var buckets = new Dictionary<int, List<double>>();
if (!distances.Any())
{
return null;
}
if (binLength <= 0)
{
throw new ArgumentException("DocstrumBoundingBoxes: the bin length must be positive when commputing peak average distance.", nameof(binLength));
}
var max = (int)Math.Ceiling(distances.Max());
if (max == 0)
{
max = binLength;
}
else
{
binLength = binLength > max ? max : binLength;
}
var bins = Enumerable.Range(0, (int)Math.Ceiling(max / (double)binLength) + 1)
.Select(x => x * binLength)
.ToDictionary(x => x, _ => new List<double>());
foreach (var distance in distances)
{
var floor = (int)distance;
if (buckets.ContainsKey(floor))
int bin = (int)Math.Floor(distance / binLength);
if (bin < 0)
{
buckets[floor].Add(distance);
}
else
{
buckets[floor] = new List<double> { distance };
throw new ArgumentOutOfRangeException(nameof(bin), "DocstrumBoundingBoxes: Negative distance found while commputing peak average distance.");
}
bins[bins.Keys.ElementAt(bin)].Add(distance);
}
var best = default(List<double>);
foreach (var bucket in buckets)
foreach (var bin in bins)
{
if (best == null || bucket.Value.Count > best.Count)
if (best == null || bin.Value.Count > best.Count)
{
best = bucket.Value;
best = bin.Value;
}
}
return best?.Average();
}
#endregion
#region Text Lines
/// <summary>
/// Group the words into <see cref="TextLine"/>s by clustering each word's bottom right point
/// with the bottom left point of candidate words.
/// </summary>
/// <param name="words">The words to group into lines.</param>
/// <param name="maxWLDistance">The maximum distance allowed between two paired words.</param>
/// <param name="withinLine">Bounds the within-line angle between two words must respect to be paired.</param>
/// <param name="wordSeparator">Separator handed to each <see cref="TextLine"/>.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.</param>
private static IEnumerable<TextLine> GetLines(IReadOnlyList<Word> words, double maxWLDistance, AngleBounds withinLine,
    string wordSeparator, int maxDegreeOfParallelism)
{
    var lineClusters = Clustering.NearestNeighbours(words,
        2,
        Distances.Euclidean,
        (left, right) => maxWLDistance,
        left => left.BoundingBox.BottomRight,
        right => right.BoundingBox.BottomLeft,
        left => true,
        (left, right) => withinLine.Contains(AngleWL(left, right)),
        maxDegreeOfParallelism).ToList();

    for (int c = 0; c < lineClusters.Count; c++)
    {
        // Words of a cluster are put in reading order before building the line.
        yield return new TextLine(lineClusters[c].OrderByReadingOrder(), wordSeparator);
    }
}
/// <summary>
/// Within-line angle helper: angle from the pivot's bottom right point to the candidate's
/// bottom left point, expressed relative to the pivot's rotation.
/// <para>-90 ≤ θ ≤ 90.</para>
/// </summary>
private static double AngleWL(Word pivot, Word candidate)
{
    var cornerAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
    var relative = Distances.BoundAngle180(cornerAngle - pivot.BoundingBox.Rotation);

    // Fold the angle back into [-90;90] degree so that overlapping words are handled
    if (relative > 90)
    {
        return relative - 180;
    }

    if (relative < -90)
    {
        return relative + 180;
    }

    return relative;
}
#endregion
#region Blocking
/// <summary>
/// Group the text lines into <see cref="TextBlock"/>s.
/// </summary>
/// <param name="lines">The lines to group into blocks.</param>
/// <param name="maxBLDistance">The maximum distance allowed between two paired lines.</param>
/// <param name="angularDifference">The angular difference bounds between two lines to be considered in the same block.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <param name="lineSeparator">Separator handed to each <see cref="TextBlock"/>.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.</param>
private static IEnumerable<TextBlock> GetStructuralBlocks(IReadOnlyList<TextLine> lines,
    double maxBLDistance, AngleBounds angularDifference, double epsilon, string lineSeparator, int maxDegreeOfParallelism)
{
    /******************************************************************************************************
     * Distance between two lines:
     *  - the bottom edge of the pivot line is compared with the top edge of the candidate line;
     *  - the distance is the perpendicular distance when both edges overlap and their angle is
     *    within 'angularDifference';
     *  - otherwise (not overlapping, or angle too wide) the distance is infinity.
     *
     * If two text lines are approximately parallel, close in perpendicular distance, and they either
     * overlap to some specified degree or are separated by only a small distance in parallel distance,
     * then they are said to meet the criteria to belong to the same structural block.
     ******************************************************************************************************/
    var blockClusters = Clustering.NearestNeighbours(
        lines,
        (bottomEdge, topEdge) => PerpendicularOverlappingDistance(bottomEdge, topEdge, angularDifference, epsilon),
        (upper, lower) => maxBLDistance,
        upper => new PdfLine(upper.BoundingBox.BottomLeft, upper.BoundingBox.BottomRight),
        lower => new PdfLine(lower.BoundingBox.TopLeft, lower.BoundingBox.TopRight),
        upper => true,
        (upper, lower) => true,
        maxDegreeOfParallelism).ToList();

    for (int c = 0; c < blockClusters.Count; c++)
    {
        // Lines of a cluster are put in reading order before building the block.
        yield return new TextBlock(blockClusters[c].OrderByReadingOrder(), lineSeparator);
    }
}
/// <summary>
/// Perpendicular overlapping distance.
/// <para>Returns the absolute perpendicular distance between the two lines when they overlap and
/// their angular difference, folded into [-90;90], is within the bounds.
/// Returns positive infinity otherwise.</para>
/// </summary>
/// <param name="line1">The first line.</param>
/// <param name="line2">The second line.</param>
/// <param name="angularDifferenceBounds">The bounds the angle between the two lines must respect.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
private static double PerpendicularOverlappingDistance(PdfLine line1, PdfLine line2, AngleBounds angularDifferenceBounds, double epsilon)
{
    bool overlapped = GetStructuralBlockingParameters(line1, line2, epsilon,
        out double angle, out _, out double signedDistance);

    if (!overlapped)
    {
        // nonoverlapped
        return double.PositiveInfinity;
    }

    // Fold the angle into [-90;90]
    if (angle > 90)
    {
        angle -= 180;
    }
    else if (angle < -90)
    {
        angle += 180;
    }

    // Lines that are not parallel enough are excluded
    return angularDifferenceBounds.Contains(angle)
        ? Math.Abs(signedDistance)
        : double.PositiveInfinity;
}
/// <summary>
/// Get the structural blocking parameters of two lines: their angular difference,
/// the normalised overlap of i onto j and the signed perpendicular distance.
/// <para>The end points of i are translated onto j (Aj and Bj). The overlap is the segment [Cj, Dj]
/// made of the two middle points once both ends of j, Aj and Bj are sorted, and the
/// perpendicular distance is measured from the middle of [Cj, Dj] to i.</para>
/// </summary>
/// <param name="i">The line whose end points are translated onto j.</param>
/// <param name="j">The line receiving the translated points of i.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <param name="angularDifference">The angle between the 2 lines.<para>-180 ≤ θ ≤ 180</para></param>
/// <param name="normalisedOverlap">Overlap of segment i onto j. Positive value if overlapped, negative value if nonoverlapped.
/// NaN if the points of i could not be translated onto j.<para>[-1, 1]?</para></param>
/// <param name="perpendicularDistance">Signed perpendicular distance. NaN if the points of i could not be translated onto j.</param>
/// <returns>Return true if overlapped, false if nonoverlapped or if the points of i could not be translated onto j.</returns>
public static bool GetStructuralBlockingParameters(PdfLine i, PdfLine j, double epsilon,
out double angularDifference, out double normalisedOverlap, out double perpendicularDistance)
{
// Shortcut: (almost) identical lines fully overlap, with no angle nor distance between them
if (AlmostEquals(i, j, epsilon))
{
angularDifference = 0;
normalisedOverlap = 1;
perpendicularDistance = 0;
return true;
}
// Direction vectors of both lines
double dXi = i.Point2.X - i.Point1.X;
double dYi = i.Point2.Y - i.Point1.Y;
double dXj = j.Point2.X - j.Point1.X;
double dYj = j.Point2.Y - j.Point1.Y;
// Difference of the two direction angles, converted from radians to degrees
angularDifference = Distances.BoundAngle180((Math.Atan2(dYj, dXj) - Math.Atan2(dYi, dXi)) * 180 / Math.PI);
// Translate both end points of i onto j
PdfPoint? Aj = GetTranslatedPoint(i.Point1.X, i.Point1.Y, j.Point1.X, j.Point1.Y, dXi, dYi, dXj, dYj, epsilon);
PdfPoint? Bj = GetTranslatedPoint(i.Point2.X, i.Point2.Y, j.Point2.X, j.Point2.Y, dXi, dYi, dXj, dYj, epsilon);
if (!Aj.HasValue || !Bj.HasValue)
{
// Might happen because lines are perpendicular
// or have too small lengths
normalisedOverlap = double.NaN;
perpendicularDistance = double.NaN;
return false;
}
// Get middle points
// The 4 points are sorted along j (by X then Y, or by Y only when j has no X extent);
// the 2 points in the middle, Cj and Dj, delimit the candidate overlapping segment.
var ps = new[] { j.Point1, j.Point2, Aj.Value, Bj.Value };
if (dXj != 0)
{
ps = ps.OrderBy(p => p.X).ThenBy(p => p.Y).ToArray();
}
else if (dYj != 0)
{
ps = ps.OrderBy(p => p.Y).ToArray();
}
PdfPoint Cj = ps[1];
PdfPoint Dj = ps[2];
bool overlap = true;
// Cj and Dj should be contained within both j and [Aj,Bj] if overlapped
if (!PointInLine(j.Point1, j.Point2, Cj) || !PointInLine(j.Point1, j.Point2, Dj) ||
!PointInLine(Aj.Value, Bj.Value, Cj) || !PointInLine(Aj.Value, Bj.Value, Dj))
{
// nonoverlapped
overlap = false;
}
//double pj = Math.Sqrt((Dj.Y - Cj.Y) * (Dj.Y - Cj.Y) + (Dj.X - Cj.X) * (Dj.X - Cj.X));
// Length of [Cj, Dj], relative to the length of j and negated when the lines do not overlap
double pj = Distances.Euclidean(Cj, Dj);
normalisedOverlap = (overlap ? pj : -pj) / j.Length;
// Middle point of [Cj, Dj]
double xMj = (Cj.X + Dj.X) / 2.0;
double yMj = (Cj.Y + Dj.Y) / 2.0;
// Signed distance from the middle point to i: general case first,
// then the two cases where i has (almost) no X extent or (almost) no Y extent.
if (!dXi.AlmostEqualsToZero(epsilon) && !dYi.AlmostEqualsToZero(epsilon))
{
perpendicularDistance = ((xMj - i.Point1.X) - (yMj - i.Point1.Y) * dXi / dYi) / Math.Sqrt(dXi * dXi / (dYi * dYi) + 1);
}
else if (dXi.AlmostEqualsToZero(epsilon))
{
perpendicularDistance = xMj - i.Point1.X;
}
else
{
perpendicularDistance = yMj - i.Point1.Y;
}
return overlap;
}
/// <summary>
/// Translate a point Pi of line i onto line j: the result is the point of the line carrying j
/// that is reached from Pi by moving perpendicularly to i.
/// </summary>
/// <param name="xPi">X coordinate of the point of i to translate.</param>
/// <param name="yPi">Y coordinate of the point of i to translate.</param>
/// <param name="xPj">X coordinate of a point of j.</param>
/// <param name="yPj">Y coordinate of a point of j.</param>
/// <param name="dXi">X component of the direction of i.</param>
/// <param name="dYi">Y component of the direction of i.</param>
/// <param name="dXj">X component of the direction of j.</param>
/// <param name="dYj">Y component of the direction of j.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
/// <returns>The translated point, or null when the dot product of both directions is (almost) 0.</returns>
private static PdfPoint? GetTranslatedPoint(double xPi, double yPi, double xPj, double yPj, double dXi, double dYi, double dXj, double dYj, double epsilon)
{
    double dYidYj = dYi * dYj;
    double dXidXj = dXi * dXj;

    // Dot product of the two direction vectors
    double denominator = dYidYj + dXidXj;

    if (denominator.AlmostEqualsToZero(epsilon))
    {
        // The denominator is 0 when translating points, meaning the lines are perpendicular.
        return null;
    }

    double xTj = (xPi * dXidXj + xPj * dYidYj + dXj * dYi * (yPi - yPj)) / denominator;
    double yTj = yPj; // TODO: need to check that (still used when j has almost no X extent)

    // The slope of j is usable as soon as dXj is not (almost) zero, whatever its sign.
    // Testing 'dXj > epsilon' ignored lines running from right to left (negative dXj)
    // and returned a point that does not belong to j when such a line is slanted.
    if (Math.Abs(dXj) > epsilon)
    {
        yTj = dYj / dXj * (xTj - xPj) + yPj;
    }

    return new PdfPoint(xTj, yTj);
}
/// <summary>
/// Helper function to check if the point lies between the two end points of the line.
/// </summary>
/// <param name="pl1">Line's first point.</param>
/// <param name="pl2">Line's second point.</param>
/// <param name="point">The point to check.</param>
private static bool PointInLine(PdfPoint pl1, PdfPoint pl2, PdfPoint point)
{
    // /!\ The three points are assumed to be aligned (be careful)
    double toPointX = point.X - pl1.X;
    double toPointY = point.Y - pl1.Y;
    double segmentX = pl2.X - pl1.X;
    double segmentY = pl2.Y - pl1.Y;

    // Projection of the point on the segment: negative means before the first point...
    double projection = toPointX * segmentX + toPointY * segmentY;

    // ...and greater than the squared length means after the second point.
    return projection >= 0 && projection <= segmentX * segmentX + segmentY * segmentY;
}
/// <summary>
/// Helper function to check if 2 lines are equal, i.e. if the coordinates of their
/// first points and of their second points are equal within the given precision.
/// </summary>
/// <param name="line1">The first line.</param>
/// <param name="line2">The second line.</param>
/// <param name="epsilon">Precision when testing equalities.</param>
private static bool AlmostEquals(PdfLine line1, PdfLine line2, double epsilon)
{
    if (!(line1.Point1.X - line2.Point1.X).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    if (!(line1.Point1.Y - line2.Point1.Y).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    if (!(line1.Point2.X - line2.Point2.X).AlmostEqualsToZero(epsilon))
    {
        return false;
    }

    return (line1.Point2.Y - line2.Point2.Y).AlmostEqualsToZero(epsilon);
}
/// <summary>
/// Between-line angle helper: angle from the pivot's centroid to the candidate's centroid,
/// expressed relative to the pivot's rotation.
/// <para>0 ≤ θ ≤ 180.</para>
/// </summary>
private static double AngleBL(Word pivot, Word candidate)
{
    var centroidAngle = Distances.Angle(pivot.BoundingBox.Centroid, candidate.BoundingBox.Centroid);
    var relative = Distances.BoundAngle180(centroidAngle - pivot.BoundingBox.Rotation);

    // Negative angles are shifted by 180 so the check is done within [0, 180]
    return relative < 0 ? relative + 180 : relative;
}
#endregion
/// <summary>
/// The bounds for the angle between two words for them to have a certain type of relationship.
@@ -314,6 +539,11 @@
/// </summary>
public AngleBounds(double lowerBound, double upperBound)
{
if (lowerBound >= upperBound)
{
throw new ArgumentException("The lower bound should be smaller than the upper bound.");
}
Lower = lowerBound;
Upper = upperBound;
}
@@ -326,5 +556,65 @@
return angle >= Lower && angle <= Upper;
}
}
/// <summary>
/// Docstrum bounding boxes page segmenter options.
/// <para>The word and line separators are inherited from <see cref="PageSegmenterOptions"/>.</para>
/// </summary>
public class DocstrumBoundingBoxesOptions : PageSegmenterOptions
{
/// <summary>
/// Precision when testing equalities.
/// <para>Default value is 1e-3.</para>
/// </summary>
public double Epsilon { get; set; } = 1e-3;
/// <summary>
/// Angle bounds for words to be considered as neighbours on the same line.
/// <para>Default value is -30 ≤ θ ≤ 30.</para>
/// </summary>
public AngleBounds WithinLineBounds { get; set; } = new AngleBounds(-30, 30);
/// <summary>
/// Multiplier that gives the maximum Euclidean distance between
/// words for building lines. Maximum distance will be this number times the within-line
/// distance found by the analysis.
/// <para>Default value is 3.</para>
/// </summary>
public double WithinLineMultiplier { get; set; } = 3.0;
/// <summary>
/// The bin size used when building the within-line distances distribution.
/// The value must be positive.
/// <para>Default value is 10.</para>
/// </summary>
public int WithinLineBinSize { get; set; } = 10;
/// <summary>
/// Angle bounds for words to be considered as neighbours on separate lines.
/// <para>Default value is 45 ≤ θ ≤ 135.</para>
/// </summary>
public AngleBounds BetweenLineBounds { get; set; } = new AngleBounds(45, 135);
/// <summary>
/// Multiplier that gives the maximum perpendicular distance between
/// text lines for blocking. Maximum distance will be this number times the between-line
/// distance found by the analysis.
/// <para>Default value is 1.3.</para>
/// </summary>
public double BetweenLineMultiplier { get; set; } = 1.3;
/// <summary>
/// The bin size used when building the between-line distances distribution.
/// The value must be positive.
/// <para>Default value is 10.</para>
/// </summary>
public int BetweenLineBinSize { get; set; } = 10;
/// <summary>
/// The angular difference bounds between two lines to be considered in the same block.
/// This defines if two lines are parallel enough.
/// <para>Default value is -30 ≤ θ ≤ 30.</para>
/// </summary>
public AngleBounds AngularDifferenceBounds { get; set; } = new AngleBounds(-30, 30);
}
}
}
}

View File

@@ -10,10 +10,18 @@
public interface IPageSegmenter
{
/// <summary>
/// Get the text blocks.
/// Get the blocks using default options values.
/// </summary>
/// <param name="pageWords">The words to generate text blocks for.</param>
/// <param name="words">The page's words to generate text blocks for.</param>
/// <returns>A list of text blocks from this approach.</returns>
IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords);
IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words);
/// <summary>
/// Get the text blocks using options.
/// </summary>
/// <param name="words">The page's words to generate text blocks for.</param>
/// <param name="options"></param>
/// <returns>A list of text blocks from this approach.</returns>
IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options);
}
}

View File

@@ -0,0 +1,20 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
{
/// <summary>
/// Abstract page segmenter options. Holds the separators shared by the page segmenters' options.
/// </summary>
public abstract class PageSegmenterOptions : DlaOptions
{
/// <summary>
/// Separator used between words when building lines.
/// <para>Default value is ' ' (space).</para>
/// </summary>
public string WordSeparator { get; set; } = " ";
/// <summary>
/// Separator used between lines when building paragraphs.
/// <para>Default value is '\n' (new line).</para>
/// </summary>
public string LineSeparator { get; set; } = "\n";
}
}

View File

@@ -7,8 +7,9 @@
using System.Linq;
using UglyToad.PdfPig.Geometry;
/// <inheritdoc />
/// <summary>
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
/// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
@@ -20,81 +21,73 @@
/// </summary>
public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();
/// <inheritdoc />
/// <summary>
/// Get the blocks.
/// <para>Uses 'minimumWidth' = 0, 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)</para>
/// Get the blocks using default options values.
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <returns></returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{
return GetBlocks(pageWords, 0);
return GetBlocks(words, new RecursiveXYCutOptions());
}
/// <inheritdoc />
/// <summary>
/// Get the blocks.
/// <para>Uses 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)</para>
/// Get the blocks using options values.
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="minimumWidth">The minimum width for a block.</param>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth)
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="options">The <see cref="RecursiveXYCutOptions"/> to use.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
{
return GetBlocks(pageWords, minimumWidth,
(letters) =>
if (options is RecursiveXYCutOptions ryxcOptions)
{
if (words?.Any() != true)
{
var widths = letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3)));
var mode = widths.Mode();
if (double.IsNaN(mode) || mode == 0)
{
mode = widths.Average();
}
return mode;
},
(letters) =>
{
var heights = letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3));
var mode = heights.Mode();
if (double.IsNaN(mode) || mode == 0)
{
mode = heights.Average();
}
return mode * 1.5;
return EmptyArray<TextBlock>.Instance;
}
);
return GetBlocks(words,
ryxcOptions.MinimumWidth,
ryxcOptions.DominantFontWidthFunc,
ryxcOptions.DominantFontHeightFunc,
ryxcOptions.WordSeparator,
ryxcOptions.LineSeparator);
}
else
{
throw new ArgumentException("Options provided must be of type " + nameof(RecursiveXYCutOptions) + ".", nameof(options));
}
}
/// <summary>
/// Get the blocks.
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="minimumWidth">The minimum width for a block.</param>
/// <param name="dominantFontWidth">The dominant font width.</param>
/// <param name="dominantFontHeight">The dominant font height.</param>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth,
double dominantFontWidth, double dominantFontHeight)
{
return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight);
}
/// <summary>
/// Get the blocks.
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="words">The words in the page.</param>
/// <param name="minimumWidth">The minimum width for a block.</param>
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth,
/// <param name="wordSeparator"></param>
/// <param name="lineSeparator"></param>
private IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, double minimumWidth,
Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
Func<IEnumerable<Letter>, double> dominantFontHeightFunc)
Func<IEnumerable<Letter>, double> dominantFontHeightFunc,
string wordSeparator, string lineSeparator)
{
if (pageWords.Count() == 0) return EmptyArray<TextBlock>.Instance;
// Filter out white spaces
words = words.Where(w => !string.IsNullOrWhiteSpace(w.Text));
if (!words.Any())
{
return EmptyArray<TextBlock>.Instance;
}
XYLeaf root = new XYLeaf(pageWords); // Create a root node.
XYLeaf root = new XYLeaf(words); // Create a root node.
XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
if (node.IsLeaf)
{
return new List<TextBlock> { new TextBlock((node as XYLeaf).GetLines()) };
return new List<TextBlock> { new TextBlock((node as XYLeaf).GetLines(wordSeparator), lineSeparator) };
}
else
{
@@ -102,7 +95,7 @@
if (leaves.Count > 0)
{
return leaves.Select(l => new TextBlock(l.GetLines())).ToList();
return leaves.Select(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator)).ToList();
}
}
@@ -114,7 +107,7 @@
Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
{
// Order words left to right
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();
var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();
if (words.Length == 0)
{
@@ -123,7 +116,7 @@
// Create new leaf with non-whitespace words.
leaf = new XYLeaf(words);
if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
{
// We stop cutting if
@@ -195,7 +188,7 @@
}
if (i == wordsCount - 1) projectionProfile.Add(currentProjection);
}
var newLeavesEnums = projectionProfile.Select(p => leaf.Words.Where(w =>
{
// Get words that are contained in each projection profiles
@@ -203,7 +196,7 @@
return normalisedBB.Left >= p.LowerBound && normalisedBB.Right <= p.UpperBound;
}));
var newLeaves = newLeavesEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e));
var newLeaves = newLeavesEnums.Where(e => e.Any()).Select(e => new XYLeaf(e));
var newNodes = newLeaves.Select(l => HorizontalCut(l, minimumWidth,
dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
@@ -221,7 +214,7 @@
Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
{
// Order words bottom to top
var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();
var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();
if (words.Length == 0)
{
@@ -303,7 +296,7 @@
return normalisedBB.Bottom >= p.LowerBound && normalisedBB.Top <= p.UpperBound;
}));
var newLeaves = newLeavesEnums.Where(e => e.Count() > 0).Select(e => new XYLeaf(e));
var newLeaves = newLeavesEnums.Where(e => e.Any()).Select(e => new XYLeaf(e));
var newNodes = newLeaves.Select(l => VerticalCut(l, minimumWidth,
dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();
@@ -335,5 +328,51 @@
return value >= LowerBound && value <= UpperBound;
}
}
/// <summary>
/// Recursive X-Y cut page segmenter options.
/// </summary>
public class RecursiveXYCutOptions : PageSegmenterOptions
{
    /// <summary>
    /// The minimum width for a block.
    /// <para>Default value is 1.</para>
    /// </summary>
    public double MinimumWidth { get; set; } = 1;

    /// <summary>
    /// The function that determines the dominant font width.
    /// <para>Default value is the mode of the block's letters width, where a letter's width is the
    /// largest of its width and its glyph rectangle's width, both rounded to 3 decimals.
    /// If the mode is not available, the average is used.</para>
    /// </summary>
    public Func<IEnumerable<Letter>, double> DominantFontWidthFunc { get; set; } =
        (letters) => ModeOrAverage(
            letters.Select(x => Math.Max(Math.Round(x.Width, 3), Math.Round(x.GlyphRectangle.Width, 3))));

    /// <summary>
    /// The function that determines the dominant font height.
    /// <para>Default value is the mode of the block's letters height (glyph rectangle's height
    /// rounded to 3 decimals) times 1.5.
    /// If the mode is not available, the average is used.</para>
    /// </summary>
    public Func<IEnumerable<Letter>, double> DominantFontHeightFunc { get; set; } =
        (letters) => ModeOrAverage(letters.Select(x => Math.Round(x.GlyphRectangle.Height, 3))) * 1.5;

    /// <summary>
    /// Get the mode of the values, or their average when the mode is NaN or 0.
    /// </summary>
    private static double ModeOrAverage(IEnumerable<double> values)
    {
        // The lazily projected values are needed up to twice (mode, then average):
        // materialise them once so the source is enumerated a single time.
        IEnumerable<double> materialised = values.ToList();

        var mode = materialised.Mode();
        if (double.IsNaN(mode) || mode == 0)
        {
            mode = materialised.Average();
        }

        return mode;
    }
}
}
}
}

View File

@@ -5,6 +5,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
using UglyToad.PdfPig.Geometry;
/// <summary>
@@ -38,20 +39,19 @@
/// <summary>
/// Gets the lines of the leaf.
/// </summary>
public IReadOnlyList<TextLine> GetLines()
public IReadOnlyList<TextLine> GetLines(string wordSeparator)
{
return Words.GroupBy(x => x.BoundingBox.Bottom).OrderByDescending(x => x.Key)
.Select(x => new TextLine(x.ToList())).ToArray();
return Words.GroupBy(x => x.BoundingBox.Bottom)
.Select(x => new TextLine(x.OrderByReadingOrder(), wordSeparator))
.OrderByReadingOrder();
}
/// <summary>
/// Create a new <see cref="XYLeaf"/>.
/// </summary>
/// <param name="words">The words contained in the leaf.</param>
public XYLeaf(params Word[] words) : this(words == null ? null : words.ToList())
{
}
public XYLeaf(params Word[] words) : this(words?.ToList())
{ }
/// <summary>
/// Create a new <see cref="XYLeaf"/>.
@@ -73,4 +73,4 @@
Words = words.ToArray();
}
}
}
}

View File

@@ -0,0 +1,163 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
{
using System;
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Content;
/// <summary>
/// Helper functions for words and lines ordering.
/// </summary>
public static class ReadingOrderHelper
{
    /// <summary>
    /// Order words by reading order in a line.
    /// <para>Assumes LtR and accounts for rotation.</para>
    /// </summary>
    /// <param name="words">The words to order. The sequence is enumerated exactly once.</param>
    /// <returns>A new list containing the words sorted by reading order.</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="words"/> is null.</exception>
    /// <exception cref="NotFiniteNumberException">If the average bounding box rotation of the words is NaN.</exception>
    /// <exception cref="ArgumentException">If the average bounding box rotation is outside the (-180, 180] range.</exception>
    public static List<Word> OrderByReadingOrder(this IEnumerable<Word> words)
    {
        if (words == null)
        {
            throw new ArgumentNullException(nameof(words));
        }

        // Materialise once: the input may be a deferred (and possibly expensive) query
        // and is needed for counting, orientation detection and sorting.
        var wordList = words.ToList();
        if (wordList.Count <= 1)
        {
            return wordList;
        }

        // The common orientation is only kept if every word shares it.
        var orientation = wordList[0].TextOrientation;
        if (orientation != TextOrientation.Other && wordList.Any(w => w.TextOrientation != orientation))
        {
            orientation = TextOrientation.Other;
        }

        switch (orientation)
        {
            case TextOrientation.Horizontal:
                return wordList.OrderBy(w => w.BoundingBox.BottomLeft.X).ToList();

            case TextOrientation.Rotate180:
                return wordList.OrderByDescending(w => w.BoundingBox.BottomLeft.X).ToList();

            case TextOrientation.Rotate90:
                return wordList.OrderByDescending(w => w.BoundingBox.BottomLeft.Y).ToList();

            case TextOrientation.Rotate270:
                return wordList.OrderBy(w => w.BoundingBox.BottomLeft.Y).ToList();

            case TextOrientation.Other:
            default:
                // We consider the words roughly have the same rotation.
                // NOTE(review): an arithmetic mean does not handle wrap-around (e.g. 179 and -179
                // average to 0) - confirm whether inputs can straddle the +/-180 boundary.
                var avgAngle = wordList.Average(w => w.BoundingBox.Rotation);
                if (double.IsNaN(avgAngle))
                {
                    throw new NotFiniteNumberException("OrderByReadingOrder: NaN bounding box rotation found when ordering words.", avgAngle);
                }

                if (0 < avgAngle && avgAngle <= 90)
                {
                    // quadrant 1, 0 < angle <= 90
                    return wordList.OrderBy(w => w.BoundingBox.BottomLeft.X).ThenBy(w => w.BoundingBox.BottomLeft.Y).ToList();
                }
                else if (90 < avgAngle && avgAngle <= 180)
                {
                    // quadrant 2, 90 < angle <= 180
                    return wordList.OrderByDescending(w => w.BoundingBox.BottomLeft.X).ThenBy(w => w.BoundingBox.BottomLeft.Y).ToList();
                }
                else if (-180 < avgAngle && avgAngle <= -90)
                {
                    // quadrant 3, -180 < angle <= -90
                    return wordList.OrderByDescending(w => w.BoundingBox.BottomLeft.X).ThenByDescending(w => w.BoundingBox.BottomLeft.Y).ToList();
                }
                else if (-90 < avgAngle && avgAngle <= 0)
                {
                    // quadrant 4, -90 < angle <= 0
                    return wordList.OrderBy(w => w.BoundingBox.BottomLeft.X).ThenByDescending(w => w.BoundingBox.BottomLeft.Y).ToList();
                }
                else
                {
                    throw new ArgumentException("OrderByReadingOrder: unknown bounding box rotation found when ordering words.", nameof(avgAngle));
                }
        }
    }

    /// <summary>
    /// Order lines by reading order in a block.
    /// <para>Assumes TtB and accounts for rotation.</para>
    /// </summary>
    /// <param name="lines">The lines to order. The sequence is enumerated exactly once.</param>
    /// <returns>A new list containing the lines sorted by reading order.</returns>
    /// <exception cref="ArgumentNullException">If <paramref name="lines"/> is null.</exception>
    /// <exception cref="NotFiniteNumberException">If the average bounding box rotation of the lines is NaN.</exception>
    /// <exception cref="ArgumentException">If the average bounding box rotation is outside the (-180, 180] range.</exception>
    public static IReadOnlyList<TextLine> OrderByReadingOrder(this IEnumerable<TextLine> lines)
    {
        if (lines == null)
        {
            throw new ArgumentNullException(nameof(lines));
        }

        // Materialise once: callers may pass a deferred projection that builds a new
        // TextLine per element, which would otherwise be re-created on every pass.
        var lineList = lines.ToList();
        if (lineList.Count <= 1)
        {
            return lineList;
        }

        // The common orientation is only kept if every line shares it.
        var orientation = lineList[0].TextOrientation;
        if (orientation != TextOrientation.Other && lineList.Any(l => l.TextOrientation != orientation))
        {
            orientation = TextOrientation.Other;
        }

        switch (orientation)
        {
            case TextOrientation.Horizontal:
                return lineList.OrderByDescending(w => w.BoundingBox.BottomLeft.Y).ToList();

            case TextOrientation.Rotate180:
                return lineList.OrderBy(w => w.BoundingBox.BottomLeft.Y).ToList();

            case TextOrientation.Rotate90:
                return lineList.OrderByDescending(w => w.BoundingBox.BottomLeft.X).ToList();

            case TextOrientation.Rotate270:
                return lineList.OrderBy(w => w.BoundingBox.BottomLeft.X).ToList();

            case TextOrientation.Other:
            default:
                // We consider the lines roughly have the same rotation.
                // NOTE(review): an arithmetic mean does not handle wrap-around (e.g. 179 and -179
                // average to 0) - confirm whether inputs can straddle the +/-180 boundary.
                var avgAngle = lineList.Average(w => w.BoundingBox.Rotation);
                if (double.IsNaN(avgAngle))
                {
                    throw new NotFiniteNumberException("OrderByReadingOrder: NaN bounding box rotation found when ordering lines.", avgAngle);
                }

                if (0 < avgAngle && avgAngle <= 90)
                {
                    // quadrant 1, 0 < angle <= 90
                    return lineList.OrderByDescending(w => w.BoundingBox.BottomLeft.Y).ThenBy(w => w.BoundingBox.BottomLeft.X).ToList();
                }
                else if (90 < avgAngle && avgAngle <= 180)
                {
                    // quadrant 2, 90 < angle <= 180
                    return lineList.OrderBy(w => w.BoundingBox.BottomLeft.X).ThenBy(w => w.BoundingBox.BottomLeft.Y).ToList();
                }
                else if (-180 < avgAngle && avgAngle <= -90)
                {
                    // quadrant 3, -180 < angle <= -90
                    return lineList.OrderBy(w => w.BoundingBox.BottomLeft.Y).ThenByDescending(w => w.BoundingBox.BottomLeft.X).ToList();
                }
                else if (-90 < avgAngle && avgAngle <= 0)
                {
                    // quadrant 4, -90 < angle <= 0
                    return lineList.OrderByDescending(w => w.BoundingBox.BottomLeft.X).ThenByDescending(w => w.BoundingBox.BottomLeft.Y).ToList();
                }
                else
                {
                    throw new ArgumentException("OrderByReadingOrder: unknown bounding box rotation found when ordering lines.", nameof(avgAngle));
                }
        }
    }
}
}

View File

@@ -15,13 +15,13 @@
/// </summary>
public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector();
private double T;
private readonly double T;
/// <summary>
/// Algorithm that retrieves the blocks' reading order using both (spatial) Allen's interval relations and rendering order.
/// </summary>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.
/// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the
/// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the
/// same column might not be exactly aligned.</param>
public UnsupervisedReadingOrderDetector(double T = 5)
{
@@ -38,10 +38,10 @@
var graph = BuildGraph(textBlocks, T);
while (graph.Any())
while (graph.Count > 0)
{
var maxCount = graph.Max(kvp => kvp.Value.Count);
var current = graph.Where(kvp => kvp.Value.Count == maxCount).FirstOrDefault();
var current = graph.FirstOrDefault(kvp => kvp.Value.Count == maxCount);
graph.Remove(current.Key);
int index = current.Key;
@@ -105,19 +105,14 @@
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
if (xRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
yRelation == IntervalRelations.Meets ||
xRelation == IntervalRelations.Overlaps ||
yRelation == IntervalRelations.Overlaps)
{
return true;
}
return false;
return xRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
yRelation == IntervalRelations.Meets ||
xRelation == IntervalRelations.Overlaps ||
yRelation == IntervalRelations.Overlaps;
}
/// <summary>
/// Column-wise: text-blocks are read in columns, from top-to-bottom and from left-to-right.
/// </summary>
@@ -130,7 +125,7 @@
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
if (xRelation == IntervalRelations.Precedes ||
return xRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
(xRelation == IntervalRelations.Overlaps && (yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
@@ -146,12 +141,7 @@
xRelation == IntervalRelations.DuringI ||
xRelation == IntervalRelations.Finishes ||
xRelation == IntervalRelations.StartsI ||
xRelation == IntervalRelations.OverlapsI)))
{
return true;
}
return false;
xRelation == IntervalRelations.OverlapsI));
}
/// <summary>
@@ -160,40 +150,34 @@
/// <param name="a"></param>
/// <param name="b"></param>
/// <param name="T">The tolerance parameter T.</param>
/// <returns></returns>
private bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
{
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
if (yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
(yRelation == IntervalRelations.Overlaps && (xRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
xRelation == IntervalRelations.Overlaps)) ||
((xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets || xRelation == IntervalRelations.Overlaps) &&
(yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
yRelation == IntervalRelations.Overlaps ||
yRelation == IntervalRelations.Starts ||
yRelation == IntervalRelations.FinishesI ||
yRelation == IntervalRelations.Equals ||
yRelation == IntervalRelations.During ||
yRelation == IntervalRelations.DuringI ||
yRelation == IntervalRelations.Finishes ||
yRelation == IntervalRelations.StartsI ||
yRelation == IntervalRelations.OverlapsI)))
{
return true;
}
return false;
return yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
(yRelation == IntervalRelations.Overlaps && (xRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
xRelation == IntervalRelations.Overlaps)) ||
((xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets || xRelation == IntervalRelations.Overlaps) &&
(yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
yRelation == IntervalRelations.Overlaps ||
yRelation == IntervalRelations.Starts ||
yRelation == IntervalRelations.FinishesI ||
yRelation == IntervalRelations.Equals ||
yRelation == IntervalRelations.During ||
yRelation == IntervalRelations.DuringI ||
yRelation == IntervalRelations.Finishes ||
yRelation == IntervalRelations.StartsI ||
yRelation == IntervalRelations.OverlapsI));
}
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// </summary>
/// <param name="a"></param>
@@ -201,85 +185,83 @@
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
private IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
{
IntervalRelations xRelation = IntervalRelations.Unknown;
if (a.BoundingBox.Right < b.BoundingBox.Left - T)
{
xRelation = IntervalRelations.Precedes;
return IntervalRelations.Precedes;
}
else if (a.BoundingBox.Right >= b.BoundingBox.Left - T)
{
xRelation = IntervalRelations.PrecedesI;
return IntervalRelations.PrecedesI;
}
else if (b.BoundingBox.Left - T <= a.BoundingBox.Right
&& a.BoundingBox.Right <= b.BoundingBox.Left + T)
{
xRelation = IntervalRelations.Meets;
return IntervalRelations.Meets;
}
else if (b.BoundingBox.Left - T > a.BoundingBox.Right
&& a.BoundingBox.Right > b.BoundingBox.Left + T)
{
xRelation = IntervalRelations.MeetsI;
return IntervalRelations.MeetsI;
}
else if (a.BoundingBox.Left < b.BoundingBox.Left - T
&& (b.BoundingBox.Left + T < a.BoundingBox.Right && a.BoundingBox.Right < b.BoundingBox.Right - T))
{
xRelation = IntervalRelations.Overlaps;
return IntervalRelations.Overlaps;
}
else if (a.BoundingBox.Left >= b.BoundingBox.Left - T
&& (b.BoundingBox.Left + T >= a.BoundingBox.Right && a.BoundingBox.Right >= b.BoundingBox.Right - T))
{
xRelation = IntervalRelations.OverlapsI;
return IntervalRelations.OverlapsI;
}
else if ((b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T)
else if (b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T
&& a.BoundingBox.Right < b.BoundingBox.Right - T)
{
xRelation = IntervalRelations.Starts;
return IntervalRelations.Starts;
}
else if ((b.BoundingBox.Left - T > a.BoundingBox.Left && a.BoundingBox.Left > b.BoundingBox.Left + T)
else if (b.BoundingBox.Left - T > a.BoundingBox.Left && a.BoundingBox.Left > b.BoundingBox.Left + T
&& a.BoundingBox.Right >= b.BoundingBox.Right - T)
{
xRelation = IntervalRelations.StartsI;
return IntervalRelations.StartsI;
}
else if (a.BoundingBox.Left > b.BoundingBox.Left + T
&& a.BoundingBox.Right < b.BoundingBox.Right - T)
{
xRelation = IntervalRelations.During;
return IntervalRelations.During;
}
else if (a.BoundingBox.Left <= b.BoundingBox.Left + T
&& a.BoundingBox.Right >= b.BoundingBox.Right - T)
{
xRelation = IntervalRelations.DuringI;
return IntervalRelations.DuringI;
}
else if (a.BoundingBox.Left > b.BoundingBox.Left + T
&& (b.BoundingBox.Right - T <= a.BoundingBox.Right && a.BoundingBox.Right <= b.BoundingBox.Right + T))
{
xRelation = IntervalRelations.Finishes;
return IntervalRelations.Finishes;
}
else if (a.BoundingBox.Left <= b.BoundingBox.Left + T
&& (b.BoundingBox.Right - T > a.BoundingBox.Right && a.BoundingBox.Right > b.BoundingBox.Right + T))
{
xRelation = IntervalRelations.FinishesI;
return IntervalRelations.FinishesI;
}
else if ((b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T)
else if (b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T
&& (b.BoundingBox.Right - T <= a.BoundingBox.Right && a.BoundingBox.Right <= b.BoundingBox.Right + T))
{
xRelation = IntervalRelations.Equals;
return IntervalRelations.Equals;
}
return xRelation;
return IntervalRelations.Unknown;
}
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// </summary>
/// <param name="a"></param>
@@ -287,79 +269,77 @@
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
private IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
{
IntervalRelations yRelation = IntervalRelations.Unknown;
if (a.BoundingBox.Bottom < b.BoundingBox.Top - T)
{
yRelation = IntervalRelations.PrecedesI;
return IntervalRelations.PrecedesI;
}
else if (a.BoundingBox.Bottom >= b.BoundingBox.Top - T)
{
yRelation = IntervalRelations.Precedes;
return IntervalRelations.Precedes;
}
else if (b.BoundingBox.Top - T <= a.BoundingBox.Bottom
&& a.BoundingBox.Bottom <= b.BoundingBox.Top + T)
{
yRelation = IntervalRelations.MeetsI;
return IntervalRelations.MeetsI;
}
else if (b.BoundingBox.Top - T > a.BoundingBox.Bottom
&& a.BoundingBox.Bottom > b.BoundingBox.Top + T)
{
yRelation = IntervalRelations.Meets;
return IntervalRelations.Meets;
}
else if (a.BoundingBox.Top < b.BoundingBox.Top - T
&& (b.BoundingBox.Top + T < a.BoundingBox.Bottom && a.BoundingBox.Bottom < b.BoundingBox.Bottom - T))
{
yRelation = IntervalRelations.OverlapsI;
return IntervalRelations.OverlapsI;
}
else if (a.BoundingBox.Top >= b.BoundingBox.Top - T
&& (b.BoundingBox.Top + T >= a.BoundingBox.Bottom && a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T))
{
yRelation = IntervalRelations.Overlaps;
return IntervalRelations.Overlaps;
}
else if ((b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T)
else if (b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T
&& a.BoundingBox.Bottom < b.BoundingBox.Bottom - T)
{
yRelation = IntervalRelations.StartsI;
return IntervalRelations.StartsI;
}
else if ((b.BoundingBox.Top - T > a.BoundingBox.Top && a.BoundingBox.Top > b.BoundingBox.Top + T)
else if (b.BoundingBox.Top - T > a.BoundingBox.Top && a.BoundingBox.Top > b.BoundingBox.Top + T
&& a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T)
{
yRelation = IntervalRelations.Starts;
return IntervalRelations.Starts;
}
else if (a.BoundingBox.Top > b.BoundingBox.Top + T
&& a.BoundingBox.Bottom < b.BoundingBox.Bottom - T)
{
yRelation = IntervalRelations.DuringI;
return IntervalRelations.DuringI;
}
else if (a.BoundingBox.Top <= b.BoundingBox.Top + T
&& a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T)
{
yRelation = IntervalRelations.During;
return IntervalRelations.During;
}
else if (a.BoundingBox.Top > b.BoundingBox.Top + T
&& (b.BoundingBox.Bottom - T <= a.BoundingBox.Bottom && a.BoundingBox.Bottom <= b.BoundingBox.Bottom + T))
{
yRelation = IntervalRelations.FinishesI;
return IntervalRelations.FinishesI;
}
else if (a.BoundingBox.Top <= b.BoundingBox.Top + T
&& (b.BoundingBox.Bottom - T > a.BoundingBox.Bottom && a.BoundingBox.Bottom > b.BoundingBox.Bottom + T))
{
yRelation = IntervalRelations.Finishes;
return IntervalRelations.Finishes;
}
else if ((b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T)
&& (b.BoundingBox.Bottom - T <= a.BoundingBox.Bottom && a.BoundingBox.Bottom <= b.BoundingBox.Bottom + T))
{
yRelation = IntervalRelations.Equals;
return IntervalRelations.Equals;
}
return yRelation;
return IntervalRelations.Unknown;
}
/// <summary>

View File

@@ -11,15 +11,20 @@
/// </summary>
public class TextBlock
{
/// <summary>
/// The separator used between lines in the block.
/// </summary>
public readonly string Separator;
/// <summary>
/// The text of the block.
/// </summary>
public string Text { get; }
/// <summary>
/// The text direction of the block.
/// The text orientation of the block.
/// </summary>
public TextDirection TextDirection { get; }
public TextOrientation TextOrientation { get; }
/// <summary>
/// The rectangle completely containing the block.
@@ -39,8 +44,9 @@
/// <summary>
/// Create a new <see cref="TextBlock"/>.
/// </summary>
/// <param name="lines"></param>
public TextBlock(IReadOnlyList<TextLine> lines)
/// <param name="lines">The lines contained in the block, in the correct order.</param>
/// <param name="separator">The separator used between lines in the block.</param>
public TextBlock(IReadOnlyList<TextLine> lines, string separator = "\n")
{
if (lines == null)
{
@@ -52,21 +58,252 @@
throw new ArgumentException("Empty lines provided.", nameof(lines));
}
Separator = separator;
ReadingOrder = -1;
TextLines = lines;
Text = string.Join(" ", lines.Select(x => x.Text));
if (lines.Count == 1)
{
BoundingBox = lines[0].BoundingBox;
Text = lines[0].Text;
TextOrientation = lines[0].TextOrientation;
}
else
{
var tempTextOrientation = lines[0].TextOrientation;
if (tempTextOrientation != TextOrientation.Other)
{
foreach (var letter in lines)
{
if (letter.TextOrientation != tempTextOrientation)
{
tempTextOrientation = TextOrientation.Other;
break;
}
}
}
var minX = lines.Min(x => x.BoundingBox.Left);
var minY = lines.Min(x => x.BoundingBox.Bottom);
var maxX = lines.Max(x => x.BoundingBox.Right);
var maxY = lines.Max(x => x.BoundingBox.Top);
BoundingBox = new PdfRectangle(minX, minY, maxX, maxY);
switch (tempTextOrientation)
{
case TextOrientation.Horizontal:
BoundingBox = GetBoundingBoxH(lines);
break;
TextDirection = lines[0].TextDirection;
case TextOrientation.Rotate180:
BoundingBox = GetBoundingBox180(lines);
break;
case TextOrientation.Rotate90:
BoundingBox = GetBoundingBox90(lines);
break;
case TextOrientation.Rotate270:
BoundingBox = GetBoundingBox270(lines);
break;
case TextOrientation.Other:
default:
BoundingBox = GetBoundingBoxOther(lines);
break;
}
Text = string.Join(separator, lines.Select(x => x.Text));
TextOrientation = tempTextOrientation;
}
}
#region Bounding box
/// <summary>
/// Builds the bounding box of the block from its lines, using the smallest bottom-left X and Y,
/// the largest (bottom-left X + width) and the largest top-left Y found across the lines.
/// </summary>
/// <param name="lines">The lines of the block.</param>
private PdfRectangle GetBoundingBoxH(IReadOnlyList<TextLine> lines)
{
    double left = double.MaxValue;
    double bottom = double.MaxValue;
    double right = double.MinValue;
    double top = double.MinValue;

    foreach (var line in lines)
    {
        var box = line.BoundingBox;
        var origin = box.BottomLeft;

        left = origin.X < left ? origin.X : left;
        bottom = origin.Y < bottom ? origin.Y : bottom;

        // Right edge is derived from the bottom-left corner plus the width.
        var lineRight = origin.X + box.Width;
        right = lineRight > right ? lineRight : right;

        var lineTop = box.TopLeft.Y;
        top = lineTop > top ? lineTop : top;
    }

    return new PdfRectangle(left, bottom, right, top);
}
/// <summary>
/// Builds the bounding box of the block from its lines for the 180 degrees rotated case.
/// <para>The extremes are mirrored compared to the horizontal case: the largest bottom-left X and Y
/// are kept for the first corner, and the smallest (bottom-left X - width) and top-right Y for the second.</para>
/// </summary>
/// <param name="lines">The lines of the block.</param>
private PdfRectangle GetBoundingBox180(IReadOnlyList<TextLine> lines)
{
    double cornerX = double.MinValue;
    double cornerY = double.MinValue;
    double oppositeX = double.MaxValue;
    double oppositeY = double.MaxValue;

    foreach (var line in lines)
    {
        var box = line.BoundingBox;
        var origin = box.BottomLeft;

        cornerX = origin.X > cornerX ? origin.X : cornerX;
        cornerY = origin.Y > cornerY ? origin.Y : cornerY;

        // The width is subtracted because the text runs towards decreasing X.
        var farX = origin.X - box.Width;
        oppositeX = farX < oppositeX ? farX : oppositeX;

        var farY = box.TopRight.Y;
        oppositeY = farY < oppositeY ? farY : oppositeY;
    }

    return new PdfRectangle(cornerX, cornerY, oppositeX, oppositeY);
}
/// <summary>
/// Builds the bounding box of the block from its lines for the 90 degrees rotated case.
/// <para>Tracks the smallest bottom-left X, the smallest bottom-right Y, the largest
/// (bottom-left X + height) and the largest bottom-left Y, then assembles the four corners explicitly.</para>
/// </summary>
/// <param name="lines">The lines of the block.</param>
private PdfRectangle GetBoundingBox90(IReadOnlyList<TextLine> lines)
{
    double minX = double.MaxValue;
    double minY = double.MaxValue;
    double maxX = double.MinValue;
    double maxY = double.MinValue;

    foreach (var line in lines)
    {
        var box = line.BoundingBox;
        var origin = box.BottomLeft;

        minX = origin.X < minX ? origin.X : minX;

        var endY = box.BottomRight.Y;
        minY = endY < minY ? endY : minY;

        // The height extends along the X axis for this orientation.
        var farX = origin.X + box.Height;
        maxX = farX > maxX ? farX : maxX;

        maxY = origin.Y > maxY ? origin.Y : maxY;
    }

    var topLeft = new PdfPoint(maxX, maxY);
    var topRight = new PdfPoint(maxX, minY);
    var bottomLeft = new PdfPoint(minX, maxY);
    var bottomRight = new PdfPoint(minX, minY);

    return new PdfRectangle(topLeft, topRight, bottomLeft, bottomRight);
}
/// <summary>
/// Builds the bounding box of the block from its lines for the 270 degrees rotated case.
/// <para>Tracks the largest bottom-left X, the smallest bottom-left Y, the smallest
/// (bottom-left X - height) and the largest bottom-right Y, then assembles the four corners explicitly.</para>
/// </summary>
/// <param name="lines">The lines of the block.</param>
private PdfRectangle GetBoundingBox270(IReadOnlyList<TextLine> lines)
{
    double minX = double.MaxValue;
    double maxX = double.MinValue;
    double minY = double.MaxValue;
    double maxY = double.MinValue;

    foreach (var line in lines)
    {
        var box = line.BoundingBox;
        var origin = box.BottomLeft;

        maxX = origin.X > maxX ? origin.X : maxX;
        minY = origin.Y < minY ? origin.Y : minY;

        // The height extends towards decreasing X for this orientation.
        var farX = origin.X - box.Height;
        minX = farX < minX ? farX : minX;

        var endY = box.BottomRight.Y;
        maxY = endY > maxY ? endY : maxY;
    }

    var topLeft = new PdfPoint(minX, minY);
    var topRight = new PdfPoint(minX, maxY);
    var bottomLeft = new PdfPoint(maxX, minY);
    var bottomRight = new PdfPoint(maxX, maxY);

    return new PdfRectangle(topLeft, topRight, bottomLeft, bottomRight);
}
/// <summary>
/// Builds the bounding box of the block when its lines do not share one of the four axis-aligned orientations.
/// <para>Computes the minimum area rectangle of all the lines' corners, then picks, among that rectangle and
/// three re-labellings of its corners, the one whose rotation is closest to the baseline angle of the last line.</para>
/// </summary>
/// <param name="lines">The lines of the block. Assumes the line order is correct.</param>
private PdfRectangle GetBoundingBoxOther(IReadOnlyList<TextLine> lines)
{
    // Collect the four corners of every line's bounding box.
    var corners = lines.SelectMany(line => new[]
    {
        line.BoundingBox.BottomLeft,
        line.BoundingBox.BottomRight,
        line.BoundingBox.TopLeft,
        line.BoundingBox.TopRight
    });

    // Candidate bounding boxes: same four points, corners assigned differently.
    var minimumArea = Geometry.GeometryExtensions.MinimumAreaRectangle(corners);
    var candidates = new PdfRectangle[]
    {
        minimumArea,
        new PdfRectangle(minimumArea.BottomLeft, minimumArea.TopLeft, minimumArea.BottomRight, minimumArea.TopRight),
        new PdfRectangle(minimumArea.BottomRight, minimumArea.BottomLeft, minimumArea.TopRight, minimumArea.TopLeft),
        new PdfRectangle(minimumArea.TopRight, minimumArea.BottomRight, minimumArea.TopLeft, minimumArea.BottomLeft)
    };

    // The reference orientation is the baseline of the last line.
    var lastBox = lines[lines.Count - 1].BoundingBox;
    var baseLineAngle = Distances.BoundAngle180(Distances.Angle(lastBox.BottomLeft, lastBox.BottomRight));

    var best = candidates[0];
    double bestDelta = Math.Abs(Distances.BoundAngle180(best.Rotation - baseLineAngle));

    // Strict comparison: on ties the earliest candidate is kept.
    for (var i = 1; i < candidates.Length; i++)
    {
        double delta = Math.Abs(Distances.BoundAngle180(candidates[i].Rotation - baseLineAngle));
        if (delta < bestDelta)
        {
            bestDelta = delta;
            best = candidates[i];
        }
    }

    return best;
}
#endregion
/// <summary>
/// Sets the <see cref="TextBlock"/>'s reading order.
/// </summary>

View File

@@ -29,15 +29,15 @@
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="minimumElements">The minimum number of elements to define a text edge.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4,
int maxDegreeOfParallelism = -1)
{
if (minimumElements < 0)
{
throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", "minimumElements");
throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", nameof(minimumElements));
}
var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text.Trim()));
@@ -46,10 +46,7 @@
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
Parallel.ForEach(edgesFuncs, parallelOptions, f =>
{
dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
});
Parallel.ForEach(edgesFuncs, parallelOptions, f => dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements)));
return dictionary.ToDictionary(x => x.Key, x => x.Value);
}

View File

@@ -68,7 +68,7 @@
else if (previous.Value != " ")
{
var gap = letter.StartBaseLine.X - previous.EndBaseLine.X;
if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous))
{
sb.Append(" ");

View File

@@ -11,15 +11,20 @@
/// </summary>
public class TextLine
{
/// <summary>
/// The separator used between words in the line.
/// </summary>
public readonly string Separator;
/// <summary>
/// The text of the line.
/// </summary>
public string Text { get; }
/// <summary>
/// The text direction of the line.
/// The text orientation of the line.
/// </summary>
public TextDirection TextDirection { get; }
public TextOrientation TextOrientation { get; }
/// <summary>
/// The rectangle completely containing the line.
@@ -34,8 +39,9 @@
/// <summary>
/// Create a new <see cref="TextLine"/>.
/// </summary>
/// <param name="words">The words contained in the line.</param>
public TextLine(IReadOnlyList<Word> words)
/// <param name="words">The words contained in the line, in the correct order.</param>
/// <param name="separator">The separator used between words in the line.</param>
public TextLine(IReadOnlyList<Word> words, string separator = " ")
{
if (words == null)
{
@@ -47,36 +53,299 @@
throw new ArgumentException("Empty words provided.", nameof(words));
}
Separator = separator;
Words = words;
Text = string.Join(" ", words.Where(s => !string.IsNullOrWhiteSpace(s.Text)).Select(x => x.Text));
var normalisedBoundingBoxes = words.Select(x => NormaliseRectangle(x.BoundingBox)).ToList();
var minX = normalisedBoundingBoxes.Min(x => x.Left);
var minY = normalisedBoundingBoxes.Min(x => x.Bottom);
var maxX = normalisedBoundingBoxes.Max(x => x.Right);
var maxY = normalisedBoundingBoxes.Max(x => x.Top);
BoundingBox = new PdfRectangle(minX, minY, maxX, maxY);
if (words.All(x => x.TextDirection == words[0].TextDirection))
if (Words.Count == 1)
{
TextDirection = words[0].TextDirection;
BoundingBox = Words[0].BoundingBox;
Text = Words[0].Text;
TextOrientation = words[0].TextOrientation;
}
else
{
TextDirection = TextDirection.Other;
var tempTextOrientation = words[0].TextOrientation;
if (tempTextOrientation != TextOrientation.Other)
{
foreach (var letter in words)
{
if (letter.TextOrientation != tempTextOrientation)
{
tempTextOrientation = TextOrientation.Other;
break;
}
}
}
switch (tempTextOrientation)
{
case TextOrientation.Horizontal:
BoundingBox = GetBoundingBoxH(words);
break;
case TextOrientation.Rotate180:
BoundingBox = GetBoundingBox180(words);
break;
case TextOrientation.Rotate90:
BoundingBox = GetBoundingBox90(words);
break;
case TextOrientation.Rotate270:
BoundingBox = GetBoundingBox270(words);
break;
case TextOrientation.Other:
default:
BoundingBox = GetBoundingBoxOther(words);
break;
}
Text = string.Join(Separator, words.Where(s => !string.IsNullOrWhiteSpace(s.Text)).Select(x => x.Text));
TextOrientation = tempTextOrientation;
}
}
private PdfRectangle NormaliseRectangle(PdfRectangle rectangle)
#region Bounding box
/// <summary>
/// Computes the bounding box enclosing every word in <paramref name="words"/>, using each word's
/// bottom-left corner, width and top-left Y coordinate.
/// </summary>
/// <param name="words">The words to enclose.</param>
/// <returns>The rectangle built from the smallest bottom-left X/Y and the largest right X / top Y found.</returns>
private PdfRectangle GetBoundingBoxH(IReadOnlyList<Word> words)
{
// NOTE(review): the four-line return statement below reads a 'rectangle' identifier that is not a
// parameter or local of this method; it looks like the body belonging to the NormaliseRectangle
// declaration that sits just above the region marker. Confirm and reconcile before relying on this block.
return new PdfRectangle(Math.Min(Math.Min(Math.Min(rectangle.TopLeft.X, rectangle.TopRight.X), rectangle.BottomLeft.X), rectangle.BottomRight.X),
Math.Min(Math.Min(Math.Min(rectangle.TopLeft.Y, rectangle.TopRight.Y), rectangle.BottomLeft.Y), rectangle.BottomRight.Y),
Math.Max(Math.Max(Math.Max(rectangle.TopLeft.X, rectangle.TopRight.X), rectangle.BottomLeft.X), rectangle.BottomRight.X),
Math.Max(Math.Max(Math.Max(rectangle.TopLeft.Y, rectangle.TopRight.Y), rectangle.BottomLeft.Y), rectangle.BottomRight.Y));
// Start from sentinel extremes so that the first word always replaces them.
var blX = double.MaxValue;
var trX = double.MinValue;
var blY = double.MaxValue;
var trY = double.MinValue;
for (var i = 0; i < words.Count; i++)
{
var word = words[i];
// Smallest bottom-left X seen so far.
if (word.BoundingBox.BottomLeft.X < blX)
{
blX = word.BoundingBox.BottomLeft.X;
}
// Smallest bottom-left Y seen so far.
if (word.BoundingBox.BottomLeft.Y < blY)
{
blY = word.BoundingBox.BottomLeft.Y;
}
// The right edge is derived from the bottom-left X plus the box width.
var right = word.BoundingBox.BottomLeft.X + word.BoundingBox.Width;
if (right > trX)
{
trX = right;
}
// Largest top-left Y seen so far.
if (word.BoundingBox.TopLeft.Y > trY)
{
trY = word.BoundingBox.TopLeft.Y;
}
}
return new PdfRectangle(blX, blY, trX, trY);
}
/// <summary>
/// Computes the bounding box enclosing words whose text orientation is <see cref="TextOrientation.Rotate180"/>.
/// <para>The first corner is the largest bottom-left X/Y found across the words; the opposite corner is the
/// smallest value of (bottom-left X minus width) and the smallest top-right Y.</para>
/// </summary>
/// <param name="words">The words to enclose. If empty, the result is built from the sentinel extremes.</param>
/// <returns>The rectangle created from the two extreme corners.</returns>
private static PdfRectangle GetBoundingBox180(IReadOnlyList<Word> words)
{
    // Sentinel extremes: any real coordinate replaces them on the first iteration.
    var maxX = double.MinValue;
    var maxY = double.MinValue;
    var minX = double.MaxValue;
    var minY = double.MaxValue;

    foreach (var word in words)
    {
        var box = word.BoundingBox;
        var origin = box.BottomLeft;

        if (origin.X > maxX)
        {
            maxX = origin.X;
        }

        if (origin.Y > maxY)
        {
            maxY = origin.Y;
        }

        // The far edge is found by walking the width backwards from the bottom-left X.
        var farX = origin.X - box.Width;
        if (farX < minX)
        {
            minX = farX;
        }

        var topRightY = box.TopRight.Y;
        if (topRightY < minY)
        {
            minY = topRightY;
        }
    }

    return new PdfRectangle(maxX, maxY, minX, minY);
}
/// <summary>
/// Computes the bounding box enclosing words whose text orientation is <see cref="TextOrientation.Rotate90"/>.
/// <para>Tracks the smallest bottom-left X, the smallest bottom-right Y, the largest value of
/// (bottom-left X plus height) and the largest bottom-left Y, then builds the rectangle from the four
/// corner points made of those extremes.</para>
/// </summary>
/// <param name="words">The words to enclose. If empty, the result is built from the sentinel extremes.</param>
/// <returns>The rectangle created from the four extreme corner points.</returns>
private static PdfRectangle GetBoundingBox90(IReadOnlyList<Word> words)
{
    // Sentinel extremes: any real coordinate replaces them on the first iteration.
    var minX = double.MaxValue;
    var minY = double.MaxValue;
    var maxX = double.MinValue;
    var maxY = double.MinValue;

    foreach (var word in words)
    {
        var box = word.BoundingBox;
        var origin = box.BottomLeft;

        if (origin.X < minX)
        {
            minX = origin.X;
        }

        var bottomRightY = box.BottomRight.Y;
        if (bottomRightY < minY)
        {
            minY = bottomRightY;
        }

        // The far edge along X is the bottom-left X plus the box height.
        var farX = origin.X + box.Height;
        if (farX > maxX)
        {
            maxX = farX;
        }

        if (origin.Y > maxY)
        {
            maxY = origin.Y;
        }
    }

    return new PdfRectangle(new PdfPoint(maxX, maxY), new PdfPoint(maxX, minY),
                            new PdfPoint(minX, maxY), new PdfPoint(minX, minY));
}
/// <summary>
/// Computes the bounding box enclosing words whose text orientation is <see cref="TextOrientation.Rotate270"/>.
/// <para>Tracks the largest bottom-left X, the smallest bottom-left Y, the smallest value of
/// (bottom-left X minus height) and the largest bottom-right Y, then builds the rectangle from the four
/// corner points made of those extremes.</para>
/// </summary>
/// <param name="words">The words to enclose. If empty, the result is built from the sentinel extremes.</param>
/// <returns>The rectangle created from the four extreme corner points.</returns>
private static PdfRectangle GetBoundingBox270(IReadOnlyList<Word> words)
{
    // Sentinel extremes: any real coordinate replaces them on the first iteration.
    var minX = double.MaxValue;
    var maxX = double.MinValue;
    var minY = double.MaxValue;
    var maxY = double.MinValue;

    foreach (var word in words)
    {
        var box = word.BoundingBox;
        var origin = box.BottomLeft;

        if (origin.X > maxX)
        {
            maxX = origin.X;
        }

        if (origin.Y < minY)
        {
            minY = origin.Y;
        }

        // The far edge along X is the bottom-left X minus the box height.
        var farX = origin.X - box.Height;
        if (farX < minX)
        {
            minX = farX;
        }

        var bottomRightY = box.BottomRight.Y;
        if (bottomRightY > maxY)
        {
            maxY = bottomRightY;
        }
    }

    return new PdfRectangle(new PdfPoint(minX, minY), new PdfPoint(minX, maxY),
                            new PdfPoint(maxX, minY), new PdfPoint(maxX, maxY));
}
/// <summary>
/// Builds an oriented bounding box around words that are not axis aligned.
/// <para>A straight line is fitted through the words' base line points (bottom-left and bottom-right corners)
/// to estimate the text angle. All corners are rotated by the opposite angle so that an axis-aligned box can be
/// taken, and that box is rotated back. Of the four possible corner orderings of the resulting box, the one
/// whose rotation is closest to the angle between the first word's bottom-left and the last word's
/// bottom-right is returned.</para>
/// </summary>
/// <param name="words">The words to enclose. Their order is assumed to be correct.</param>
/// <returns>The oriented bounding box of the words.</returns>
private static PdfRectangle GetBoundingBoxOther(IReadOnlyList<Word> words)
{
    var baseLinePoints = words.SelectMany(r => new[]
    {
        r.BoundingBox.BottomLeft,
        r.BoundingBox.BottomRight,
    }).ToList();

    // Fitting a line through the base lines points
    // to find the orientation (slope)
    double x0 = baseLinePoints.Average(p => p.X);
    double y0 = baseLinePoints.Average(p => p.Y);
    double sumProduct = 0;
    double sumDiffSquaredX = 0;

    for (int i = 0; i < baseLinePoints.Count; i++)
    {
        var point = baseLinePoints[i];
        var x_diff = point.X - x0;
        var y_diff = point.Y - y0;
        sumProduct += x_diff * y_diff;
        sumDiffSquaredX += x_diff * x_diff;
    }

    // Defaults describe a vertical line; they are kept when the X spread is too small to divide by.
    double cos = 0;
    double sin = 1;
    if (sumDiffSquaredX > 1e-3)
    {
        // not a vertical line
        double angleRad = Math.Atan(sumProduct / sumDiffSquaredX); // -π/2 ≤ θ ≤ π/2
        cos = Math.Cos(angleRad);
        sin = Math.Sin(angleRad);
    }

    // Rotate the points to build the axis-aligned bounding box (AABB)
    var inverseRotation = new TransformationMatrix(
        cos, -sin, 0,
        sin, cos, 0,
        0, 0, 1);

    // Materialised once: the four Min/Max calls below would otherwise each re-run the
    // SelectMany / Distinct / Transform pipeline.
    var transformedPoints = words.SelectMany(r => new[]
    {
        r.BoundingBox.BottomLeft,
        r.BoundingBox.BottomRight,
        r.BoundingBox.TopLeft,
        r.BoundingBox.TopRight
    }).Distinct().Select(p => inverseRotation.Transform(p)).ToList();

    var aabb = new PdfRectangle(transformedPoints.Min(p => p.X),
                                transformedPoints.Min(p => p.Y),
                                transformedPoints.Max(p => p.X),
                                transformedPoints.Max(p => p.Y));

    // Rotate back the AABB to obtain to oriented bounding box (OBB)
    var rotateBack = new TransformationMatrix(
        cos, sin, 0,
        -sin, cos, 0,
        0, 0, 1);

    // Candidates bounding boxes
    var obb = rotateBack.Transform(aabb);
    var obb1 = new PdfRectangle(obb.BottomLeft, obb.TopLeft, obb.BottomRight, obb.TopRight);
    var obb2 = new PdfRectangle(obb.BottomRight, obb.BottomLeft, obb.TopRight, obb.TopLeft);
    var obb3 = new PdfRectangle(obb.TopRight, obb.BottomRight, obb.TopLeft, obb.BottomLeft);

    // Find the orientation of the OBB, using the baseline angle
    // Assumes word order is correct
    var firstWord = words[0];
    var lastWord = words[words.Count - 1];
    var baseLineAngle = Distances.Angle(firstWord.BoundingBox.BottomLeft, lastWord.BoundingBox.BottomRight);

    double deltaAngle = Math.Abs(Distances.BoundAngle180(obb.Rotation - baseLineAngle));
    double deltaAngle1 = Math.Abs(Distances.BoundAngle180(obb1.Rotation - baseLineAngle));
    if (deltaAngle1 < deltaAngle)
    {
        deltaAngle = deltaAngle1;
        obb = obb1;
    }

    double deltaAngle2 = Math.Abs(Distances.BoundAngle180(obb2.Rotation - baseLineAngle));
    if (deltaAngle2 < deltaAngle)
    {
        deltaAngle = deltaAngle2;
        obb = obb2;
    }

    double deltaAngle3 = Math.Abs(Distances.BoundAngle180(obb3.Rotation - baseLineAngle));
    if (deltaAngle3 < deltaAngle)
    {
        obb = obb3;
    }

    return obb;
}
#endregion
/// <inheritdoc />
public override string ToString()
{

View File

@@ -39,7 +39,7 @@
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
@@ -49,7 +49,7 @@
var bboxes = words.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0)
.Select(o => o.BoundingBox).ToList();
if (images != null && images.Count() > 0)
if (images?.Any() == true)
{
bboxes.AddRange(images.Where(w => w.Bounds.Width > 0 && w.Bounds.Height > 0).Select(o => o.Bounds));
}
@@ -69,14 +69,14 @@
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<PdfRectangle> boundingboxes,
double minWidth, double minHeight, int maxRectangleCount = 40, double whitespaceFuzziness = 0.15, int maxBoundQueueSize = 0)
{
if (boundingboxes.Count() == 0) return EmptyArray<PdfRectangle>.Instance;
if (!boundingboxes.Any()) return EmptyArray<PdfRectangle>.Instance;
var obstacles = new HashSet<PdfRectangle>(boundingboxes);
var pageBound = GetBound(obstacles);
@@ -195,51 +195,32 @@
return false;
}
if (rectangle1.Left == rectangle2.Right ||
rectangle1.Right == rectangle2.Left ||
rectangle1.Bottom == rectangle2.Top ||
rectangle1.Top == rectangle2.Bottom)
{
return true;
}
return false;
return rectangle1.Left == rectangle2.Right ||
rectangle1.Right == rectangle2.Left ||
rectangle1.Bottom == rectangle2.Top ||
rectangle1.Top == rectangle2.Bottom;
}
/// <summary>
/// Checks whether <paramref name="rectangle"/> has at least one side whose coordinate is exactly equal
/// to the matching side of <paramref name="pageBound"/>.
/// </summary>
/// <param name="pageBound">The page bound to compare against.</param>
/// <param name="rectangle">The rectangle to test.</param>
/// <returns>True if the bottom, top, left or right coordinates are equal; false otherwise.</returns>
private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle)
{
    return rectangle.Bottom == pageBound.Bottom ||
           rectangle.Top == pageBound.Top ||
           rectangle.Left == pageBound.Left ||
           rectangle.Right == pageBound.Right;
}
/// <summary>
/// Checks whether two rectangles overlap. Rectangles that only touch along an edge are not overlapping.
/// </summary>
/// <param name="rectangle1">The first rectangle.</param>
/// <param name="rectangle2">The second rectangle.</param>
/// <returns>False if one rectangle lies entirely to the side of, above or below the other (touching included); true otherwise.</returns>
private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    // Expressed as the negation of the "separated" test so the outcome stays the same for every input.
    return !(rectangle1.Left >= rectangle2.Right ||
             rectangle2.Left >= rectangle1.Right ||
             rectangle1.Top <= rectangle2.Bottom ||
             rectangle2.Top <= rectangle1.Bottom);
}
/// <summary>
/// Checks whether <paramref name="rectangle2"/> lies within <paramref name="rectangle1"/>; shared edges count as inside.
/// </summary>
/// <param name="rectangle1">The containing rectangle.</param>
/// <param name="rectangle2">The rectangle tested for containment.</param>
/// <returns>True if all four sides of <paramref name="rectangle2"/> are within those of <paramref name="rectangle1"/>.</returns>
private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
    return rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left &&
           rectangle2.Top <= rectangle1.Top && rectangle2.Bottom >= rectangle1.Bottom;
}
private static PdfRectangle GetBound(IEnumerable<PdfRectangle> obstacles)
@@ -254,7 +235,7 @@
#region Sorted Queue
private class QueueEntries : SortedSet<QueueEntry>
{
readonly int bound;
private readonly int bound;
public QueueEntries(int maximumBound)
{
@@ -306,7 +287,7 @@
/// <summary>
/// Indicates whether this entry holds no obstacles at all.
/// </summary>
/// <returns>True if the obstacle collection is empty.</returns>
public bool IsEmptyEnough()
{
    return Obstacles.Count == 0;
}
public bool IsEmptyEnough(IEnumerable<PdfRectangle> pageObstacles)
@@ -349,12 +330,11 @@
{
if (obj is QueueEntry entry)
{
if (Bound.Left != entry.Bound.Left ||
Bound.Right != entry.Bound.Right ||
Bound.Top != entry.Bound.Top ||
Bound.Bottom != entry.Bound.Bottom ||
Obstacles != entry.Obstacles) return false;
return true;
return Bound.Left == entry.Bound.Left &&
Bound.Right == entry.Bound.Right &&
Bound.Top == entry.Bound.Top &&
Bound.Bottom == entry.Bound.Bottom &&
Obstacles == entry.Obstacles;
}
return false;
}
@@ -383,16 +363,6 @@
// solution.
return rectangle.Area * (rectangle.Height / 4.0);
}
private static double OverlappingArea(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
var intersect = rectangle1.Intersect(rectangle2);
if (intersect.HasValue)
{
return intersect.Value.Area;
}
return 0;
}
}
#endregion
}

View File

@@ -19,109 +19,171 @@
public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
/// <summary>
/// Gets or sets the maximum number of concurrent tasks enabled. Default value is -1.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para>
/// Get the words using default options values.
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
/// <summary>
/// Gets the words, using the <see cref="Distances.Manhattan"/> distance.
/// </summary>
/// <param name="letters">The letters in the page.</param>
/// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param>
/// <returns>The <see cref="Word"/>s generated by the nearest neighbour method.</returns>
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
{
double maxDistFunc(Letter l1, Letter l2)
return GetWords(letters, new NearestNeighbourWordExtractorOptions());
}
/// <summary>
/// Get the words using options values.
/// </summary>
/// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param>
/// <param name="options">The <see cref="NearestNeighbourWordExtractorOptions"/> to use.</param>
/// <returns>The <see cref="Word"/>s generated by the nearest neighbour method.</returns>
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters, DlaOptions options)
{
if (options is NearestNeighbourWordExtractorOptions nnOptions)
{
return Math.Max(Math.Max(Math.Max(Math.Max(Math.Max(
Math.Abs(l1.GlyphRectangle.Width),
Math.Abs(l2.GlyphRectangle.Width)),
Math.Abs(l1.Width)),
Math.Abs(l2.Width)),
l1.PointSize), l2.PointSize) * 0.2;
}
if (letters == null || letters.Count == 0)
{
return EmptyArray<Word>.Instance;
}
bool filterFunc(Letter l1, Letter l2)
if (nnOptions.GroupByOrientation)
{
// axis aligned
List<Word> words = GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism);
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
// not axis aligned
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
return words;
}
else
{
return GetWords(letters,
nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism);
}
}
else
{
return !string.IsNullOrWhiteSpace(l2.Value);
throw new ArgumentException("Options provided must be of type " + nameof(NearestNeighbourWordExtractorOptions) + ".", nameof(options));
}
List<Word> wordsH = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Horizontal).ToList(),
(l1, l2) => maxDistFunc(l1, l2),
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Bottom)
.ThenBy(x => x.BoundingBox.Left).ToList();
var words270 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate270).ToList(),
(l1, l2) => maxDistFunc(l1, l2),
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderBy(x => x.BoundingBox.Right)
.ThenByDescending(x => x.BoundingBox.Bottom);
wordsH.AddRange(words270);
var words180 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate180).ToList(),
(l1, l2) => maxDistFunc(l1, l2),
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderBy(x => x.BoundingBox.Top)
.ThenByDescending(x => x.BoundingBox.Right);
wordsH.AddRange(words180);
var words90 = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Rotate90).ToList(),
(l1, l2) => maxDistFunc(l1, l2),
Distances.Manhattan, filterFunc, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Left)
.ThenBy(x => x.BoundingBox.Top);
wordsH.AddRange(words90);
var wordsU = GetWords(
letters.Where(l => l.TextDirection == TextDirection.Other).ToList(),
(l1, l2) => maxDistFunc(l1, l2) * 2.0, // allow twice the distance for oriented text
Distances.Euclidean, filterFunc, MaxDegreeOfParallelism)
.OrderByDescending(x => x.BoundingBox.Bottom)
.ThenBy(x => x.BoundingBox.Left);
wordsH.AddRange(wordsU);
return wordsH;
}
/// <summary>
/// Gets the words.
/// </summary>
/// <param name="pageLetters">The letters in the page.</param>
/// <param name="letters">The letters in the page.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two letters (start and end base line points),
/// e.g. Max(GlyphRectangle.Width) x 20%.
/// e.g. Max(GlyphRectangle.Width) x 20%.
/// <para>If the distance between the two letters is greater, a new word will be created.</para></param>
/// <param name="distMeasure">The distance measure between two letters (start and end base line points),
/// e.g. the Manhattan distance.</param>
/// <param name="filterPivotFunction"></param>
/// <param name="filterFunction">Function used to filter out connection between letters, e.g. check if the letters have the same color.
/// <para>If the function returns false, a new word will be created.</para></param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public List<Word> GetWords(IReadOnlyList<Letter> pageLetters,
private List<Word> GetWords(IReadOnlyList<Letter> letters,
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
Func<Letter, bool> filterPivotFunction,
Func<Letter, Letter, bool> filterFunction, int maxDegreeOfParallelism)
{
if (pageLetters == null || pageLetters.Count == 0) return new List<Word>();
if (letters == null || letters.Count == 0) return new List<Word>();
var groupedIndexes = Clustering.NearestNeighbours(pageLetters,
var groupedLetters = Clustering.NearestNeighbours(letters,
distMeasure, maxDistanceFunction,
l => l.EndBaseLine, l => l.StartBaseLine,
l => !string.IsNullOrWhiteSpace(l.Value),
filterPivotFunction,
filterFunction,
maxDegreeOfParallelism).ToList();
List<Word> words = new List<Word>();
for (int a = 0; a < groupedIndexes.Count; a++)
foreach (var g in groupedLetters)
{
words.Add(new Word(groupedIndexes[a].Select(i => pageLetters[i]).ToList()));
words.Add(new Word(g));
}
return words;
}
/// <summary>
/// Options for the nearest neighbour word extractor.
/// </summary>
public class NearestNeighbourWordExtractorOptions : DlaOptions
{
    /// <summary>
    /// Function of two letters giving the largest distance (between start and end base line points) at which
    /// they can still belong to the same word. Letters further apart end up in different words.
    /// <para>Default value is 20% of the largest of both letters' glyph rectangle width, width and point size.
    /// The value is doubled when either letter's <see cref="TextOrientation"/> is Other.</para>
    /// </summary>
    public Func<Letter, Letter, double> MaximumDistance { get; set; } = (l1, l2) =>
    {
        double largest = Math.Max(Math.Abs(l1.GlyphRectangle.Width), Math.Abs(l2.GlyphRectangle.Width));
        largest = Math.Max(largest, Math.Abs(l1.Width));
        largest = Math.Max(largest, Math.Abs(l2.Width));
        largest = Math.Max(largest, l1.PointSize);
        largest = Math.Max(largest, l2.PointSize);

        double threshold = largest * 0.2;

        bool involvesOther = l1.TextOrientation == TextOrientation.Other
                             || l2.TextOrientation == TextOrientation.Other;
        return involvesOther ? 2.0 * threshold : threshold;
    };

    /// <summary>
    /// Distance measure applied between two letters (start and end base line points) by default.
    /// <para>Default value is the Euclidean distance.</para>
    /// </summary>
    public Func<PdfPoint, PdfPoint, double> DistanceMeasure { get; set; } = Distances.Euclidean;

    /// <summary>
    /// Distance measure applied between two letters (start and end base line points) whose
    /// <see cref="TextOrientation"/> is axis aligned. Only used when GroupByOrientation is true.
    /// <para>Default value is the Manhattan distance.</para>
    /// </summary>
    public Func<PdfPoint, PdfPoint, double> DistanceMeasureAA { get; set; } = Distances.Manhattan;

    /// <summary>
    /// Function deciding whether two letters may be connected, e.g. by checking that they share the same color.
    /// When it returns false the letters will belong to different words.
    /// <para>Default value returns false when the neighbour is a white space.</para>
    /// </summary>
    public Func<Letter, Letter, bool> Filter { get; set; } = (_, l2) => !string.IsNullOrWhiteSpace(l2.Value);

    /// <summary>
    /// Function evaluated before searching for a letter's nearest neighbour. When it returns false no search is done.
    /// <para>Default value returns false when the current letter is a white space.</para>
    /// </summary>
    public Func<Letter, bool> FilterPivot { get; set; } = l => !string.IsNullOrWhiteSpace(l.Value);

    /// <summary>
    /// When true, letters are grouped by <see cref="TextOrientation"/> before processing: DistanceMeasureAA is
    /// used for axis aligned letters and DistanceMeasure for the others.
    /// When false, DistanceMeasure is used for every letter and DistanceMeasureAA is ignored.
    /// <para>Default value is true.</para>
    /// </summary>
    public bool GroupByOrientation { get; set; } = true;
}
}
}
}

View File

@@ -0,0 +1,20 @@
namespace UglyToad.PdfPig.Tests.Dla
{
    using System;
    using System.IO;

    /// <summary>
    /// Helper for locating the documents used by the document layout analysis tests.
    /// </summary>
    internal static class DlaHelper
    {
        /// <summary>
        /// Builds the full path of a test document located in the "Dla/Documents" folder, resolved three
        /// levels above the application's base directory.
        /// </summary>
        /// <param name="name">The document's file name, with or without the ".pdf" extension.</param>
        /// <param name="isPdf">If true, ".pdf" is appended when <paramref name="name"/> does not already end with it
        /// (compared ignoring case). If false, the name is used as is.</param>
        /// <returns>The full path to the document.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="name"/> is null.</exception>
        public static string GetDocumentPath(string name, bool isPdf = true)
        {
            if (name == null)
            {
                throw new ArgumentNullException(nameof(name));
            }

            var documentFolder = Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Dla", "Documents"));

            // Ordinal, case-insensitive: the extension check must not depend on the current culture,
            // and "file.PDF" must not become "file.PDF.pdf".
            if (isPdf && !name.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
            {
                name += ".pdf";
            }

            return Path.Combine(documentFolder, name);
        }
    }
}

View File

@@ -0,0 +1,95 @@
namespace UglyToad.PdfPig.Tests.Dla
{
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
using Xunit;
public class DocstrumBoundingBoxesTests
{
public static IEnumerable<object[]> DataExtract => new[]
{
new object[]
{
"complex rotated.pdf",
new string[]
{
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Suspendisse at ullamcorper libero. Cras sit amet dui laoreet tellus tristique commodo. Nam pretium id ligula ac malesuada. Mauris at lacinia magna. Curabitur ex lectus, lobortis lobortis turpis ac, congue aliquet quam. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Etiam consectetur sem et ex sagittis pretium. Praesent urna velit, mollis vitae ex vel, hendrerit finibus mauris. In at luctus orci. Nunc odio justo, rhoncus et euismod nec, bibendum vitae lacus. Aenean maximus sapien lacus, ut pellentesque tellus egestas eget. Nulla semper massa ut vehicula faucibus. Nam rhoncus, dolor consectetur pulvinar gravida, nisi sem luctus nibh, non venenatis nunc lorem et velit.",
"Morbi euismod mattis libero, nec porta neque aliquam et. Nunc sed felis id libero tincidunt malesuada et laoreet orci. Phasellus massa libero, cursus imperdiet rhoncus quis, consequat eu eros. Nullam imperdiet felis sed ligula faucibus bibendum. Vestibulum rhoncus metus eu congue cursus. Maecenas vulputate dignissim dolor a iaculis. Praesent vel diam congue, dapibus lorem nec, viverra dolor. Vestibulum quis odio a risus semper aliquam.",
"Mauris tincidunt massa id lorem consectetur, in vestibulum nibh feugiat. Quisque eget commodo tortor. Duis iaculis, urna eget porttitor consectetur, metus lacus tempus urna, vehicula facilisis quam est scelerisque dui. Vestibulum imperdiet, tellus vel vulputate pretium, dolor mauris aliquet erat, sit amet fringilla nisi dui ut felis.",
"Cras gravida vel risus sit amet sagittis. Vestibulum et purus pretium, accumsan turpis ac, consectetur augue. Nam viverra purus in urna mollis eleifend. Donec non imperdiet justo. In commodo tortor in diam feugiat, eget placerat augue posuere. Donec justo arcu, rutrum in massa quis, dictum condimentum risus. Nunc euismod et dolor at elementum. Duis pretium risus rhoncus mauris pulvinar, vel semper elit tempus. Quisque imperdiet, odio et hendrerit laoreet, justo dolor blandit sapien, ut mollis risus elit sit amet lacus. Vivamus id tortor eleifend, gravida tortor vitae, dignissim mauris. Integer efficitur ac neque id venenatis. Suspendisse pharetra neque sit amet ornare convallis. Sed eget eros dignissim risus eleifend elementum. Duis non bibendum ipsum.",
"Morbi euismod mattis libero, nec porta neque aliquam et. Nunc sed felis id libero tincidunt malesuada et laoreet orci. Phasellus massa libero, cursus imperdiet rhoncus quis, consequat eu eros. Nullam imperdiet felis sed ligula faucibus bibendum. Vestibulum rhoncus metus eu congue cursus. Maecenas vulputate dignissim dolor a iaculis. Praesent vel diam congue, dapibus lorem nec, viverra dolor. Vestibulum quis odio a risus semper aliquam."
}
},
new object[]
{
"90 180 270 rotated.pdf",
new string[]
{
"Morbi euismod mattis libero, nec porta neque aliquam et. Nunc sed felis id libero tincidunt malesuada et laoreet orci. Phasellus massa libero, cursus imperdiet rhoncus quis, consequat eu eros. Nullam imperdiet felis sed ligula faucibus bibendum. Vestibulum rhoncus metus eu congue cursus. Maecenas vulputate dignissim dolor a iaculis. Praesent vel diam congue, dapibus lorem nec, viverra dolor. Vestibulum quis odio a risus semper aliquam.",
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Suspendisse at ullamcorper libero. Cras sit amet dui laoreet tellus tristique commodo. Nam pretium id ligula ac malesuada. Mauris at lacinia magna. Curabitur ex lectus, lobortis lobortis turpis ac, congue aliquet quam. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Etiam consectetur sem et ex sagittis pretium. Praesent urna velit, mollis vitae ex vel, hendrerit finibus mauris. In at luctus orci. Nunc odio justo, rhoncus et euismod nec, bibendum vitae lacus. Aenean maximus sapien lacus, ut pellentesque tellus egestas eget. Nulla semper massa ut vehicula faucibus. Nam rhoncus, dolor consectetur pulvinar gravida, nisi sem luctus nibh, non venenatis nunc lorem et velit.",
"Cras gravida vel risus sit amet sagittis. Vestibulum et purus pretium, accumsan turpis ac, consectetur augue. Nam viverra purus in urna mollis eleifend. Donec non imperdiet justo. In commodo tortor in diam feugiat, eget placerat augue posuere. Donec justo arcu, rutrum in massa quis, dictum condimentum risus. Nunc euismod et dolor at elementum. Duis pretium risus rhoncus mauris pulvinar, vel semper elit tempus. Quisque imperdiet, odio et hendrerit laoreet, justo dolor blandit sapien, ut mollis risus elit sit amet lacus. Vivamus id tortor eleifend, gravida tortor vitae, dignissim mauris. Integer efficitur ac neque id venenatis. Suspendisse pharetra neque sit amet ornare convallis. Sed eget eros dignissim risus eleifend elementum. Duis non bibendum ipsum.",
}
},
new object[]
{
"Random 2 Columns Lists Hyph - Justified.pdf",
new string[]
{
"Random Big Title",
"Lorem Ipsum text with lists Lorem ipsum dolor sit amet, consectetur adipiscing elit. In sodales gravida felis, in rhoncus velit rutrum at. Curabitur hendrerit dapibus nulla, ut hendrerit diam imperdiet quis. Pellentesque id neque ali- quam, pulvinar neque in, vulputate elit. Pel- lentesque ut erat sit amet massa suscipit ullamcor- per. Sed porttitor viverra convallis. Duis vitae sem- per metus. Pellentesque eros purus, egestas eget velit eget, elementum aliquet velit. Suspendisse potenti. Nulla vitae massa rutrum, blandit erat vi- tae, aliquet arcu.",
"Aenean feugiat leo sed enim sodales vehicula. Sus- pendisse tempus hendrerit magna sagittis dictum. Duis ultrices dapibus egestas. Cras eu felis eu lectus suscipit pharetra at at lacus. Nulla facilisi. Proin in- terdum faucibus elit nec rhoncus. Proin sodaless metus sed tincidunt hendrerit.",
"Donec ultricies cursus odio sed rutrum. Nam ven- enatis metus vitae elementum scelerisque. Ali- quam tempor sapien at turpis posuere eleifend. Sed placerat posuere nunc vel efficitur. Quisque auctor felis vel lectus dictum fringilla. Quisque vo- lutpat pulvinar© elit. Aliquam ultrices feugiat ali- quam. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Sus- pendisse imperdiet ex lorem, porta bibendum pu- rus ultricies id.",
"Integer vel lacus sapien. Nam sodales ante eu risus facilisis placerat. Aliquam suscipit pulvinar ultricies. Aenean pulvinar, ex ac fermentum egestas, erat nisi feugiat velit, vitae suscipit tellus odio vitae quam. Morbi elementum sem in elit posuere, non",
"•","•","•","•",
"Duis leo enim, convallis sit amet orci eget, condimentum mattis mi ; Etiam dolor erat, maximus nec mi sed, con- vallis convallis orci ; Morbi viverra diam in diam cursus, vitae aliquet velit tempus ; Donec at nisi fermentum, ultricies odio eget, egestas massa at nisi fermentum, ul- tricies odio eget, egestas massa.",
"rhoncus magna fringilla. Phasellus cursus in dolor laoreet rutrum. Curabitur tincidunt risus ullamcor- per, vehicula velit at, pulvinar metus.",
"Donec quis ante leo. Vivamus pharetra, nisl ac vehi- cula tempor, tellus lacus aliquam sapien, eu congue nibh quam sit amet odio. Quisque metus arcu, sem- per nec consequat eu, pellentesque vel sem. Sed purus risus, tincidunt¹ sit amet dictum vitae, euis- mod id nibh. Praesent ultrices libero quis enim porta, sit amet pellentesque augue pretium. Viva- mus nec molestie nunc. Donec finibus enim nec tel- lus laoreet elementum. Curabitur efficitur placerat dolor et semper.",
"Morbi laoreet dui eu tortor luctus, nec ultrices do- lor ullamcorper. Ut gravida sed nisl a efficitur. In tincidunt orci a condimentum semper. Suspendisse scelerisque fermentum lacinia. Vestibulum sit amet ornare tellus, aliquet euismod mauris. Cras suscipit venenatis ultrices. Sed diam erat, aliquet a tellus ut, viverra 12º ongue magna. Cras id justo tortor. Mauris in tortor vulputate, pellentesque nisl ac, facilisis ligula. Class aptent taciti² sociosqu ad li- tora torquent per conubia nostra³, per inceptos himenaeos. Aliquam eget dolor turpis. Mauris id molestie tellus. Sed elementum molestie nisi, at ali- quet sem vehicula nec. Morbi tempus nulla enim, a vulputate magna €51 luctus £66 eu. Fusce sodales, libero quis suscipit ultrices, metus erat auctor urna, sit amet dictum arcu tortor eu metus.",
"Morbi vestibulum varius ipsum nec molestie. Proin auctor efficitur diam ut luctus. Phasellus cursus maximus ultricies. Mauris eu neque ut sem semper tempus. Curabitur non lorem eu nunc lobortis vi- verra at in diam. Pellentesque euismod purus a leo lobortis tempor. Maecenas mollis ligula at sem sus- cipit fringilla. Mauris sollicitudin tincidunt lectus id tempor. Etiam ut nisi est.",
"1. Ut volutpat, velit at interdum consectetur, nisl lorem consequat mauris, feugiat dignissim tellus massa ut nisl. 2. Praesent at est nisi. Pellentesque rutrum lorem sed dui accumsan gravida. 3. Pellentesque dictum nisl vitae urna luctus, congue pulvinar mi congue.",
}
},
new object[]
{
"no vertical distance.pdf",
new string[]
{
"Documents second line left aligned.",
"Documents first line right aligned."
}
},
new object[]
{
"no horizontal distance.pdf",
new string[]
{
"First. Second."
}
}
};
/// <summary>
/// Opens the named document, extracts the first page's words with the nearest neighbour extractor,
/// segments them with Docstrum (lines joined by a single space) and checks the text of every block,
/// ordered by bottom-left X then by descending bottom-left Y, against <paramref name="expected"/>.
/// </summary>
[Theory]
[MemberData(nameof(DataExtract))]
public void GetBlocks(string name, string[] expected)
{
    var docstrumOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions { LineSeparator = " " };
    var documentPath = DlaHelper.GetDocumentPath(name);

    using (var pdf = PdfDocument.Open(documentPath))
    {
        var firstPage = pdf.GetPage(1);
        var extractedWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);
        var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(extractedWords, docstrumOptions);

        Assert.Equal(expected.Length, blocks.Count);

        var sortedBlocks = blocks
            .OrderBy(block => block.BoundingBox.BottomLeft.X)
            .ThenByDescending(block => block.BoundingBox.BottomLeft.Y)
            .ToList();

        var position = 0;
        foreach (var block in sortedBlocks)
        {
            Assert.Equal(expected[position], block.Text);
            position++;
        }
    }
}
}
}

View File

@@ -6727,7 +6727,6 @@
// root -> right side -> right side
Assert.Null(kdTree.Root.RightChild.RightChild);
}
[Theory]

View File

@@ -183,7 +183,7 @@
var dataFloat = data.Select(x => (float)x);
Assert.Equal((float)expected, dataFloat.Mode(), PreciseDoubleComparer);
}
[Theory]
[MemberData(nameof(ModeDataNaN))]
public void ModeNaN(double[] data)

View File

@@ -0,0 +1,56 @@
namespace UglyToad.PdfPig.Tests.Dla
{
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
using Xunit;
public class RecursiveXYCutTests
{
/// <summary>
/// Test cases consumed by <see cref="GetBlocks"/> through <c>MemberData</c>.
/// Each entry holds the document file name followed by the expected block texts.
/// The texts are listed in the order the test compares them: blocks sorted by
/// bounding box bottom-left X ascending, then bottom-left Y descending.
/// </summary>
public static IEnumerable<object[]> DataExtract => new[]
{
new object[]
{
"Random 2 Columns Lists Hyph - Justified.pdf",
new string[]
{
"Random Big Title",
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. In sodales gravida felis, in rhoncus velit rutrum at. Curabitur hendrerit dapibus nulla, ut hendrerit diam imperdiet quis. Pellentesque id neque ali- quam, pulvinar neque in, vulputate elit. Pel- lentesque ut erat sit amet massa suscipit ullamcor- per. Sed porttitor viverra convallis. Duis vitae sem- per metus. Pellentesque eros purus, egestas eget velit eget, elementum aliquet velit. Suspendisse potenti. Nulla vitae massa rutrum, blandit erat vi- tae, aliquet arcu.",
"Aenean feugiat leo sed enim sodales vehicula. Sus- pendisse tempus hendrerit magna sagittis dictum. Duis ultrices dapibus egestas. Cras eu felis eu lectus suscipit pharetra at at lacus. Nulla facilisi. Proin in- terdum faucibus elit nec rhoncus. Proin sodaless metus sed tincidunt hendrerit.",
"Donec ultricies cursus odio sed rutrum. Nam ven- enatis metus vitae elementum scelerisque. Ali- quam tempor sapien at turpis posuere eleifend. Sed placerat posuere nunc vel efficitur. Quisque auctor felis vel lectus dictum fringilla. Quisque vo- lutpat pulvinar© elit. Aliquam ultrices feugiat ali- quam. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Sus- pendisse imperdiet ex lorem, porta bibendum pu- rus ultricies id.",
"Integer vel lacus sapien. Nam sodales ante eu risus facilisis placerat. Aliquam suscipit pulvinar ultricies. Aenean pulvinar, ex ac fermentum egestas, erat nisi feugiat velit, vitae suscipit tellus odio vitae quam. Morbi elementum sem in elit posuere, non",
"• Duis leo enim, convallis sit amet orci eget, condimentum mattis mi ; • Etiam dolor erat, maximus nec mi sed, con- vallis convallis orci ; • Morbi viverra diam in diam cursus, vitae aliquet velit tempus ; • Donec at nisi fermentum, ultricies odio eget, egestas massa at nisi fermentum, ul- tricies odio eget, egestas massa.",
"Lorem Ipsum text with lists",
"rhoncus magna fringilla. Phasellus cursus in dolor laoreet rutrum. Curabitur tincidunt risus ullamcor- per, vehicula velit at, pulvinar metus.",
"Donec quis ante leo. Vivamus pharetra, nisl ac vehi- cula tempor, tellus lacus aliquam sapien, eu congue nibh quam sit amet odio. Quisque metus arcu, sem- per nec consequat eu, pellentesque vel sem. Sed purus risus, tincidunt¹ sit amet dictum vitae, euis- mod id nibh. Praesent ultrices libero quis enim porta, sit amet pellentesque augue pretium. Viva- mus nec molestie nunc. Donec finibus enim nec tel- lus laoreet elementum. Curabitur efficitur placerat dolor et semper.",
"Morbi laoreet dui eu tortor luctus, nec ultrices do- lor ullamcorper. Ut gravida sed nisl a efficitur. In tincidunt orci a condimentum semper. Suspendisse scelerisque fermentum lacinia. Vestibulum sit amet ornare tellus, aliquet euismod mauris. Cras suscipit venenatis ultrices. Sed diam erat, aliquet a tellus ut, viverra 12º ongue magna. Cras id justo tortor. Mauris in tortor vulputate, pellentesque nisl ac, facilisis ligula. Class aptent taciti² sociosqu ad li- tora torquent per conubia nostra³, per inceptos himenaeos. Aliquam eget dolor turpis. Mauris id molestie tellus. Sed elementum molestie nisi, at ali- quet sem vehicula nec. Morbi tempus nulla enim, a vulputate magna €51 luctus £66 eu. Fusce sodales, libero quis suscipit ultrices, metus erat auctor urna, sit amet dictum arcu tortor eu metus.",
"Morbi vestibulum varius ipsum nec molestie. Proin auctor efficitur diam ut luctus. Phasellus cursus maximus ultricies. Mauris eu neque ut sem semper tempus. Curabitur non lorem eu nunc lobortis vi- verra at in diam. Pellentesque euismod purus a leo lobortis tempor. Maecenas mollis ligula at sem sus- cipit fringilla. Mauris sollicitudin tincidunt lectus id tempor. Etiam ut nisi est.",
"1. Ut volutpat, velit at interdum consectetur, nisl lorem consequat mauris, feugiat dignissim tellus massa ut nisl. 2. Praesent at est nisi. Pellentesque rutrum lorem sed dui accumsan gravida. 3. Pellentesque dictum nisl vitae urna luctus, congue pulvinar mi congue.",
}
}
};
[Theory]
[MemberData(nameof(DataExtract))]
public void GetBlocks(string name, string[] expected)
{
    // Segments the first page of the named document with the recursive XY cut and
    // checks the resulting block texts against the expected ones, one by one.
    using (var pdf = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
    {
        var firstPage = pdf.GetPage(1);
        var extractedWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);

        // The minimum width is tied to the page: one third of its width.
        var cutOptions = new RecursiveXYCut.RecursiveXYCutOptions()
        {
            MinimumWidth = firstPage.Width / 3.0,
            LineSeparator = " "
        };
        var segmented = RecursiveXYCut.Instance.GetBlocks(extractedWords, cutOptions);

        Assert.Equal(expected.Length, segmented.Count);

        // The expected texts are compared in this order: bottom-left X ascending,
        // then bottom-left Y descending.
        var sorted = segmented
            .OrderBy(block => block.BoundingBox.BottomLeft.X)
            .ThenByDescending(block => block.BoundingBox.BottomLeft.Y)
            .ToList();

        var position = 0;
        foreach (var block in sorted)
        {
            Assert.Equal(expected[position], block.Text);
            position++;
        }
    }
}
}
}

View File

@@ -90,5 +90,21 @@
Assert.False(document.TryGetBookmarks(out _));
}
}
[Fact]
public void StartXRefNotNearEnd()
{
    // Appends 2026 extra bytes (a line feed followed by zeros) after the real end of
    // the file, then checks the document still opens with lenient parsing off.
    // NOTE(review): presumably 2026 is chosen to push "startxref" outside the
    // default end-of-file search window - confirm against the xref offset finder.
    var fileContent = File.ReadAllBytes(GetFilename());

    var padding = new byte[2026];
    padding[0] = 10; // '\n'

    var padded = fileContent.Concat(padding).ToArray();

    using (var pdf = PdfDocument.Open(padded, ParsingOptions.LenientParsingOff))
    {
        Assert.Equal(1, pdf.NumberOfPages);
    }
}
}
}

View File

@@ -79,7 +79,7 @@
"UglyToad.PdfPig.Content.PageSize",
"UglyToad.PdfPig.Content.PageTreeNode",
"UglyToad.PdfPig.Content.Word",
"UglyToad.PdfPig.Content.TextDirection",
"UglyToad.PdfPig.Content.TextOrientation",
"UglyToad.PdfPig.Content.XmpMetadata",
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
"UglyToad.PdfPig.CrossReference.CrossReferenceType",

View File

@@ -108,4 +108,25 @@
<ProjectReference Include="..\UglyToad.PdfPig\UglyToad.PdfPig.csproj" />
</ItemGroup>
<ItemGroup>
<!-- Document layout analysis (Dla) sample PDFs: copied to the output directory whenever the source file is newer (PreserveNewest). -->
<None Update="Dla\Documents\90 180 270 rotated.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Dla\Documents\complex rotated.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Dla\Documents\no horizontal distance.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Dla\Documents\no vertical distance.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Dla\Documents\no vertical horizontal distance.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Dla\Documents\Random 2 Columns Lists Hyph - Justified.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>
</Project>

View File

@@ -15,9 +15,9 @@
public string Value { get; }
/// <summary>
/// Text direction of the letter.
/// Text orientation of the letter.
/// </summary>
public TextDirection TextDirection { get; }
public TextOrientation TextOrientation { get; }
/// <summary>
/// The placement position of the character in PDF space. See <see cref="StartBaseLine"/>
@@ -100,32 +100,32 @@
Color = color ?? GrayColor.Black;
PointSize = pointSize;
TextSequence = textSequence;
TextDirection = GetTextDirection();
TextOrientation = GetTextOrientation();
}
private TextDirection GetTextDirection()
private TextOrientation GetTextOrientation()
{
if (System.Math.Abs(StartBaseLine.Y - EndBaseLine.Y) < 10e-5)
{
if (StartBaseLine.X > EndBaseLine.X)
{
return TextDirection.Rotate180;
return TextOrientation.Rotate180;
}
return TextDirection.Horizontal;
return TextOrientation.Horizontal;
}
if (System.Math.Abs(StartBaseLine.X - EndBaseLine.X) < 10e-5)
{
if (StartBaseLine.Y > EndBaseLine.Y)
{
return TextDirection.Rotate90;
return TextOrientation.Rotate90;
}
return TextDirection.Rotate270;
return TextOrientation.Rotate270;
}
return TextDirection.Other;
return TextOrientation.Other;
}
/// <summary>

View File

@@ -1,17 +1,17 @@
namespace UglyToad.PdfPig.Content
{
/// <summary>
/// Direction of the text.
/// Orientation of the text.
/// </summary>
public enum TextDirection : byte
public enum TextOrientation : byte
{
/// <summary>
/// Other text direction.
/// Other text orientation.
/// </summary>
Other = 0,
/// <summary>
/// Usual text direction (Left to Right).
/// Usual text orientation (Left to Right).
/// </summary>
Horizontal = 1,

View File

@@ -5,7 +5,6 @@
using System.Collections.Generic;
using System.Linq;
using System.Text;
using UglyToad.PdfPig.Geometry;
/// <summary>
/// A word.
@@ -18,9 +17,9 @@
public string Text { get; }
/// <summary>
/// The text direction of the word.
/// The text orientation of the word.
/// </summary>
public TextDirection TextDirection { get; }
public TextOrientation TextOrientation { get; }
/// <summary>
/// The rectangle completely containing the word.
@@ -40,7 +39,7 @@
/// <summary>
/// Create a new <see cref="Word"/>.
/// </summary>
/// <param name="letters">The letters contained in the word.</param>
/// <param name="letters">The letters contained in the word, in the correct order.</param>
public Word(IReadOnlyList<Letter> letters)
{
if (letters == null)
@@ -55,40 +54,39 @@
Letters = letters;
var tempTextDirection = letters[0].TextDirection;
if (tempTextDirection != TextDirection.Other)
var tempTextOrientation = letters[0].TextOrientation;
if (tempTextOrientation != TextOrientation.Other)
{
foreach (var letter in letters)
{
if (letter.TextDirection != tempTextDirection)
if (letter.TextOrientation != tempTextOrientation)
{
tempTextDirection = TextDirection.Other;
tempTextOrientation = TextOrientation.Other;
break;
}
}
}
Tuple<string, PdfRectangle> data;
switch (tempTextDirection)
switch (tempTextOrientation)
{
case TextDirection.Horizontal:
case TextOrientation.Horizontal:
data = GetBoundingBoxH(letters);
break;
case TextDirection.Rotate180:
case TextOrientation.Rotate180:
data = GetBoundingBox180(letters);
break;
case TextDirection.Rotate90:
case TextOrientation.Rotate90:
data = GetBoundingBox90(letters);
break;
case TextDirection.Rotate270:
case TextOrientation.Rotate270:
data = GetBoundingBox270(letters);
break;
case TextDirection.Other:
case TextOrientation.Other:
default:
data = GetBoundingBoxOther(letters);
break;
@@ -98,7 +96,7 @@
BoundingBox = data.Item2;
FontName = letters[0].FontName;
TextDirection = tempTextDirection;
TextOrientation = tempTextOrientation;
}
#region Bounding box
@@ -106,162 +104,160 @@
{
var builder = new StringBuilder();
var minX = double.MaxValue;
var maxX = double.MinValue;
var minY = double.MaxValue;
var maxY = double.MinValue;
var blX = double.MaxValue;
var trX = double.MinValue;
var blY = double.MaxValue;
var trY = double.MinValue;
for (var i = 0; i < letters.Count; i++)
{
var letter = letters[i];
builder.Append(letter.Value);
if (letter.StartBaseLine.X < minX)
if (letter.StartBaseLine.X < blX)
{
minX = letter.StartBaseLine.X;
blX = letter.StartBaseLine.X;
}
if (letter.StartBaseLine.Y < minY)
if (letter.StartBaseLine.Y < blY)
{
minY = letter.StartBaseLine.Y;
blY = letter.StartBaseLine.Y;
}
var right = letter.StartBaseLine.X + Math.Max(letter.Width, letter.GlyphRectangle.Width);
if (right > maxX)
if (right > trX)
{
maxX = right;
trX = right;
}
if (letter.GlyphRectangle.Top > maxY)
if (letter.GlyphRectangle.TopLeft.Y > trY)
{
maxY = letter.GlyphRectangle.Top;
trY = letter.GlyphRectangle.TopLeft.Y;
}
}
return new Tuple<string, PdfRectangle>(builder.ToString(), new PdfRectangle(minX, minY, maxX, maxY));
return new Tuple<string, PdfRectangle>(builder.ToString(), new PdfRectangle(blX, blY, trX, trY));
}
private Tuple<string, PdfRectangle> GetBoundingBox180(IReadOnlyList<Letter> letters)
{
var builder = new StringBuilder();
var minX = double.MaxValue;
var maxX = double.MinValue;
var maxY = double.MinValue;
var minY = double.MaxValue;
var blX = double.MinValue;
var blY = double.MinValue;
var trX = double.MaxValue;
var trY = double.MaxValue;
for (var i = 0; i < letters.Count; i++)
{
var letter = letters[i];
builder.Append(letter.Value);
if (letter.StartBaseLine.X > maxX)
if (letter.StartBaseLine.X > blX)
{
maxX = letter.StartBaseLine.X;
blX = letter.StartBaseLine.X;
}
if (letter.StartBaseLine.Y > maxY)
if (letter.StartBaseLine.Y > blY)
{
maxY = letter.StartBaseLine.Y;
blY = letter.StartBaseLine.Y;
}
var right = letter.StartBaseLine.X + Math.Min(letter.Width, letter.GlyphRectangle.Width);
if (right < minX)
var right = letter.StartBaseLine.X - Math.Max(letter.Width, letter.GlyphRectangle.Width);
if (right < trX)
{
minX = right;
trX = right;
}
if (letter.GlyphRectangle.Top < minY)
if (letter.GlyphRectangle.TopRight.Y < trY)
{
minY = letter.GlyphRectangle.Top;
trY = letter.GlyphRectangle.TopRight.Y;
}
}
return new Tuple<string, PdfRectangle>(builder.ToString(), new PdfRectangle(maxX, maxY, minX, minY));
return new Tuple<string, PdfRectangle>(builder.ToString(), new PdfRectangle(blX, blY, trX, trY));
}
private Tuple<string, PdfRectangle> GetBoundingBox90(IReadOnlyList<Letter> letters)
{
var builder = new StringBuilder();
var minX = double.MaxValue;
var maxX = double.MinValue;
var minY = double.MaxValue;
var maxY = double.MinValue;
var b = double.MaxValue;
var r = double.MaxValue;
var t = double.MinValue;
var l = double.MinValue;
for (var i = 0; i < letters.Count; i++)
{
var letter = letters[i];
builder.Append(letter.Value);
if (letter.StartBaseLine.X < minX)
if (letter.StartBaseLine.X < b)
{
minX = letter.StartBaseLine.X;
b = letter.StartBaseLine.X;
}
if (letter.EndBaseLine.Y < minY)
if (letter.EndBaseLine.Y < r)
{
minY = letter.EndBaseLine.Y;
r = letter.EndBaseLine.Y;
}
var right = letter.StartBaseLine.X - letter.GlyphRectangle.Height;
if (right > maxX)
var right = letter.StartBaseLine.X + letter.GlyphRectangle.Height;
if (right > t)
{
maxX = right;
t = right;
}
if (letter.GlyphRectangle.Top > maxY)
if (letter.GlyphRectangle.BottomLeft.Y > l)
{
maxY = letter.GlyphRectangle.Top;
l = letter.GlyphRectangle.BottomLeft.Y;
}
}
return new Tuple<string, PdfRectangle>(builder.ToString(), new PdfRectangle(new PdfPoint(maxX, maxY),
new PdfPoint(maxX, minY),
new PdfPoint(minX, maxY),
new PdfPoint(minX, minY)));
return new Tuple<string, PdfRectangle>(builder.ToString(), new PdfRectangle(
new PdfPoint(t, l), new PdfPoint(t, r),
new PdfPoint(b, l), new PdfPoint(b, r)));
}
private Tuple<string, PdfRectangle> GetBoundingBox270(IReadOnlyList<Letter> letters)
{
var builder = new StringBuilder();
var minX = double.MaxValue;
var maxX = double.MinValue;
var minY = double.MaxValue;
var maxY = double.MinValue;
var t = double.MaxValue;
var b = double.MinValue;
var l = double.MaxValue;
var r = double.MinValue;
for (var i = 0; i < letters.Count; i++)
{
var letter = letters[i];
builder.Append(letter.Value);
if (letter.StartBaseLine.X > maxX)
if (letter.StartBaseLine.X > b)
{
maxX = letter.StartBaseLine.X;
b = letter.StartBaseLine.X;
}
if (letter.StartBaseLine.Y < minY)
if (letter.StartBaseLine.Y < l)
{
minY = letter.StartBaseLine.Y;
l = letter.StartBaseLine.Y;
}
var right = letter.StartBaseLine.X - letter.GlyphRectangle.Height;
if (right < minX)
if (right < t)
{
minX = right;
t = right;
}
if (letter.GlyphRectangle.Bottom > maxY)
if (letter.GlyphRectangle.BottomRight.Y > r)
{
maxY = letter.GlyphRectangle.Bottom;
r = letter.GlyphRectangle.BottomRight.Y;
}
}
return new Tuple<string, PdfRectangle>(builder.ToString(), new PdfRectangle(new PdfPoint(minX, minY),
new PdfPoint(minX, maxY),
new PdfPoint(maxX, minY),
new PdfPoint(maxX, maxY)));
return new Tuple<string, PdfRectangle>(builder.ToString(), new PdfRectangle(
new PdfPoint(t, l), new PdfPoint(t, r),
new PdfPoint(b, l), new PdfPoint(b, r)));
}
private Tuple<string, PdfRectangle> GetBoundingBoxOther(IReadOnlyList<Letter> letters)
@@ -272,97 +268,121 @@
builder.Append(letters[i].Value);
}
var baseLinePoints = letters.SelectMany(r => new[]
if (letters.Count == 1)
{
r.StartBaseLine,
r.EndBaseLine,
}).ToList();
// Fitting a line through the base lines points
// to find the orientation (slope)
double x0 = baseLinePoints.Average(p => p.X);
double y0 = baseLinePoints.Average(p => p.Y);
double sumProduct = 0;
double sumDiffSquaredX = 0;
for (int i = 0; i < baseLinePoints.Count; i++)
{
var point = baseLinePoints[i];
var x_diff = point.X - x0;
var y_diff = point.Y - y0;
sumProduct += x_diff * y_diff;
sumDiffSquaredX += x_diff * x_diff;
return new Tuple<string, PdfRectangle>(builder.ToString(), letters[0].GlyphRectangle);
}
var slope = sumProduct / sumDiffSquaredX;
// Rotate the points to build the axis-aligned bounding box (AABB)
var angleRad = Math.Atan(slope);
var cos = Math.Cos(angleRad);
var sin = Math.Sin(angleRad);
var inverseRotation = new TransformationMatrix(
cos, -sin, 0,
sin, cos, 0,
0, 0, 1);
var transformedPoints = letters.SelectMany(r => new[]
else
{
r.StartBaseLine,
r.EndBaseLine,
r.GlyphRectangle.TopLeft,
r.GlyphRectangle.TopRight
}).Distinct().Select(p => inverseRotation.Transform(p));
var aabb = new PdfRectangle(transformedPoints.Min(p => p.X),
transformedPoints.Min(p => p.Y),
transformedPoints.Max(p => p.X),
transformedPoints.Max(p => p.Y));
var baseLinePoints = letters.SelectMany(r => new[]
{
r.StartBaseLine,
r.EndBaseLine,
}).ToList();
// Rotate back the AABB to obtain to oriented bounding box (OBB)
var rotateBack = new TransformationMatrix(
cos, sin, 0,
-sin, cos, 0,
0, 0, 1);
// Fitting a line through the base lines points
// to find the orientation (slope)
double x0 = baseLinePoints.Average(p => p.X);
double y0 = baseLinePoints.Average(p => p.Y);
double sumProduct = 0;
double sumDiffSquaredX = 0;
// Candidates bounding boxes
var obb = rotateBack.Transform(aabb);
var obb1 = new PdfRectangle(obb.BottomLeft, obb.TopLeft, obb.BottomRight, obb.TopRight);
var obb2 = new PdfRectangle(obb.TopRight, obb.BottomRight, obb.TopLeft, obb.BottomLeft);
var obb3 = new PdfRectangle(obb.BottomRight, obb.BottomLeft, obb.TopRight, obb.TopLeft);
for (int i = 0; i < baseLinePoints.Count; i++)
{
var point = baseLinePoints[i];
var x_diff = point.X - x0;
var y_diff = point.Y - y0;
sumProduct += x_diff * y_diff;
sumDiffSquaredX += x_diff * x_diff;
}
// Find the orientation of the OBB, using the baseline angle
var firstLetter = letters[0];
var lastLetter = letters[letters.Count - 1];
double cos = 0;
double sin = 1;
if (sumDiffSquaredX > 1e-3)
{
// not vertical line
double angleRad = Math.Atan(sumProduct / sumDiffSquaredX); // -π/2 ≤ θ ≤ π/2
cos = Math.Cos(angleRad);
sin = Math.Sin(angleRad);
}
var baseLineAngle = Math.Atan2(
lastLetter.EndBaseLine.Y - firstLetter.StartBaseLine.Y,
lastLetter.EndBaseLine.X - firstLetter.StartBaseLine.X) * 180 / Math.PI;
// Rotate the points to build the axis-aligned bounding box (AABB)
var inverseRotation = new TransformationMatrix(
cos, -sin, 0,
sin, cos, 0,
0, 0, 1);
double deltaAngle = Math.Abs(baseLineAngle - obb.Rotation);
double deltaAngle1 = Math.Abs(baseLineAngle - obb1.Rotation);
if (deltaAngle1 < deltaAngle)
{
deltaAngle = deltaAngle1;
obb = obb1;
var transformedPoints = letters.SelectMany(r => new[]
{
r.StartBaseLine,
r.EndBaseLine,
r.GlyphRectangle.TopLeft,
r.GlyphRectangle.TopRight
}).Distinct().Select(p => inverseRotation.Transform(p));
var aabb = new PdfRectangle(transformedPoints.Min(p => p.X),
transformedPoints.Min(p => p.Y),
transformedPoints.Max(p => p.X),
transformedPoints.Max(p => p.Y));
// Rotate back the AABB to obtain the oriented bounding box (OBB)
var rotateBack = new TransformationMatrix(
cos, sin, 0,
-sin, cos, 0,
0, 0, 1);
// Candidates bounding boxes
var obb = rotateBack.Transform(aabb);
var obb1 = new PdfRectangle(obb.BottomLeft, obb.TopLeft, obb.BottomRight, obb.TopRight);
var obb2 = new PdfRectangle(obb.BottomRight, obb.BottomLeft, obb.TopRight, obb.TopLeft);
var obb3 = new PdfRectangle(obb.TopRight, obb.BottomRight, obb.TopLeft, obb.BottomLeft);
// Find the orientation of the OBB, using the baseline angle
// Assumes word order is correct
var firstLetter = letters[0];
var lastLetter = letters[letters.Count - 1];
var baseLineAngle = Math.Atan2(
lastLetter.EndBaseLine.Y - firstLetter.StartBaseLine.Y,
lastLetter.EndBaseLine.X - firstLetter.StartBaseLine.X) * 180 / Math.PI;
double deltaAngle = Math.Abs(BoundAngle180(obb.Rotation - baseLineAngle));
double deltaAngle1 = Math.Abs(BoundAngle180(obb1.Rotation - baseLineAngle));
if (deltaAngle1 < deltaAngle)
{
deltaAngle = deltaAngle1;
obb = obb1;
}
double deltaAngle2 = Math.Abs(BoundAngle180(obb2.Rotation - baseLineAngle));
if (deltaAngle2 < deltaAngle)
{
deltaAngle = deltaAngle2;
obb = obb2;
}
double deltaAngle3 = Math.Abs(BoundAngle180(obb3.Rotation - baseLineAngle));
if (deltaAngle3 < deltaAngle)
{
obb = obb3;
}
return new Tuple<string, PdfRectangle>(builder.ToString(), obb);
}
double deltaAngle2 = Math.Abs(baseLineAngle - obb2.Rotation);
if (deltaAngle2 < deltaAngle)
{
deltaAngle = deltaAngle2;
obb = obb2;
}
double deltaAngle3 = Math.Abs(baseLineAngle - obb3.Rotation);
if (deltaAngle3 < deltaAngle)
{
obb = obb3;
}
return new Tuple<string, PdfRectangle>(builder.ToString(), obb);
}
#endregion
/// <summary>
/// Wraps an angle expressed in degrees into the range -180 ≤ θ ≤ 180 by adding or
/// removing whole turns of 360. An input of exactly 180 (or 180 plus whole turns)
/// is returned as -180.
/// </summary>
/// <param name="angle">The angle to bound, in degrees.</param>
/// <returns>The equivalent angle within the bounded range.</returns>
private static double BoundAngle180(double angle)
{
    // Shift by half a turn so the target interval starts at zero.
    double shifted = (angle + 180) % 360;

    // The remainder keeps the sign of the dividend, so bring negatives back up.
    if (shifted < 0)
    {
        shifted += 360;
    }

    // Undo the half-turn shift.
    return shifted - 180;
}
/// <inheritdoc />
public override string ToString()
{

View File

@@ -2843,7 +2843,7 @@ namespace UglyToad.PdfPig.Geometry.ClipperLibrary
}
//------------------------------------------------------------------------------
/// <summary>
///
/// Clean polygon.
/// </summary>
/// <param name="path"></param>
/// <param name="distance">proximity in units/pixels below which vertices will be stripped.
@@ -2909,7 +2909,7 @@ namespace UglyToad.PdfPig.Geometry.ClipperLibrary
//------------------------------------------------------------------------------
/// <summary>
///
/// Clean polygon.
/// </summary>
/// <param name="polys"></param>
/// <param name="distance">proximity in units/pixels below which vertices will be stripped.

View File

@@ -38,7 +38,7 @@
(byte) 'e',
(byte) 'f'
};
public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
{
if (bytes == null)
@@ -55,10 +55,6 @@
var offsetFromEnd = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange;
var startPosition = fileLength - offsetFromEnd;
bytes.Seek(startPosition);
var startXrefPosition = GetStartXrefPosition(bytes, offsetFromEnd);
scanner.Seek(startXrefPosition);
@@ -96,38 +92,48 @@
var startXrefs = new List<int>();
var index = 0;
var offset = 0;
// Starting scanning the last 1024 bytes.
while (bytes.MoveNext())
var fileLength = bytes.Length;
var multiple = 1;
var actualStartOffset = Math.Max(0, fileLength - (offsetFromEnd * multiple));
do
{
offset++;
if (bytes.CurrentByte == StartXRefBytes[index])
multiple *= 2;
bytes.Seek(actualStartOffset);
// Start scanning the file bytes.
while (bytes.MoveNext())
{
// We might be reading "startxref".
index++;
}
else
{
index = 0;
if (bytes.CurrentByte == StartXRefBytes[index])
{
// We might be reading "startxref".
index++;
}
else
{
index = 0;
}
if (index == StartXRefBytes.Length)
{
// Add this "startxref" (position from the start of the document to the first 's').
startXrefs.Add((int)bytes.CurrentOffset - StartXRefBytes.Length);
// Continue scanning in case there are further "startxref"s. Not sure if this ever happens.
index = 0;
}
}
if (index == StartXRefBytes.Length)
{
// Add this "startxref" (position from the end of the document to the first 's').
startXrefs.Add(offsetFromEnd - (offset - StartXRefBytes.Length));
// Continue scanning in case there are further "startxref"s. Not sure if this ever happens.
index = 0;
}
}
actualStartOffset = Math.Max(0, fileLength - (offsetFromEnd * multiple));
} while (startXrefs.Count == 0 && actualStartOffset > 0);
if (startXrefs.Count == 0)
{
throw new PdfDocumentFormatException("Could not find the startxref within the last 1024 characters.");
throw new PdfDocumentFormatException($"Could not find the startxref within the last {offsetFromEnd} characters.");
}
return bytes.Length - startXrefs[startXrefs.Count - 1];
return startXrefs[startXrefs.Count - 1];
}
}
}

View File

@@ -74,9 +74,9 @@
var nextIsWhiteSpace = string.IsNullOrWhiteSpace(letter.Value);
var nextFontDiffers = !string.Equals(letter.FontName, lastLetter.FontName, StringComparison.OrdinalIgnoreCase) && gap > letter.Width * 0.1;
var nextFontSizeDiffers = Math.Abs(letter.FontSize - lastLetter.FontSize) > 0.1;
var nextTextDirectionDiffers = letter.TextDirection != lastLetter.TextDirection;
var nextTextOrientationDiffers = letter.TextOrientation != lastLetter.TextOrientation;
if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextDirectionDiffers)
if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers)
{
if (lettersSoFar.Count > 0)
{