mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-23 04:36:44 +08:00
Document Layout Analysis - IPageSegmenter, Docstrum
- Create a TextBlock class - Creates IPageSegmenter - Add other useful distances: angle, etc. - Update RecursiveXYCut - With IPageSegmenter and TextBlock - Make XYNode and XYLeaf internal - Optimise (faster) NearestNeighbourWordExtractor and isolate the clustering algorithms for use outside of this class - Implement a Docstrum inspired page segmentation algorithm
This commit is contained in:
@@ -54,17 +54,19 @@
|
|||||||
"UglyToad.PdfPig.Content.PageSize",
|
"UglyToad.PdfPig.Content.PageSize",
|
||||||
"UglyToad.PdfPig.Content.Word",
|
"UglyToad.PdfPig.Content.Word",
|
||||||
"UglyToad.PdfPig.Content.TextLine",
|
"UglyToad.PdfPig.Content.TextLine",
|
||||||
|
"UglyToad.PdfPig.Content.TextBlock",
|
||||||
"UglyToad.PdfPig.Content.TextDirection",
|
"UglyToad.PdfPig.Content.TextDirection",
|
||||||
"UglyToad.PdfPig.Core.TransformationMatrix",
|
"UglyToad.PdfPig.Core.TransformationMatrix",
|
||||||
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
|
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
|
||||||
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
|
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
|
||||||
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
|
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
|
||||||
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.ClusteringAlgorithms",
|
||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
|
||||||
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBB",
|
||||||
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.IPageSegmenter",
|
||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
|
||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.NearestNeighbourWordExtractor",
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.NearestNeighbourWordExtractor",
|
||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
|
||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
|
|
||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
|
|
||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
|
||||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
|
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
|
||||||
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
|
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
|
||||||
|
68
src/UglyToad.PdfPig/Content/TextBlock.cs
Normal file
68
src/UglyToad.PdfPig/Content/TextBlock.cs
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using UglyToad.PdfPig.Geometry;
|
||||||
|
|
||||||
|
namespace UglyToad.PdfPig.Content
|
||||||
|
{
|
||||||
|
/// <summary>
/// A block of text, made of one or more text lines.
/// </summary>
public class TextBlock
{
    /// <summary>
    /// The text of the block: the text of every line, joined with a single space.
    /// </summary>
    public string Text { get; }

    /// <summary>
    /// The text direction of the block, taken from the first line.
    /// </summary>
    public TextDirection TextDirection { get; }

    /// <summary>
    /// The rectangle completely containing the block.
    /// </summary>
    public PdfRectangle BoundingBox { get; }

    /// <summary>
    /// The text lines contained in the block.
    /// </summary>
    public IReadOnlyList<TextLine> TextLines { get; }

    /// <summary>
    /// Create a new <see cref="TextBlock"/>.
    /// </summary>
    /// <param name="lines">The lines forming the block; must contain at least one line.</param>
    /// <exception cref="ArgumentNullException"><paramref name="lines"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="lines"/> is empty.</exception>
    public TextBlock(IReadOnlyList<TextLine> lines)
    {
        if (lines == null)
        {
            throw new ArgumentNullException(nameof(lines));
        }

        if (lines.Count == 0)
        {
            throw new ArgumentException("Empty lines provided.", nameof(lines));
        }

        TextLines = lines;
        Text = string.Join(" ", lines.Select(line => line.Text));

        // The block's box is the smallest rectangle enclosing every line's box.
        var boxes = lines.Select(line => line.BoundingBox).ToArray();
        BoundingBox = new PdfRectangle(
            boxes.Min(box => box.Left),
            boxes.Min(box => box.Bottom),
            boxes.Max(box => box.Right),
            boxes.Max(box => box.Top));

        TextDirection = lines[0].TextDirection;
    }

    /// <inheritdoc />
    public override string ToString() => Text;
}
|
||||||
|
}
|
@@ -0,0 +1,164 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using UglyToad.PdfPig.Geometry;
|
||||||
|
|
||||||
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||||
|
{
|
||||||
|
/// <summary>
/// Clustering Algorithms.
/// </summary>
internal class ClusteringAlgorithms
{
    /// <summary>
    /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
    /// https://en.wikipedia.org/wiki/Transitive_closure
    /// </summary>
    /// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
    /// <param name="elements">Array of elements to group.</param>
    /// <param name="distMeasure">The distance measure between two points.</param>
    /// <param name="maxDistanceFunction">The function that gives the maximum distance allowed between
    /// a pivot and its paired element for them to be in the same cluster.</param>
    /// <param name="pivotPoint">The pivot's point to use.</param>
    /// <param name="candidatesPoint">The candidates to pair point to use.</param>
    /// <param name="filterPivot">Filter to apply to the pivot point.</param>
    /// <param name="filterFinal">Filter to apply to both the pivot and the paired point.</param>
    /// <returns>The groups, each one being a set of indexes into <paramref name="elements"/>.
    /// Every index ends up in exactly one group.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="elements"/> is null.</exception>
    /// <exception cref="InvalidOperationException">Some elements could not be assigned to a group.</exception>
    internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(T[] elements,
        Func<PdfPoint, PdfPoint, double> distMeasure,
        Func<T, T, double> maxDistanceFunction,
        Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
        Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
    {
        if (elements == null)
        {
            throw new ArgumentNullException(nameof(elements));
        }

        /*************************************************************************************
         * Algorithm steps
         * 1. Find nearest neighbours indexes (done in parallel)
         *  Iterate every point (pivot) and put its nearest neighbour's index in an array
         *  e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
         *  Only considers a neighbour if it is within the maximum distance.
         *  If not within the maximum distance, index will be set to -1.
         *  NB: Given the possible asymmetry in the relationship, it is possible
         *  that if indexes[i] = j then indexes[j] != i.
         *
         * 2. Group indexes
         *  Group indexes if share neighbours in common - Transitive closure
         *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
         *  (i,j,k) will form a group and (m,n) will form another group.
         *
         * 3. Merge groups that have indexes in common - If any
         *  If there are group with indexes in common, merge them.
         *  (Could be improved and put in step 2)
         *************************************************************************************/

        int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray();
        var candidatesPoints = elements.Select(x => candidatesPoint(x)).ToList();

        // 1. Find nearest neighbours indexes.
        // Each iteration only writes its own slot of 'indexes'; the delegates are invoked from
        // Parallel.For and may therefore run concurrently.
        Parallel.For(0, elements.Length, e =>
        {
            var pivot = elements[e];
            if (!filterPivot(pivot))
            {
                return;
            }

            int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);
            if (index < 0)
            {
                // No neighbour was found: leave this element unconnected instead of
                // indexing the array with a negative value.
                return;
            }

            var paired = elements[index];
            if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
            {
                indexes[e] = index;
            }
        });

        // 2. Group indexes
        var groupedIndexes = new List<HashSet<int>>();
        var indexDone = new HashSet<int>();

        // Indexes that are the nearest neighbour of at least one element (O(1) lookups below).
        var referenced = new HashSet<int>(indexes.Where(i => i >= 0));

        for (int e = 0; e < elements.Length; e++)
        {
            int index = indexes[e];

            if (index == -1) // This element is not connected
            {
                // Check if another element index is connected to this element (nb: distance measure is asymmetric)
                if (!referenced.Contains(e))
                {
                    // If no other element is connected to this element, add it as a standalone element
                    groupedIndexes.Add(new HashSet<int> { e });
                    indexDone.Add(e);
                }

                // Otherwise it gets grouped when the element pointing to it is processed.
                continue;
            }

            bool isDoneC = indexDone.Contains(e);
            bool isDoneI = indexDone.Contains(index);

            if (!isDoneC && !isDoneI)
            {
                // Neither is grouped yet: start a new group with the pair.
                groupedIndexes.Add(new HashSet<int> { e, index });
                indexDone.Add(e);
                indexDone.Add(index);
            }
            else if (isDoneC && !isDoneI)
            {
                foreach (var group in groupedIndexes.Where(x => x.Contains(e)))
                {
                    group.Add(index);
                }

                indexDone.Add(index);
            }
            else if (!isDoneC)
            {
                foreach (var group in groupedIndexes.Where(x => x.Contains(index)))
                {
                    group.Add(e);
                }

                indexDone.Add(e);
            }
            else // isDoneC && isDoneI
            {
                // Both are already grouped, possibly in different groups: cross-add them so
                // that the groups share an index and get merged in step 3.
                foreach (var group in groupedIndexes.Where(x => x.Contains(index)))
                {
                    group.Add(e);
                }

                foreach (var group in groupedIndexes.Where(x => x.Contains(e)))
                {
                    group.Add(index);
                }
            }
        }

        // Check that all elements are done
        if (elements.Length != indexDone.Count)
        {
            throw new InvalidOperationException("ClusteringAlgorithms.SimpleTransitiveClosure(): Some elements were not done.");
        }

        // 3. Merge groups that have indexes in common
        // Check if duplicates (if duplicates, then same index in different groups)
        if (indexDone.Count != groupedIndexes.Sum(x => x.Count))
        {
            for (int e = 0; e < elements.Length; e++)
            {
                List<HashSet<int>> candidates = groupedIndexes.Where(x => x.Contains(e)).ToList();
                if (candidates.Count < 2)
                {
                    continue; // Only one group with this index
                }

                HashSet<int> merged = candidates[0];
                groupedIndexes.Remove(merged);
                for (int i = 1; i < candidates.Count; i++)
                {
                    var current = candidates[i];
                    merged.UnionWith(current);
                    groupedIndexes.Remove(current);
                }

                groupedIndexes.Add(merged);
            }
        }

        return groupedIndexes;
    }
}
|
||||||
|
}
|
@@ -47,6 +47,39 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y));
|
return (double)(Math.Abs(point1.X - point2.X) + Math.Abs(point1.Y - point2.Y));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// The angle in degrees between the horizontal axis and the line between two points.
/// </summary>
/// <param name="point1">The first point (origin of the line).</param>
/// <param name="point2">The second point (end of the line).</param>
/// <returns>The angle in degrees, as computed by <see cref="Math.Atan2(double, double)"/>
/// on the Y and X differences (point2 - point1).</returns>
public static double Angle(PdfPoint point1, PdfPoint point2)
{
    // Convert the differences to double (not float) so that no precision is lost
    // before the arctangent is computed.
    double deltaY = (double)(point2.Y - point1.Y);
    double deltaX = (double)(point2.X - point1.X);
    return Math.Atan2(deltaY, deltaX) * 180.0 / Math.PI;
}
|
||||||
|
|
||||||
|
/// <summary>
/// The absolute difference between the Y coordinates of two points.
/// </summary>
/// <param name="point1">The first point.</param>
/// <param name="point2">The second point.</param>
/// <returns>The non-negative vertical distance.</returns>
public static double Vertical(PdfPoint point1, PdfPoint point2)
{
    var deltaY = (double)(point2.Y - point1.Y);
    return Math.Abs(deltaY);
}
|
||||||
|
|
||||||
|
/// <summary>
/// The absolute difference between the X coordinates of two points.
/// </summary>
/// <param name="point1">The first point.</param>
/// <param name="point2">The second point.</param>
/// <returns>The non-negative horizontal distance.</returns>
public static double Horizontal(PdfPoint point1, PdfPoint point2)
{
    var deltaX = (double)(point2.X - point1.X);
    return Math.Abs(deltaX);
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Find the nearest point.
|
/// Find the nearest point.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
212
src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
Normal file
212
src/UglyToad.PdfPig/DocumentLayoutAnalysis/DocstrumBB.cs
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
using System;
|
||||||
|
using System.Collections.Concurrent;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using UglyToad.PdfPig.Content;
|
||||||
|
using UglyToad.PdfPig.Geometry;
|
||||||
|
|
||||||
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||||
|
{
|
||||||
|
/// <summary>
/// The Docstrum algorithm is a bottom-up page segmentation technique based on nearest-neighborhood
/// clustering of connected components extracted from the document.
/// This implementation leverages bounding boxes and does not exactly replicate the original algorithm.
/// <para>See 'The document spectrum for page layout analysis.' by L. O’Gorman.</para>
/// </summary>
public class DocstrumBB : IPageSegmenter
{
    /// <summary>
    /// A shared instance of the Docstrum for bounding boxes page segmenter, <see cref="DocstrumBB"/>.
    /// </summary>
    public static DocstrumBB Instance { get; } = new DocstrumBB();

    /// <summary>
    /// Get the blocks.
    /// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
    /// </summary>
    /// <param name="pageWords">The words in the page.</param>
    /// <returns>The text blocks found; empty if there are no non-blank words.</returns>
    public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
    {
        return GetBlocks(pageWords, -30, 30, -135, -45, 1.3);
    }

    /// <summary>
    /// Get the blocks. See original paper for more information.
    /// </summary>
    /// <param name="pageWords">The words in the page. Words with blank text are ignored.</param>
    /// <param name="wlAngleLB">Within-line lower bound angle.</param>
    /// <param name="wlAngleUB">Within-line upper bound angle.</param>
    /// <param name="blAngleLB">Between-line lower bound angle.</param>
    /// <param name="blAngleUB">Between-line upper bound angle.</param>
    /// <param name="blMultiplier">Multiplier that gives the maximum perpendicular distance between
    /// text lines for blocking. Maximum distance will be this number times the between-line
    /// distance found by the analysis.</param>
    /// <returns>The text blocks found; empty if there are no non-blank words.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pageWords"/> is null.</exception>
    public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double wlAngleLB, double wlAngleUB,
        double blAngleLB, double blAngleUB, double blMultiplier)
    {
        if (pageWords == null)
        {
            throw new ArgumentNullException(nameof(pageWords));
        }

        var pageWordsArr = pageWords.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToArray(); // remove white spaces
        if (pageWordsArr.Length == 0)
        {
            // Nothing to segment.
            return new List<TextBlock>();
        }

        var withinLineDistList = new ConcurrentBag<double[]>();
        var betweenLineDistList = new ConcurrentBag<double[]>();

        // 1. Estimate in line and between line spacing
        Parallel.For(0, pageWordsArr.Length, i =>
        {
            var word = pageWordsArr[i];

            // Within-line distance
            var pointWL = GetNearestPointData(pageWordsArr, word,
                bb => bb.BottomRight, bb => bb.BottomRight,
                bb => bb.BottomLeft, bb => bb.BottomLeft,
                wlAngleLB, wlAngleUB, Distances.Horizontal);
            if (pointWL != null)
            {
                withinLineDistList.Add(pointWL);
            }

            // Between-line distance
            var pointBL = GetNearestPointData(pageWordsArr, word,
                bb => bb.BottomLeft, bb => bb.Centroid,
                bb => bb.TopLeft, bb => bb.Centroid,
                blAngleLB, blAngleUB, Distances.Vertical);
            if (pointBL != null)
            {
                betweenLineDistList.Add(pointBL);
            }
        });

        // NaN means that no neighbour was found, so the spacing could not be estimated.
        double withinLineDistance = GetPeakAverageDistance(withinLineDistList);
        double betweenLineDistance = GetPeakAverageDistance(betweenLineDistList);

        // 2. Find lines of text
        double maxDistWL = GetMaxWithinLineDistance(withinLineDistance, betweenLineDistance);
        var lines = GetLines(pageWordsArr, maxDistWL, wlAngleLB, wlAngleUB).ToArray();

        // 3. Find blocks of text
        // Without a between-line estimate no line has a neighbour to merge with: use 0 (no merging).
        double maxDistBL = double.IsNaN(betweenLineDistance) ? 0 : blMultiplier * betweenLineDistance;
        return GetLinesGroups(lines, maxDistBL).ToList();
    }

    /// <summary>
    /// Get the maximum distance between two words of the same line: the smallest of 3 times the
    /// within-line distance and sqrt(2) times the between-line distance. An estimate that is not
    /// available (NaN) is ignored; if none is available the result is 0 (no grouping).
    /// </summary>
    /// <param name="withinLineDistance">The within-line distance estimate, or NaN.</param>
    /// <param name="betweenLineDistance">The between-line distance estimate, or NaN.</param>
    private static double GetMaxWithinLineDistance(double withinLineDistance, double betweenLineDistance)
    {
        bool hasWithin = !double.IsNaN(withinLineDistance);
        bool hasBetween = !double.IsNaN(betweenLineDistance);

        if (hasWithin && hasBetween)
        {
            return Math.Min(3 * withinLineDistance, Math.Sqrt(2) * betweenLineDistance);
        }

        if (hasWithin)
        {
            return 3 * withinLineDistance;
        }

        return hasBetween ? Math.Sqrt(2) * betweenLineDistance : 0;
    }

    /// <summary>
    /// Get information on the nearest point, filtered for angle.
    /// </summary>
    /// <param name="words">All the candidate words (may contain the pivot, which is ignored).</param>
    /// <param name="pivot">The word to find the nearest neighbour of.</param>
    /// <param name="funcPivotDist">Selects the pivot's point used to measure distances.</param>
    /// <param name="funcPivotAngle">Selects the pivot's point used to measure angles.</param>
    /// <param name="funcPointsDist">Selects the candidates' point used to measure distances.</param>
    /// <param name="funcPointsAngle">Selects the candidates' point used to measure angles.</param>
    /// <param name="angleStart">Lower bound (inclusive) of the angle, in degrees.</param>
    /// <param name="angleEnd">Upper bound (inclusive) of the angle, in degrees.</param>
    /// <param name="finalDistMeasure">The distance measure used for the reported distance
    /// (the nearest candidate itself is selected with the Euclidean distance).</param>
    /// <returns>array[0]=result of Mode() on the pivot's letters font sizes, array[1]=distance to the
    /// nearest candidate; or null if no candidate was found.</returns>
    private static double[] GetNearestPointData(Word[] words, Word pivot,
        Func<PdfRectangle, PdfPoint> funcPivotDist, Func<PdfRectangle, PdfPoint> funcPivotAngle,
        Func<PdfRectangle, PdfPoint> funcPointsDist, Func<PdfRectangle, PdfPoint> funcPointsAngle,
        double angleStart, double angleEnd,
        Func<PdfPoint, PdfPoint, double> finalDistMeasure)
    {
        var pointR = funcPivotDist(pivot.BoundingBox);
        var pivotAnglePoint = funcPivotAngle(pivot.BoundingBox);

        // Only keep the words located within the angle range from the pivot.
        var filtered = words.Where(w =>
        {
            var angleWL = Distances.Angle(pivotAnglePoint, funcPointsAngle(w.BoundingBox));
            return angleWL >= angleStart && angleWL <= angleEnd;
        }).ToList();
        filtered.Remove(pivot); // remove itself

        if (filtered.Count == 0)
        {
            return null;
        }

        int index = pointR.FindIndexNearest(
            filtered.Select(w => funcPointsDist(w.BoundingBox)).ToList(),
            Distances.Euclidean, out double distWL);

        if (index < 0)
        {
            return null;
        }

        var matchWL = filtered[index];
        return new double[]
        {
            (double)pivot.Letters.Select(l => l.FontSize).Mode(),
            finalDistMeasure(pointR, funcPointsDist(matchWL.BoundingBox))
        };
    }

    /// <summary>
    /// Build lines via transitive closure.
    /// </summary>
    /// <param name="words">The words to group into lines; must not be empty.</param>
    /// <param name="maxDist">The maximum distance between two words of the same line.</param>
    /// <param name="wlAngleLB">Within-line lower bound angle.</param>
    /// <param name="wlAngleUB">Within-line upper bound angle.</param>
    /// <returns>The lines, with their words ordered according to the text direction of the first word.</returns>
    private static IEnumerable<TextLine> GetLines(Word[] words, double maxDist, double wlAngleLB, double wlAngleUB)
    {
        TextDirection textDirection = words[0].TextDirection;
        var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean,
            (w1, w2) => maxDist,
            w => w.BoundingBox.BottomRight, w => w.BoundingBox.BottomLeft,
            w => true,
            (w1, w2) =>
            {
                // compare bottom right with bottom left for angle, using the caller's bounds
                var angleWL = Distances.Angle(w1.BoundingBox.BottomRight, w2.BoundingBox.BottomLeft);
                return angleWL >= wlAngleLB && angleWL <= wlAngleUB;
            }).ToList();

        Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
        if (textDirection == TextDirection.Rotate180)
        {
            orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
        }
        else if (textDirection == TextDirection.Rotate90)
        {
            orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
        }
        else if (textDirection == TextDirection.Rotate270)
        {
            orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
        }

        for (int a = 0; a < groupedIndexes.Count; a++)
        {
            yield return new TextLine(orderFunc(groupedIndexes[a].Select(i => words[i])));
        }
    }

    /// <summary>
    /// Build blocks via transitive closure.
    /// </summary>
    /// <param name="lines">The lines to group into blocks.</param>
    /// <param name="maxDist">The maximum distance between two lines of the same block.</param>
    /// <returns>The blocks.</returns>
    private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
    {
        var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines, Distances.Euclidean,
            (l1, l2) => maxDist,
            l => l.BoundingBox.TopLeft, l => l.BoundingBox.BottomLeft,
            l => true, (l1, l2) => true).ToList();

        for (int a = 0; a < groupedIndexes.Count; a++)
        {
            yield return new TextBlock(groupedIndexes[a].Select(i => lines[i]).ToList());
        }
    }

    /// <summary>
    /// Get the average distance value of the peak bucket of the histogram.
    /// </summary>
    /// <param name="values">array[0]=font size, array[1]=distance</param>
    /// <returns>The average of the distances falling in the most populated bucket (first one
    /// in case of a tie), or <see cref="double.NaN"/> if there are no values.</returns>
    private static double GetPeakAverageDistance(IEnumerable<double[]> values)
    {
        double[] distances = values.Select(v => v[1]).ToArray();
        if (distances.Length == 0)
        {
            return double.NaN;
        }

        // Histogram with buckets of size 1: bucket i holds the distances in (i, i + 1],
        // and bucket 0 also holds the zero distances so that no value is left out.
        int bucketCount = Math.Max(1, (int)distances.Max() + 1);
        var counts = new int[bucketCount];
        var sums = new double[bucketCount];

        foreach (double distance in distances)
        {
            int bucket = distance <= 0 ? 0 : (int)Math.Ceiling(distance) - 1;
            counts[bucket]++;
            sums[bucket] += distance;
        }

        int peakIndex = Array.IndexOf(counts, counts.Max());
        return sums[peakIndex] / counts[peakIndex];
    }
}
|
||||||
|
}
|
19
src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs
Normal file
19
src/UglyToad.PdfPig/DocumentLayoutAnalysis/IPageSegmenter.cs
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
using System.Collections.Generic;
|
||||||
|
using UglyToad.PdfPig.Content;
|
||||||
|
|
||||||
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// Page segmentation divides a page into areas, each consisting of a layout structure (blocks, lines, etc.).
|
||||||
|
/// <para>See 'Performance Comparison of Six Algorithms for Page Segmentation' by Faisal Shafait, Daniel Keysers, and Thomas M. Breuel.</para>
|
||||||
|
/// </summary>
|
||||||
|
public interface IPageSegmenter
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// Segment the given words of a page into text blocks.
|
||||||
|
/// </summary>
|
||||||
|
/// <param name="pageWords">The words to generate text blocks for.</param>
|
||||||
|
/// <returns>A read-only list of the text blocks produced by this segmentation approach.</returns>
|
||||||
|
IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords);
|
||||||
|
}
|
||||||
|
}
|
@@ -1,7 +1,6 @@
|
|||||||
using System;
|
using System;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
using System.Threading.Tasks;
|
|
||||||
using UglyToad.PdfPig.Content;
|
using UglyToad.PdfPig.Content;
|
||||||
using UglyToad.PdfPig.Geometry;
|
using UglyToad.PdfPig.Geometry;
|
||||||
using UglyToad.PdfPig.Util;
|
using UglyToad.PdfPig.Util;
|
||||||
@@ -71,7 +70,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
|
/// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
|
||||||
/// <param name="distMeasure">The distance measure between two start and end base line points,
|
/// <param name="distMeasure">The distance measure between two start and end base line points,
|
||||||
/// e.g. the Manhattan distance.</param>
|
/// e.g. the Manhattan distance.</param>
|
||||||
private static List<Word> GetWords(IEnumerable<Letter> pageLetters,
|
private List<Word> GetWords(IEnumerable<Letter> pageLetters,
|
||||||
Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
|
Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
|
||||||
{
|
{
|
||||||
if (pageLetters == null || pageLetters.Count() == 0) return new List<Word>();
|
if (pageLetters == null || pageLetters.Count() == 0) return new List<Word>();
|
||||||
@@ -97,116 +96,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
}
|
}
|
||||||
|
|
||||||
Letter[] letters = pageLetters.ToArray();
|
Letter[] letters = pageLetters.ToArray();
|
||||||
int lettersCount = letters.Length;
|
|
||||||
List<PdfPoint> startBaseLines = letters.Select(x => x.StartBaseLine).ToList();
|
|
||||||
|
|
||||||
int[] indexes = Enumerable.Repeat((int)-1, lettersCount).ToArray();
|
var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(letters,
|
||||||
|
distMeasure,
|
||||||
// Find nearest neighbours indexes
|
(l1, l2) => Math.Max((double)metric(l1), (double)metric(l2)) * 0.60,
|
||||||
Parallel.For(0, lettersCount, c =>
|
l => l.EndBaseLine, l => l.StartBaseLine,
|
||||||
{
|
l => !string.IsNullOrWhiteSpace(l.Value),
|
||||||
var currentLetter = letters[c];
|
(l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value)).ToList();
|
||||||
// only check neighbours if not a white space
|
|
||||||
if (!string.IsNullOrWhiteSpace(currentLetter.Value))
|
|
||||||
{
|
|
||||||
int index = currentLetter.EndBaseLine.FindIndexNearest(startBaseLines, distMeasure, out double dist);
|
|
||||||
var pairedLetter = letters[index];
|
|
||||||
|
|
||||||
if (!string.IsNullOrWhiteSpace(pairedLetter.Value) &&
|
|
||||||
string.Equals(currentLetter.FontName, pairedLetter.FontName, StringComparison.OrdinalIgnoreCase))
|
|
||||||
{
|
|
||||||
decimal minDist = Math.Max(Math.Abs(metric(currentLetter)), Math.Abs(metric(pairedLetter))) * 0.60m;
|
|
||||||
if ((decimal)dist < minDist)
|
|
||||||
{
|
|
||||||
indexes[c] = index;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// Group indexes
|
|
||||||
List<List<int>> groupedIndexes = new List<List<int>>();
|
|
||||||
List<int> indexDone = new List<int>();
|
|
||||||
for (int c = 0; c < lettersCount; c++)
|
|
||||||
{
|
|
||||||
int i = indexes[c];
|
|
||||||
if (i == -1) continue;
|
|
||||||
|
|
||||||
bool isDoneC = indexDone.Contains(c);
|
|
||||||
bool isDoneI = indexDone.Contains(i);
|
|
||||||
if (isDoneC || isDoneI)
|
|
||||||
{
|
|
||||||
if (isDoneC && !isDoneI)
|
|
||||||
{
|
|
||||||
foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
|
|
||||||
{
|
|
||||||
pair.Add(i);
|
|
||||||
}
|
|
||||||
indexDone.Add(i);
|
|
||||||
}
|
|
||||||
else if (!isDoneC && isDoneI)
|
|
||||||
{
|
|
||||||
foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
|
|
||||||
{
|
|
||||||
pair.Add(c);
|
|
||||||
}
|
|
||||||
indexDone.Add(c);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
foreach (var pair in groupedIndexes.Where(x => x.Contains(i)))
|
|
||||||
{
|
|
||||||
if (!pair.Contains(c)) pair.Add(c);
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach (var pair in groupedIndexes.Where(x => x.Contains(c)))
|
|
||||||
{
|
|
||||||
if (!pair.Contains(i)) pair.Add(i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
List<int> pair = new List<int>() { c, i };
|
|
||||||
groupedIndexes.Add(pair);
|
|
||||||
indexDone.AddRange(pair);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Merge lists with common index
|
|
||||||
for (int c = 0; c < lettersCount; c++)
|
|
||||||
{
|
|
||||||
List<List<int>> candidates = groupedIndexes.Where(x => x.Any(t => t == c)).ToList();
|
|
||||||
if (candidates.Count < 2) continue; // only one group with this index
|
|
||||||
|
|
||||||
List<int> merged = candidates.First();
|
|
||||||
groupedIndexes.Remove(merged);
|
|
||||||
for (int i = 1; i < candidates.Count; i++)
|
|
||||||
{
|
|
||||||
var current = candidates[i];
|
|
||||||
merged = merged.Union(current).ToList();
|
|
||||||
groupedIndexes.Remove(current);
|
|
||||||
}
|
|
||||||
groupedIndexes.Add(merged);
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Word> words = new List<Word>();
|
List<Word> words = new List<Word>();
|
||||||
for (int a = 0; a < groupedIndexes.Count(); a++)
|
for (int a = 0; a < groupedIndexes.Count(); a++)
|
||||||
{
|
{
|
||||||
List<Letter> groupedLetters = new List<Letter>();
|
words.Add(new Word(orderFunc(groupedIndexes[a].Select(i => letters[i]))));
|
||||||
foreach (int s in groupedIndexes[a])
|
|
||||||
{
|
|
||||||
groupedLetters.Add(letters[s]);
|
|
||||||
}
|
|
||||||
|
|
||||||
words.Add(new Word(orderFunc(groupedLetters)));
|
|
||||||
}
|
|
||||||
|
|
||||||
List<int> indexesNotDone = Enumerable.Range(0, lettersCount).Except(groupedIndexes.SelectMany(x => x)).ToList();
|
|
||||||
for (int n = 0; n < indexesNotDone.Count(); n++)
|
|
||||||
{
|
|
||||||
Letter letter = letters[indexesNotDone[n]];
|
|
||||||
words.Add(new Word(new Letter[] { letter }));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return words;
|
return words;
|
||||||
|
@@ -11,14 +11,31 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
|
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
|
||||||
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
|
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public static class RecursiveXYCut
|
public class RecursiveXYCut : IPageSegmenter
|
||||||
{
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// Create an instance of Recursive X-Y Cut page segmenter, <see cref="RecursiveXYCut"/>.
|
||||||
|
/// </summary>
|
||||||
|
public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the blocks.
|
/// Get the blocks.
|
||||||
|
/// <para>Uses 'minimumWidth' = 0, 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)</para>
|
||||||
|
/// </summary>
|
||||||
|
/// <param name="pageWords">The words in the page.</param>
|
||||||
|
/// <returns></returns>
|
||||||
|
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
|
||||||
|
{
|
||||||
|
return GetBlocks(pageWords, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Get the blocks.
|
||||||
|
/// <para>Uses 'dominantFontWidthFunc' = Mode(Width), 'dominantFontHeightFunc' = 1.5 x Mode(Height)</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="pageWords">The words in the page.</param>
|
/// <param name="pageWords">The words in the page.</param>
|
||||||
/// <param name="minimumWidth">The minimum width for a block.</param>
|
/// <param name="minimumWidth">The minimum width for a block.</param>
|
||||||
public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth = 0)
|
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth)
|
||||||
{
|
{
|
||||||
return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5m, 3));
|
return GetBlocks(pageWords, minimumWidth, k => Math.Round(k.Mode(), 3), k => Math.Round(k.Mode() * 1.5m, 3));
|
||||||
}
|
}
|
||||||
@@ -30,7 +47,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// <param name="minimumWidth">The minimum width for a block.</param>
|
/// <param name="minimumWidth">The minimum width for a block.</param>
|
||||||
/// <param name="dominantFontWidth">The dominant font width.</param>
|
/// <param name="dominantFontWidth">The dominant font width.</param>
|
||||||
/// <param name="dominantFontHeight">The dominant font height.</param>
|
/// <param name="dominantFontHeight">The dominant font height.</param>
|
||||||
public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
|
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
|
||||||
decimal dominantFontWidth, decimal dominantFontHeight)
|
decimal dominantFontWidth, decimal dominantFontHeight)
|
||||||
{
|
{
|
||||||
return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight);
|
return GetBlocks(pageWords, minimumWidth, k => dominantFontWidth, k => dominantFontHeight);
|
||||||
@@ -43,15 +60,24 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// <param name="minimumWidth">The minimum width for a block.</param>
|
/// <param name="minimumWidth">The minimum width for a block.</param>
|
||||||
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
|
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
|
||||||
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
|
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
|
||||||
public static XYNode GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
|
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, decimal minimumWidth,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
|
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc)
|
||||||
{
|
{
|
||||||
var root = new XYLeaf(pageWords); // Create a root node.
|
XYLeaf root = new XYLeaf(pageWords); // Create a root node.
|
||||||
return VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
|
XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);
|
||||||
|
|
||||||
|
var leafs = node.GetLeafs();
|
||||||
|
|
||||||
|
if (leafs.Count > 0)
|
||||||
|
{
|
||||||
|
return leafs.Select(l => new TextBlock(l.GetLines())).ToList();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth,
|
return new List<TextBlock>();
|
||||||
|
}
|
||||||
|
|
||||||
|
private XYNode VerticalCut(XYLeaf leaf, decimal minimumWidth,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
||||||
{
|
{
|
||||||
@@ -144,7 +170,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
return new XYNode(newNodes);
|
return new XYNode(newNodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth,
|
private XYNode HorizontalCut(XYLeaf leaf, decimal minimumWidth,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
Func<IEnumerable<decimal>, decimal> dominantFontWidthFunc,
|
||||||
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
Func<IEnumerable<decimal>, decimal> dominantFontHeightFunc, int level = 0)
|
||||||
{
|
{
|
||||||
|
@@ -9,7 +9,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// <summary>
|
/// <summary>
|
||||||
/// A Leaf node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
|
/// A Leaf node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public class XYLeaf : XYNode
|
internal class XYLeaf : XYNode
|
||||||
{
|
{
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Returns true if this node is a leaf, false otherwise.
|
/// Returns true if this node is a leaf, false otherwise.
|
||||||
|
@@ -8,7 +8,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
|||||||
/// <summary>
|
/// <summary>
|
||||||
/// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
|
/// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public class XYNode
|
internal class XYNode
|
||||||
{
|
{
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Returns true if this node is a leaf, false otherwise.
|
/// Returns true if this node is a leaf, false otherwise.
|
||||||
|
Reference in New Issue
Block a user