mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
ReadingOrderDetector and tidying DLA project
This commit is contained in:
@@ -1,13 +1,14 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
using Content;
|
||||
using Geometry;
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Threading.Tasks;
|
||||
using Content;
|
||||
using Geometry;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||
using Util;
|
||||
|
||||
/// <summary>
|
||||
@@ -15,7 +16,7 @@
|
||||
/// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
|
||||
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
|
||||
/// left or right edge of the page.</para>
|
||||
/// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para>
|
||||
/// <para>See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para>
|
||||
/// </summary>
|
||||
public static class DecorationTextBlockClassifier
|
||||
{
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
using Core;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Core;
|
||||
using Geometry;
|
||||
|
||||
/// <summary>
|
||||
/// Contains helpful tools for distance measures.
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export
|
||||
{
|
||||
using Alto;
|
||||
using Content;
|
||||
using Core;
|
||||
using DocumentLayoutAnalysis;
|
||||
using System;
|
||||
using System.Globalization;
|
||||
using System.Linq;
|
||||
using System.Xml;
|
||||
using System.Xml.Serialization;
|
||||
using Alto;
|
||||
using Content;
|
||||
using Core;
|
||||
using DocumentLayoutAnalysis;
|
||||
using Geometry;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||
using Util;
|
||||
|
||||
/// <inheritdoc />
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export
|
||||
{
|
||||
using System;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Core;
|
||||
using DocumentLayoutAnalysis;
|
||||
using Geometry;
|
||||
using System;
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||
using Util;
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export
|
||||
{
|
||||
using Content;
|
||||
using Core;
|
||||
using DocumentLayoutAnalysis;
|
||||
using Graphics.Colors;
|
||||
using PAGE;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Xml;
|
||||
using System.Xml.Serialization;
|
||||
using Content;
|
||||
using Core;
|
||||
using DocumentLayoutAnalysis;
|
||||
using Geometry;
|
||||
using Graphics.Colors;
|
||||
using PAGE;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||
using Util;
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Core;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
/// <summary>
|
||||
/// Default Page Segmenter. All words are included in one block.
|
||||
@@ -1,13 +1,13 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
|
||||
{
|
||||
using Content;
|
||||
using Core;
|
||||
using Geometry;
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using Geometry;
|
||||
using Content;
|
||||
using Core;
|
||||
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
@@ -1,7 +1,7 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using Content;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
|
||||
/// Page segmentation divides a page into areas, each consisting of a layout structure (blocks, lines, etc.).
|
||||
@@ -1,10 +1,10 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
|
||||
{
|
||||
using Content;
|
||||
using Core;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Core;
|
||||
|
||||
/// <summary>
|
||||
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
|
||||
@@ -1,10 +1,10 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
|
||||
{
|
||||
using Content;
|
||||
using Core;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Core;
|
||||
|
||||
/// <summary>
|
||||
/// A Leaf node used in the <see cref="RecursiveXYCut"/> algorithm, i.e. a block.
|
||||
@@ -1,8 +1,8 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
|
||||
{
|
||||
using Core;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Core;
|
||||
|
||||
/// <summary>
|
||||
/// A Node used in the <see cref="RecursiveXYCut"/> algorithm.
|
||||
@@ -0,0 +1,25 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// Pass-through reading order detector: the blocks are handed back exactly as received.
/// <para>No sorting is performed and no <see cref="TextBlock.ReadingOrder"/> is assigned.</para>
/// </summary>
public class DefaultReadingOrderDetector : IReadingOrderDetector
{
    /// <summary>
    /// A ready-made <see cref="DefaultReadingOrderDetector"/> that can be shared by callers.
    /// </summary>
    public static DefaultReadingOrderDetector Instance { get; } = new DefaultReadingOrderDetector();

    /// <summary>
    /// Returns <paramref name="textBlocks"/> untouched.
    /// </summary>
    /// <param name="textBlocks">The <see cref="TextBlock"/>s to pass through.</param>
    /// <returns>The very same collection that was supplied, in its original order.</returns>
    public IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks) => textBlocks;
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// Contract for algorithms that work out the order in which the text blocks of a page should be read.
/// </summary>
public interface IReadingOrderDetector
{
    /// <summary>
    /// Produces the supplied blocks in reading order. Implementations are expected to record
    /// each block's position in <see cref="TextBlock.ReadingOrder"/>.
    /// </summary>
    /// <param name="textBlocks">The <see cref="TextBlock"/>s of the page to put in order.</param>
    /// <returns>The blocks, sequenced in the order determined by the implementation.</returns>
    IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks);
}
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
/// <summary>
/// Reading order detector that ranks blocks by the position of their letters in the rendering order (TextSequence).
/// </summary>
public class RenderingReadingOrderDetector : IReadingOrderDetector
{
    /// <summary>
    /// A ready-made <see cref="RenderingReadingOrderDetector"/> that can be shared by callers.
    /// <para>Blocks are ranked by the mean TextSequence of their letters.</para>
    /// </summary>
    public static RenderingReadingOrderDetector Instance { get; } = new RenderingReadingOrderDetector();

    /// <summary>
    /// Yields the blocks sorted by ascending mean TextSequence and stores each block's
    /// zero-based rank in <see cref="TextBlock.ReadingOrder"/>.
    /// <para>This is an iterator: sorting and rank assignment only happen while the result is enumerated.</para>
    /// </summary>
    /// <param name="textBlocks">The <see cref="TextBlock"/>s to order.</param>
    public IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks)
    {
        int rank = 0;

        foreach (TextBlock current in textBlocks.OrderBy(candidate => AvgTextSequence(candidate)))
        {
            current.SetReadingOrder(rank);
            rank++;
            yield return current;
        }
    }

    /// <summary>
    /// Mean TextSequence over every letter of every word of every line in the block.
    /// </summary>
    private double AvgTextSequence(TextBlock textBlock)
    {
        var letters = textBlock.TextLines
            .SelectMany(line => line.Words)
            .SelectMany(word => word.Letters);

        return letters.Average(letter => letter.TextSequence);
    }
}
|
||||
}
|
||||
@@ -0,0 +1,464 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
/// <summary>
|
||||
/// Algorithm that retrieves the blocks' reading order using both (spatial) Allen’s interval relations and rendering order (TextSequence).
|
||||
/// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern and 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz.</para>
|
||||
/// </summary>
|
||||
public class UnsupervisedReadingOrderDetector : IReadingOrderDetector
|
||||
{
|
||||
/// <summary>
/// A shared <see cref="UnsupervisedReadingOrderDetector"/> created with the default tolerance.
/// <para>This detector uses the (spatial) Allen’s interval relations and rendering order (TextSequence).</para>
/// </summary>
public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector();

/// <summary>
/// The tolerance parameter T. Set once at construction and never changed afterwards.
/// </summary>
private readonly double T;

/// <summary>
/// Algorithm that retrieves the blocks' reading order using both (spatial) Allen’s interval relations and rendering order.
/// </summary>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.
/// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the
/// same column might not be exactly aligned.</param>
public UnsupervisedReadingOrderDetector(double T = 5)
{
    this.T = T;
}
|
||||
|
||||
/// <summary>
/// Yields the blocks in reading order and stores each block's zero-based rank in <see cref="TextBlock.ReadingOrder"/>.
/// <para>A "comes before" graph is built for the blocks; then, repeatedly, the block that comes before
/// the largest number of still-unranked blocks is emitted next (the first such block wins a tie).</para>
/// <para>This is an iterator: the graph is built and ranks are assigned only while the result is enumerated.</para>
/// </summary>
/// <param name="textBlocks">The <see cref="TextBlock"/>s to order.</param>
public IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks)
{
    int rank = 0;
    Dictionary<int, List<int>> remaining = BuildGraph(textBlocks, T);

    while (remaining.Count > 0)
    {
        // Find the first entry holding the largest successor list.
        int chosen = 0;
        int largest = -1;
        foreach (KeyValuePair<int, List<int>> node in remaining)
        {
            if (node.Value.Count > largest)
            {
                largest = node.Value.Count;
                chosen = node.Key;
            }
        }

        // Drop the chosen block from the graph, both as a node and as a successor of the others.
        remaining.Remove(chosen);
        foreach (List<int> successors in remaining.Values)
        {
            successors.Remove(chosen);
        }

        TextBlock next = textBlocks[chosen];
        next.SetReadingOrder(rank);
        rank++;

        yield return next;
    }
}
|
||||
|
||||
/// <summary>
/// Builds the directed "comes before" graph over the blocks.
/// <para>Both relations (spatial and rendering order) are folded into one partial ordering:
/// an edge i → j is recorded whenever at least one of the two relations says block i comes before block j.</para>
/// </summary>
/// <param name="textBlocks">The blocks; each one becomes a node keyed by its index.</param>
/// <param name="T">The tolerance parameter T.</param>
/// <returns>For every block index, the list of indexes of the blocks it comes before.</returns>
private Dictionary<int, List<int>> BuildGraph(IReadOnlyList<TextBlock> textBlocks, double T)
{
    int blockCount = textBlocks.Count;
    var successorsByBlock = new Dictionary<int, List<int>>();

    for (int source = 0; source < blockCount; source++)
    {
        TextBlock candidate = textBlocks[source];
        var successors = new List<int>();

        for (int target = 0; target < blockCount; target++)
        {
            // A block is never compared with itself.
            if (target != source && GetBeforeInReadingRendering(candidate, textBlocks[target], T))
            {
                successors.Add(target);
            }
        }

        successorsByBlock.Add(source, successors);
    }

    return successorsByBlock;
}
|
||||
|
||||
/// <summary>
/// Whether <paramref name="a"/> comes before <paramref name="b"/> according to the column-wise
/// spatial relation or, failing that, according to the rendering order.
/// </summary>
private bool GetBeforeInReadingRendering(TextBlock a, TextBlock b, double T)
{
    // The spatial test is tried first; the rendering order is only consulted when it fails.
    if (GetBeforeInReadingVertical(a, b, T))
    {
        return true;
    }

    return GetBeforeInRendering(a, b);
}
|
||||
|
||||
/// <summary>
/// Whether the letters of <paramref name="a"/> have, on average, a strictly lower TextSequence
/// than the letters of <paramref name="b"/>.
/// </summary>
private bool GetBeforeInRendering(TextBlock a, TextBlock b)
{
    var meanSequenceA = a.TextLines
        .SelectMany(line => line.Words)
        .SelectMany(word => word.Letters)
        .Average(letter => letter.TextSequence);

    var meanSequenceB = b.TextLines
        .SelectMany(line => line.Words)
        .SelectMany(word => word.Letters)
        .Average(letter => letter.TextSequence);

    return meanSequenceA < meanSequenceB;
}
|
||||
|
||||
/// <summary>
/// Whether, on at least one of the two axes, <paramref name="a"/> precedes, meets or overlaps <paramref name="b"/>.
/// </summary>
private bool GetBeforeInReading(TextBlock a, TextBlock b, double T)
{
    IntervalRelations horizontal = GetIntervalRelationX(a, b, T);
    IntervalRelations vertical = GetIntervalRelationY(a, b, T);

    switch (horizontal)
    {
        case IntervalRelations.Precedes:
        case IntervalRelations.Meets:
        case IntervalRelations.Overlaps:
            return true;
    }

    switch (vertical)
    {
        case IntervalRelations.Precedes:
        case IntervalRelations.Meets:
        case IntervalRelations.Overlaps:
            return true;

        default:
            return false;
    }
}
|
||||
|
||||
/// <summary>
/// Column-wise ordering: text blocks are read in columns, top-to-bottom within a column and columns left-to-right.
/// <para>Returns true when <paramref name="a"/> precedes or meets <paramref name="b"/> on X, or when
/// <paramref name="a"/> precedes, meets or overlaps <paramref name="b"/> on Y while the X relation is
/// anything other than Unknown, PrecedesI or MeetsI.</para>
/// </summary>
/// <param name="a">The block tested for coming first.</param>
/// <param name="b">The block it is compared against.</param>
/// <param name="T">The tolerance parameter T.</param>
/// <returns>True if <paramref name="a"/> is read before <paramref name="b"/> under this rule.</returns>
private bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T)
{
    IntervalRelations horizontal = GetIntervalRelationX(a, b, T);
    IntervalRelations vertical = GetIntervalRelationY(a, b, T);

    // The X relation alone decides when a precedes or meets b horizontally.
    if (horizontal == IntervalRelations.Precedes || horizontal == IntervalRelations.Meets)
    {
        return true;
    }

    bool comesFirstOnY = vertical == IntervalRelations.Precedes
        || vertical == IntervalRelations.Meets
        || vertical == IntervalRelations.Overlaps;

    if (!comesFirstOnY)
    {
        return false;
    }

    switch (horizontal)
    {
        case IntervalRelations.Overlaps:
        case IntervalRelations.Starts:
        case IntervalRelations.FinishesI:
        case IntervalRelations.Equals:
        case IntervalRelations.During:
        case IntervalRelations.DuringI:
        case IntervalRelations.Finishes:
        case IntervalRelations.StartsI:
        case IntervalRelations.OverlapsI:
            return true;

        default:
            return false;
    }
}
|
||||
|
||||
/// <summary>
/// Row-wise ordering: text blocks are read in rows, left-to-right within a row and rows top-to-bottom.
/// <para>Returns true when <paramref name="a"/> precedes or meets <paramref name="b"/> on Y, or when
/// <paramref name="a"/> precedes, meets or overlaps <paramref name="b"/> on X while the Y relation is
/// anything other than Unknown, PrecedesI or MeetsI.</para>
/// </summary>
/// <param name="a">The block tested for coming first.</param>
/// <param name="b">The block it is compared against.</param>
/// <param name="T">The tolerance parameter T.</param>
/// <returns>True if <paramref name="a"/> is read before <paramref name="b"/> under this rule.</returns>
private bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
{
    IntervalRelations horizontal = GetIntervalRelationX(a, b, T);
    IntervalRelations vertical = GetIntervalRelationY(a, b, T);

    // The Y relation alone decides when a precedes or meets b vertically.
    if (vertical == IntervalRelations.Precedes || vertical == IntervalRelations.Meets)
    {
        return true;
    }

    bool comesFirstOnX = horizontal == IntervalRelations.Precedes
        || horizontal == IntervalRelations.Meets
        || horizontal == IntervalRelations.Overlaps;

    if (!comesFirstOnX)
    {
        return false;
    }

    switch (vertical)
    {
        case IntervalRelations.Overlaps:
        case IntervalRelations.Starts:
        case IntervalRelations.FinishesI:
        case IntervalRelations.Equals:
        case IntervalRelations.During:
        case IntervalRelations.DuringI:
        case IntervalRelations.Finishes:
        case IntervalRelations.StartsI:
        case IntervalRelations.OverlapsI:
            return true;

        default:
            return false;
    }
}
|
||||
|
||||
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// <para>The horizontal extent of each block is treated as the interval [Left, Right]. Separated and touching
/// intervals are recognised first (Precedes, PrecedesI, Meets, MeetsI); every other pair is classified by
/// comparing the two left edges and the two right edges, each comparison having three outcomes
/// (smaller, equal within T, larger).</para>
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
/// <returns>The relation of a's X interval to b's X interval, or Unknown if a coordinate or T is not a number.</returns>
private IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
{
    double aLeft = a.BoundingBox.Left;
    double aRight = a.BoundingBox.Right;
    double bLeft = b.BoundingBox.Left;
    double bRight = b.BoundingBox.Right;

    // Comparisons involving NaN are always false and would otherwise be misread as "equal".
    if (double.IsNaN(aLeft) || double.IsNaN(aRight) || double.IsNaN(bLeft) || double.IsNaN(bRight) || double.IsNaN(T))
    {
        return IntervalRelations.Unknown;
    }

    // a lies entirely to the left of b, with a gap larger than T.
    if (aRight < bLeft - T)
    {
        return IntervalRelations.Precedes;
    }

    // a lies entirely to the right of b, with a gap larger than T.
    if (aLeft > bRight + T)
    {
        return IntervalRelations.PrecedesI;
    }

    // a's right edge coincides (within T) with b's left edge.
    if (aRight <= bLeft + T)
    {
        return IntervalRelations.Meets;
    }

    // a's left edge coincides (within T) with b's right edge.
    if (aLeft >= bRight - T)
    {
        return IntervalRelations.MeetsI;
    }

    // From here on the intervals share more than T of their extent.
    // -1: a's edge is left of b's, 0: equal within T, +1: a's edge is right of b's.
    int leftOrder = aLeft < bLeft - T ? -1 : (aLeft > bLeft + T ? 1 : 0);
    int rightOrder = aRight < bRight - T ? -1 : (aRight > bRight + T ? 1 : 0);

    if (leftOrder < 0)
    {
        if (rightOrder < 0)
        {
            return IntervalRelations.Overlaps;
        }

        return rightOrder == 0 ? IntervalRelations.FinishesI : IntervalRelations.DuringI;
    }

    if (leftOrder > 0)
    {
        if (rightOrder < 0)
        {
            return IntervalRelations.During;
        }

        return rightOrder == 0 ? IntervalRelations.Finishes : IntervalRelations.OverlapsI;
    }

    // Left edges are aligned.
    if (rightOrder < 0)
    {
        return IntervalRelations.Starts;
    }

    return rightOrder == 0 ? IntervalRelations.Equals : IntervalRelations.StartsI;
}
|
||||
|
||||
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// <para>The vertical extent of each block is read from Top to Bottom, i.e. "a precedes b" means a lies above b.
/// Separated and touching intervals are recognised first (Precedes, PrecedesI, Meets, MeetsI); every other pair is
/// classified by comparing the two top edges and the two bottom edges, each comparison having three outcomes
/// (higher, equal within T, lower).</para>
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
/// <returns>The relation of a's Y interval to b's Y interval, or Unknown if a coordinate or T is not a number.</returns>
private IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
{
    // NOTE(review): assumes Y grows upwards (Top >= Bottom), which is what the previous
    // "a.Bottom >= b.Top - T means Precedes" test implied - confirm against the rectangle type.
    double aTop = a.BoundingBox.Top;
    double aBottom = a.BoundingBox.Bottom;
    double bTop = b.BoundingBox.Top;
    double bBottom = b.BoundingBox.Bottom;

    // Comparisons involving NaN are always false and would otherwise be misread as "equal".
    if (double.IsNaN(aTop) || double.IsNaN(aBottom) || double.IsNaN(bTop) || double.IsNaN(bBottom) || double.IsNaN(T))
    {
        return IntervalRelations.Unknown;
    }

    // a lies entirely above b, with a gap larger than T.
    if (aBottom > bTop + T)
    {
        return IntervalRelations.Precedes;
    }

    // a lies entirely below b, with a gap larger than T.
    if (aTop < bBottom - T)
    {
        return IntervalRelations.PrecedesI;
    }

    // a's bottom edge coincides (within T) with b's top edge.
    if (aBottom >= bTop - T)
    {
        return IntervalRelations.Meets;
    }

    // a's top edge coincides (within T) with b's bottom edge.
    if (aTop <= bBottom + T)
    {
        return IntervalRelations.MeetsI;
    }

    // From here on the intervals share more than T of their extent.
    // -1: a's edge is above b's (earlier in reading direction), 0: equal within T, +1: a's edge is below b's.
    int topOrder = aTop > bTop + T ? -1 : (aTop < bTop - T ? 1 : 0);
    int bottomOrder = aBottom > bBottom + T ? -1 : (aBottom < bBottom - T ? 1 : 0);

    if (topOrder < 0)
    {
        if (bottomOrder < 0)
        {
            return IntervalRelations.Overlaps;
        }

        return bottomOrder == 0 ? IntervalRelations.FinishesI : IntervalRelations.DuringI;
    }

    if (topOrder > 0)
    {
        if (bottomOrder < 0)
        {
            return IntervalRelations.During;
        }

        return bottomOrder == 0 ? IntervalRelations.Finishes : IntervalRelations.OverlapsI;
    }

    // Top edges are aligned.
    if (bottomOrder < 0)
    {
        return IntervalRelations.Starts;
    }

    return bottomOrder == 0 ? IntervalRelations.Equals : IntervalRelations.StartsI;
}
|
||||
|
||||
/// <summary>
/// The thirteen base relations of Allen’s interval algebra, plus an <see cref="Unknown"/> placeholder.
/// <para>See https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para>
/// </summary>
private enum IntervalRelations
{
    /// <summary>
    /// No relation could be determined.
    /// </summary>
    Unknown,

    /// <summary>
    /// X ends before Y begins, leaving a gap between them.
    /// <para>|____X____|......................</para>
    /// <para>......................|____Y____|</para>
    /// </summary>
    Precedes,

    /// <summary>
    /// X ends where Y begins.
    /// <para>|____X____|.................</para>
    /// <para>.................|____Y____|</para>
    /// </summary>
    Meets,

    /// <summary>
    /// X begins before Y and ends inside Y.
    /// <para>|______X______|.................</para>
    /// <para>.................|______Y______|</para>
    /// </summary>
    Overlaps,

    /// <summary>
    /// X and Y begin together, X ends first.
    /// <para>|____X____|.................</para>
    /// <para>|_____Y_____|..............</para>
    /// </summary>
    Starts,

    /// <summary>
    /// X lies strictly inside Y.
    /// <para>........|____X____|.........</para>
    /// <para>.....|______Y______|.....</para>
    /// </summary>
    During,

    /// <summary>
    /// X and Y end together, X begins later.
    /// <para>.................|____X____|</para>
    /// <para>..............|_____Y_____|</para>
    /// </summary>
    Finishes,

    /// <summary>
    /// Inverse of <see cref="Precedes"/>: Y precedes X.
    /// </summary>
    PrecedesI,

    /// <summary>
    /// Inverse of <see cref="Meets"/>: Y meets X.
    /// </summary>
    MeetsI,

    /// <summary>
    /// Inverse of <see cref="Overlaps"/>: Y overlaps X.
    /// </summary>
    OverlapsI,

    /// <summary>
    /// Inverse of <see cref="Starts"/>: Y starts X.
    /// </summary>
    StartsI,

    /// <summary>
    /// Inverse of <see cref="During"/>: Y lies strictly inside X.
    /// </summary>
    DuringI,

    /// <summary>
    /// Inverse of <see cref="Finishes"/>: Y finishes X.
    /// </summary>
    FinishesI,

    /// <summary>
    /// X and Y begin together and end together.
    /// <para>..........|____X____|............</para>
    /// <para>..........|____Y____|............</para>
    /// </summary>
    Equals
}
|
||||
|
||||
/// <summary>
/// Orders graph nodes by the number of entries in their successor list, smallest first.
/// </summary>
private class NodeComparer : IComparer<KeyValuePair<int, List<int>>>
{
    /// <summary>
    /// Compares the successor counts of <paramref name="x"/> and <paramref name="y"/>; the keys are ignored.
    /// </summary>
    public int Compare(KeyValuePair<int, List<int>> x, KeyValuePair<int, List<int>> y)
    {
        int successorsOfX = x.Value.Count;
        int successorsOfY = y.Value.Count;

        return successorsOfX.CompareTo(successorsOfY);
    }
}
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,10 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
using Content;
|
||||
using Core;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Core;
|
||||
|
||||
/// <summary>
|
||||
/// A block of text.
|
||||
@@ -31,6 +31,11 @@
|
||||
/// </summary>
|
||||
public IReadOnlyList<TextLine> TextLines { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The reading order index. Starts at 0. A value of -1 means the block is not ordered.
|
||||
/// </summary>
|
||||
public int ReadingOrder { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Create a new <see cref="TextBlock"/>.
|
||||
/// </summary>
|
||||
@@ -47,6 +52,8 @@
|
||||
throw new ArgumentException("Empty lines provided.", nameof(lines));
|
||||
}
|
||||
|
||||
ReadingOrder = -1;
|
||||
|
||||
TextLines = lines;
|
||||
|
||||
Text = string.Join(" ", lines.Select(x => x.Text));
|
||||
@@ -60,6 +67,15 @@
|
||||
TextDirection = lines[0].TextDirection;
|
||||
}
|
||||
|
||||
/// <summary>
/// Stores the reading order index of this block.
/// </summary>
/// <param name="readingOrder">The index to store. Must be -1 (not ordered) or greater.</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown if <paramref name="readingOrder"/> is smaller than -1.</exception>
internal void SetReadingOrder(int readingOrder)
{
    if (readingOrder < -1)
    {
        // ArgumentOutOfRangeException derives from ArgumentException, so existing catch blocks still match.
        throw new ArgumentOutOfRangeException(nameof(readingOrder), readingOrder, "The reading order should be more or equal to -1. A value of -1 means the block is not ordered.");
    }

    ReadingOrder = readingOrder;
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override string ToString()
|
||||
{
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
using Content;
|
||||
using Core;
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using Content;
|
||||
using Core;
|
||||
using Geometry;
|
||||
|
||||
/// <summary>
|
||||
/// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line.
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
using Content;
|
||||
using Core;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Core;
|
||||
|
||||
/// <summary>
|
||||
/// A line of text.
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Core;
|
||||
using Geometry;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
||||
/// <summary>
|
||||
/// A top-down algorithm that finds a cover of the background whitespace of a document in terms of maximal empty rectangles.
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor
|
||||
{
|
||||
using Content;
|
||||
using Core;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
using Core;
|
||||
using Util;
|
||||
|
||||
/// <summary>
|
||||
Reference in New Issue
Block a user