#836 Fix UnsupervisedReadingOrder orders 2 blocks on the same row out of order

Add images for documentation
This commit is contained in:
David 2024-05-27 19:51:16 +01:00
parent d86c2f44f0
commit 5bf1ba9f74
6 changed files with 677 additions and 264 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 191 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 154 KiB

View File

@ -0,0 +1,279 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
{
using System;
using System.Collections.Generic;
using System.Text;
using UglyToad.PdfPig.Core;
/// <summary>
/// Computes the Thick Boundary Rectangle Relations (TBRR) between two rectangles.
/// <para>TBRR describes the relative position of two document objects with one qualitative relation per axis:
/// each axis of a rectangle is treated as an interval and the pair of intervals is classified using
/// Allen's interval relations, where two coordinates closer than a tolerance are treated as equal.</para>
/// <para>Swapping the two rectangles yields the inverse relation.</para>
/// <para>See also https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para>
/// </summary>
public static class IntervalRelationsHelper
{
    /// <summary>
    /// Classifies the horizontal intervals [Left, Right] of two rectangles.
    /// <para>The relation is expressed from the point of view of <paramref name="a"/>, e.g.
    /// <see cref="IntervalRelations.Precedes"/> means the right edge of <paramref name="a"/> lies more than
    /// <paramref name="T"/> before the left edge of <paramref name="b"/>.</para>
    /// </summary>
    /// <param name="a">The first rectangle.</param>
    /// <param name="b">The second rectangle.</param>
    /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
    /// <returns>The first relation whose test succeeds, or <see cref="IntervalRelations.Unknown"/> when none does.</returns>
    public static IntervalRelations GetRelationX(PdfRectangle a, PdfRectangle b, double T)
    {
        double aLeft = a.Left;
        double aRight = a.Right;
        double bLeft = b.Left;
        double bRight = b.Right;

        bool leftOfAMatchesB = IsWithinTolerance(aLeft, bLeft, T);
        bool rightOfAMatchesB = IsWithinTolerance(aRight, bRight, T);

        // The tests below are not mutually exclusive: their order decides the result.
        if (leftOfAMatchesB && rightOfAMatchesB)
        {
            return IntervalRelations.Equals;
        }

        // Touching edges are tested before the containment relations.
        if (IsWithinTolerance(aRight, bLeft, T))
        {
            return IntervalRelations.Meets;
        }

        if (IsWithinTolerance(bRight, aLeft, T))
        {
            return IntervalRelations.MeetsI;
        }

        // Shared left edge, one interval ends before the other.
        if (leftOfAMatchesB && aRight < bRight - T)
        {
            return IntervalRelations.Starts;
        }

        if (IsWithinTolerance(bLeft, aLeft, T) && bRight < aRight - T)
        {
            return IntervalRelations.StartsI;
        }

        // Shared right edge, one interval begins after the other.
        if (aLeft > bLeft + T && rightOfAMatchesB)
        {
            return IntervalRelations.Finishes;
        }

        if (bLeft > aLeft + T && IsWithinTolerance(bRight, aRight, T))
        {
            return IntervalRelations.FinishesI;
        }

        // One interval strictly inside the other.
        if (aLeft > bLeft + T && aRight < bRight - T)
        {
            return IntervalRelations.During;
        }

        if (bLeft > aLeft + T && bRight < aRight - T)
        {
            return IntervalRelations.DuringI;
        }

        // Partial overlap.
        if (aLeft < bLeft - T && bLeft + T < aRight && aRight < bRight - T)
        {
            return IntervalRelations.Overlaps;
        }

        if (bLeft < aLeft - T && aLeft + T < bRight && bRight < aRight - T)
        {
            return IntervalRelations.OverlapsI;
        }

        // Disjoint intervals.
        if (aRight < bLeft - T)
        {
            return IntervalRelations.Precedes;
        }

        if (bRight < aLeft - T)
        {
            return IntervalRelations.PrecedesI;
        }

        return IntervalRelations.Unknown;
    }

    /// <summary>
    /// Classifies the vertical intervals [Top, Bottom] of two rectangles.
    /// <para>Each interval is read from Top to Bottom, so the relation names are mirrored compared to the X axis:
    /// for instance <see cref="IntervalRelations.Meets"/> is returned when the bottom of <paramref name="a"/>
    /// is within <paramref name="T"/> of the top of <paramref name="b"/>.</para>
    /// <para>NOTE(review): the branch mapping assumes Top is the larger Y value of a rectangle
    /// (page origin at the bottom-left) - confirm against PdfRectangle.</para>
    /// </summary>
    /// <param name="a">The first rectangle.</param>
    /// <param name="b">The second rectangle.</param>
    /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
    /// <returns>The first relation whose test succeeds, or <see cref="IntervalRelations.Unknown"/> when none does.</returns>
    public static IntervalRelations GetRelationY(PdfRectangle a, PdfRectangle b, double T)
    {
        double aTop = a.Top;
        double aBottom = a.Bottom;
        double bTop = b.Top;
        double bBottom = b.Bottom;

        bool topOfAMatchesB = IsWithinTolerance(aTop, bTop, T);
        bool bottomOfAMatchesB = IsWithinTolerance(aBottom, bBottom, T);

        // The tests below are not mutually exclusive: their order decides the result.
        if (topOfAMatchesB && bottomOfAMatchesB)
        {
            return IntervalRelations.Equals;
        }

        // Touching edges: bottom of one rectangle against top of the other.
        if (IsWithinTolerance(bBottom, aTop, T))
        {
            return IntervalRelations.MeetsI;
        }

        if (IsWithinTolerance(aBottom, bTop, T))
        {
            return IntervalRelations.Meets;
        }

        // Shared top edge, bottoms differ.
        if (topOfAMatchesB && aBottom < bBottom - T)
        {
            return IntervalRelations.StartsI;
        }

        if (IsWithinTolerance(bTop, aTop, T) && bBottom < aBottom - T)
        {
            return IntervalRelations.Starts;
        }

        // Shared bottom edge, tops differ.
        if (aTop > bTop + T && bottomOfAMatchesB)
        {
            return IntervalRelations.FinishesI;
        }

        if (bTop > aTop + T && IsWithinTolerance(bBottom, aBottom, T))
        {
            return IntervalRelations.Finishes;
        }

        // One interval strictly inside the other.
        if (aTop > bTop + T && aBottom < bBottom - T)
        {
            return IntervalRelations.DuringI;
        }

        if (bTop > aTop + T && bBottom < aBottom - T)
        {
            return IntervalRelations.During;
        }

        // Partial overlap.
        if (aTop < bTop - T && bBottom + T < aTop && aBottom < bBottom - T)
        {
            return IntervalRelations.OverlapsI;
        }

        if (bTop < aTop - T && aBottom + T < bTop && bBottom < aBottom - T)
        {
            return IntervalRelations.Overlaps;
        }

        // Remaining configurations.
        if (aBottom < bTop - T)
        {
            return IntervalRelations.PrecedesI;
        }

        if (bBottom < aTop - T)
        {
            return IntervalRelations.Precedes;
        }

        return IntervalRelations.Unknown;
    }

    /// <summary>
    /// Checks whether <paramref name="value"/> lies in the closed range
    /// [<paramref name="anchor"/> - <paramref name="tolerance"/>, <paramref name="anchor"/> + <paramref name="tolerance"/>].
    /// </summary>
    private static bool IsWithinTolerance(double value, double anchor, double tolerance)
    {
        return anchor - tolerance <= value && value <= anchor + tolerance;
    }
}
/// <summary>
/// The thirteen relations of Allen's interval algebra, plus a fallback value used when no relation applies.
/// <para>In the diagrams below X is the first interval and Y the second one.</para>
/// <para>See https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para>
/// </summary>
public enum IntervalRelations
{
    /// <summary>
    /// No relation could be determined.
    /// </summary>
    Unknown = 0,

    /// <summary>
    /// X ends before Y begins, with a gap in between.
    /// <para>|____X____|----------------------</para>
    /// <para>----------------------|____Y____|</para>
    /// </summary>
    Precedes = 1,

    /// <summary>
    /// X ends exactly where Y begins.
    /// <para>|_____X______|--------------</para>
    /// <para>--------------|______Y_____|</para>
    /// </summary>
    Meets = 2,

    /// <summary>
    /// X begins first and ends inside Y.
    /// <para>|________X________|-------------</para>
    /// <para>-------------|________Y________|</para>
    /// </summary>
    Overlaps = 3,

    /// <summary>
    /// X and Y begin together and X ends first.
    /// <para>|____X____|-----------------</para>
    /// <para>|_______Y_______|-----------</para>
    /// </summary>
    Starts = 4,

    /// <summary>
    /// X lies strictly inside Y.
    /// <para>--------|____X____|---------</para>
    /// <para>-----|_______Y________|-----</para>
    /// </summary>
    During = 5,

    /// <summary>
    /// X and Y end together and X begins last.
    /// <para>-----------------|____X____|</para>
    /// <para>-----------|_______Y_______|</para>
    /// </summary>
    Finishes = 6,

    /// <summary>
    /// Inverse of <see cref="Precedes"/>: Y precedes X.
    /// </summary>
    PrecedesI = 7,

    /// <summary>
    /// Inverse of <see cref="Meets"/>: Y meets X.
    /// </summary>
    MeetsI = 8,

    /// <summary>
    /// Inverse of <see cref="Overlaps"/>: Y overlaps X.
    /// </summary>
    OverlapsI = 9,

    /// <summary>
    /// Inverse of <see cref="Starts"/>: Y starts X.
    /// </summary>
    StartsI = 10,

    /// <summary>
    /// Inverse of <see cref="During"/>: Y lies strictly inside X.
    /// </summary>
    DuringI = 11,

    /// <summary>
    /// Inverse of <see cref="Finishes"/>: Y finishes X.
    /// </summary>
    FinishesI = 12,

    /// <summary>
    /// X and Y begin and end together.
    /// <para>----------|____X____|------------</para>
    /// <para>----------|____Y____|------------</para>
    /// </summary>
    Equals = 13 - 1
}
}

View File

@ -188,8 +188,8 @@
/// <param name="T">The tolerance parameter T.</param>
private static bool GetBeforeInReading(TextBlock a, TextBlock b, double T)
{
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T);
IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T);
return xRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Precedes ||
@ -207,8 +207,8 @@
/// <param name="T">The tolerance parameter T.</param>
private static bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T)
{
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T);
IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T);
return xRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
@ -237,8 +237,8 @@
/// <param name="T">The tolerance parameter T.</param>
private static bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
{
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T);
IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T);
return yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
@ -259,263 +259,5 @@
yRelation == IntervalRelations.OverlapsI));
}
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// </summary>
/// <param name="a">The first block; the [Left, Right] span of its bounding box is the first interval.</param>
/// <param name="b">The second block; the [Left, Right] span of its bounding box is the second interval.</param>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
/// <returns>The relation of <paramref name="a"/> with respect to <paramref name="b"/>, or Unknown when no test matches.</returns>
private static IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
{
    double aLeft = a.BoundingBox.Left;
    double aRight = a.BoundingBox.Right;
    double bLeft = b.BoundingBox.Left;
    double bRight = b.BoundingBox.Right;

    bool aLeftNearBLeft = bLeft - T <= aLeft && aLeft <= bLeft + T;
    bool aRightNearBRight = bRight - T <= aRight && aRight <= bRight + T;

    // Order is important: the most specific relations are tested first.
    // Testing Precedes first and following it with its exact complement (as was done
    // previously) makes every other relation unreachable.
    if (aLeftNearBLeft && aRightNearBRight)
    {
        return IntervalRelations.Equals;
    }

    if (bLeft - T <= aRight && aRight <= bLeft + T)
    {
        return IntervalRelations.Meets;
    }

    if (aLeft - T <= bRight && bRight <= aLeft + T)
    {
        return IntervalRelations.MeetsI;
    }

    if (aLeftNearBLeft && aRight < bRight - T)
    {
        return IntervalRelations.Starts;
    }

    if (aLeft - T <= bLeft && bLeft <= aLeft + T && bRight < aRight - T)
    {
        return IntervalRelations.StartsI;
    }

    if (aLeft > bLeft + T && aRightNearBRight)
    {
        return IntervalRelations.Finishes;
    }

    if (bLeft > aLeft + T && aRight - T <= bRight && bRight <= aRight + T)
    {
        return IntervalRelations.FinishesI;
    }

    if (aLeft > bLeft + T && aRight < bRight - T)
    {
        return IntervalRelations.During;
    }

    if (bLeft > aLeft + T && bRight < aRight - T)
    {
        return IntervalRelations.DuringI;
    }

    if (aLeft < bLeft - T && bLeft + T < aRight && aRight < bRight - T)
    {
        return IntervalRelations.Overlaps;
    }

    if (bLeft < aLeft - T && aLeft + T < bRight && bRight < aRight - T)
    {
        return IntervalRelations.OverlapsI;
    }

    if (aRight < bLeft - T)
    {
        return IntervalRelations.Precedes;
    }

    if (bRight < aLeft - T)
    {
        return IntervalRelations.PrecedesI;
    }

    return IntervalRelations.Unknown;
}
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// <para>NOTE(review): the mapping of tests to relations assumes Top is the larger Y value of a bounding box
/// (page origin at the bottom-left), so that intervals are read from Top to Bottom - confirm against PdfRectangle.</para>
/// </summary>
/// <param name="a">The first block; the [Top, Bottom] span of its bounding box is the first interval.</param>
/// <param name="b">The second block; the [Top, Bottom] span of its bounding box is the second interval.</param>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
/// <returns>The relation of <paramref name="a"/> with respect to <paramref name="b"/>, or Unknown when no test matches.</returns>
private static IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
{
    double aTop = a.BoundingBox.Top;
    double aBottom = a.BoundingBox.Bottom;
    double bTop = b.BoundingBox.Top;
    double bBottom = b.BoundingBox.Bottom;

    bool aTopNearBTop = bTop - T <= aTop && aTop <= bTop + T;
    bool aBottomNearBBottom = bBottom - T <= aBottom && aBottom <= bBottom + T;

    // Order is important: the most specific relations are tested first.
    // Testing PrecedesI first and following it with its exact complement (as was done
    // previously) makes every other relation unreachable.
    if (aTopNearBTop && aBottomNearBBottom)
    {
        return IntervalRelations.Equals;
    }

    if (aTop - T <= bBottom && bBottom <= aTop + T)
    {
        return IntervalRelations.MeetsI;
    }

    if (bTop - T <= aBottom && aBottom <= bTop + T)
    {
        return IntervalRelations.Meets;
    }

    if (aTopNearBTop && aBottom < bBottom - T)
    {
        return IntervalRelations.StartsI;
    }

    if (aTop - T <= bTop && bTop <= aTop + T && bBottom < aBottom - T)
    {
        return IntervalRelations.Starts;
    }

    if (aTop > bTop + T && aBottomNearBBottom)
    {
        return IntervalRelations.FinishesI;
    }

    if (bTop > aTop + T && aBottom - T <= bBottom && bBottom <= aBottom + T)
    {
        return IntervalRelations.Finishes;
    }

    if (aTop > bTop + T && aBottom < bBottom - T)
    {
        return IntervalRelations.DuringI;
    }

    if (bTop > aTop + T && bBottom < aBottom - T)
    {
        return IntervalRelations.During;
    }

    if (aTop < bTop - T && bBottom + T < aTop && aBottom < bBottom - T)
    {
        return IntervalRelations.OverlapsI;
    }

    if (bTop < aTop - T && aBottom + T < bTop && bBottom < aBottom - T)
    {
        return IntervalRelations.Overlaps;
    }

    if (aBottom < bTop - T)
    {
        return IntervalRelations.PrecedesI;
    }

    if (bBottom < aTop - T)
    {
        return IntervalRelations.Precedes;
    }

    return IntervalRelations.Unknown;
}
/// <summary>
/// Allen's thirteen interval relations, preceded by a fallback value for undetermined cases.
/// <para>In the sketches below X is the first interval and Y the second one.</para>
/// <para>See https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para>
/// </summary>
private enum IntervalRelations
{
    /// <summary>
    /// The relation could not be determined.
    /// </summary>
    Unknown = 0,

    /// <summary>
    /// X is entirely before Y.
    /// <para>|____X____|......................</para>
    /// <para>......................|____Y____|</para>
    /// </summary>
    Precedes = 1,

    /// <summary>
    /// The end of X touches the start of Y.
    /// <para>|____X____|.................</para>
    /// <para>.................|____Y____|</para>
    /// </summary>
    Meets = 2,

    /// <summary>
    /// X starts first and ends within Y.
    /// <para>|______X______|.................</para>
    /// <para>.................|______Y______|</para>
    /// </summary>
    Overlaps = 3,

    /// <summary>
    /// X and Y share their start; X is the shorter one.
    /// <para>|____X____|.................</para>
    /// <para>|_____Y_____|..............</para>
    /// </summary>
    Starts = 4,

    /// <summary>
    /// X is strictly contained in Y.
    /// <para>........|____X____|.........</para>
    /// <para>.....|______Y______|.....</para>
    /// </summary>
    During = 5,

    /// <summary>
    /// X and Y share their end; X is the shorter one.
    /// <para>.................|____X____|</para>
    /// <para>..............|_____Y_____|</para>
    /// </summary>
    Finishes = 6,

    /// <summary>
    /// Inverse precedes (Y precedes X).
    /// </summary>
    PrecedesI = 7,

    /// <summary>
    /// Inverse meets (Y meets X).
    /// </summary>
    MeetsI = 8,

    /// <summary>
    /// Inverse overlaps (Y overlaps X).
    /// </summary>
    OverlapsI = 9,

    /// <summary>
    /// Inverse starts (Y starts X).
    /// </summary>
    StartsI = 10,

    /// <summary>
    /// Inverse during (Y is strictly contained in X).
    /// </summary>
    DuringI = 11,

    /// <summary>
    /// Inverse finishes (Y finishes X).
    /// </summary>
    FinishesI = 12,

    /// <summary>
    /// X and Y share both their start and their end.
    /// <para>..........|____X____|............</para>
    /// <para>..........|____Y____|............</para>
    /// </summary>
    Equals = 13 - 1
}
}
}

View File

@ -0,0 +1,319 @@
namespace UglyToad.PdfPig.Tests.Dla
{
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis;
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
using UglyToad.PdfPig.Core;
/// <summary>
/// Tests for <see cref="IntervalRelationsHelper"/>: each test builds two rectangles in a known configuration
/// and checks both the relation of (a, b) and the inverse relation of (b, a).
/// <para>Note (0,0) is bottom left of page.</para>
/// </summary>
public class IntervalRelationsHelperTests
{
    /// <summary>
    /// A is equal to B.
    /// <para>----------|____A____|------------</para>
    /// <para>----------|____B____|------------</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Equals_X()
    {
        var a = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(10, 10));

        var res = IntervalRelationsHelper.GetRelationX(a, a, 5);

        Assert.Equal(IntervalRelations.Equals, res);
    }

    /// <summary>
    /// A is equal to B (vertical axis).
    /// </summary>
    [Fact]
    public void IntervalRelation_Equals_Y()
    {
        var a = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(10, 10));

        var res = IntervalRelationsHelper.GetRelationY(a, a, 5);

        Assert.Equal(IntervalRelations.Equals, res);
    }

    /// <summary>
    /// Precedes: A takes place before B.
    /// <para>|____A____|----------------------</para>
    /// <para>----------------------|____B____|</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Precedes_X()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft();
        var b = PdfPointTestExtensions.BoxAtTopLeft().MoveLeft(100);

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.Precedes, res);
        Assert.Equal(IntervalRelations.PrecedesI, resInverse);
    }

    /// <summary>
    /// Precedes (vertical axis): B is A moved down by far more than its height.
    /// </summary>
    [Fact]
    public void IntervalRelation_Precedes_Y()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft();
        var b = a.MoveDown(200);

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.Precedes, res);
        Assert.Equal(IntervalRelations.PrecedesI, resInverse);
    }

    /// <summary>
    /// A meets B.
    /// <para>|_____A______|--------------</para>
    /// <para>--------------|______B_____|</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Meets_X()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(100);
        var b = a.MoveLeft(100);

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.Meets, res);
        Assert.Equal(IntervalRelations.MeetsI, resInverse);
    }

    /// <summary>
    /// A meets B when the gap between them (10) is smaller than the tolerance (11).
    /// <para>|_____A______|--------------</para>
    /// <para>--------------|______B_____|</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Meets_X_WithinTolerance()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(100);
        var b = a.MoveLeft(110);

        var res = IntervalRelationsHelper.GetRelationX(a, b, 11);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 11);

        Assert.Equal(IntervalRelations.Meets, res);
        Assert.Equal(IntervalRelations.MeetsI, resInverse);
    }

    /// <summary>
    /// A meets B (vertical axis): B is A moved down by exactly its height.
    /// </summary>
    [Fact]
    public void IntervalRelation_Meets_Y()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(100);
        var b = a.MoveDown(100);

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.Meets, res);
        Assert.Equal(IntervalRelations.MeetsI, resInverse);
    }

    /// <summary>
    /// A meets B, then B is moved further down and A precedes it.
    /// </summary>
    [Fact]
    public void IntervalRelation_Meets_Y_WhenMovedDown_BecomesPreceeds()
    {
        // We take an A and B that meet and move B further down so the relation becomes precedes
        var startPoint = new PdfPoint(100, 600);
        var a = new PdfRectangle(startPoint, startPoint.MoveDown(100));
        var meetsABox = a.MoveDown(100);

        var res = IntervalRelationsHelper.GetRelationY(a, meetsABox, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(meetsABox, a, 5);

        Assert.Equal(IntervalRelations.Meets, res);
        Assert.Equal(IntervalRelations.MeetsI, resInverse);

        var preceededByABox = meetsABox.MoveDown(100);

        var moveRes = IntervalRelationsHelper.GetRelationY(a, preceededByABox, 5);
        var moveResInverse = IntervalRelationsHelper.GetRelationY(preceededByABox, a, 5);

        Assert.Equal(IntervalRelations.Precedes, moveRes);
        Assert.Equal(IntervalRelations.PrecedesI, moveResInverse);
    }

    /// <summary>
    /// A overlaps with B.
    /// <para>|________A________|-------------</para>
    /// <para>-------------|________B________|</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Overlaps_X()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(100);
        var b = a.MoveLeft(a.Width / 2);

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.Overlaps, res);
        Assert.Equal(IntervalRelations.OverlapsI, resInverse);
    }

    /// <summary>
    /// A overlaps with B (vertical axis): B is A moved down by half its height.
    /// </summary>
    [Fact]
    public void IntervalRelation_Overlaps_Y()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(100);
        var b = a.MoveLeft(500).MoveDown(a.Height / 2); // Only the move down is important

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.Overlaps, res);
        Assert.Equal(IntervalRelations.OverlapsI, resInverse);
    }

    /// <summary>
    /// A starts B.
    /// <para>|____A____|-----------------</para>
    /// <para>|_______B_______|-----------</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Starts_X()
    {
        var topLeft = PdfPointTestExtensions.OriginTopLeft();
        var a = new PdfRectangle(topLeft, topLeft.MoveLeft(50).MoveDown(10));
        var b = new PdfRectangle(topLeft, topLeft.MoveLeft(100).MoveDown(10));

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.Starts, res);
        Assert.Equal(IntervalRelations.StartsI, resInverse);
    }

    /// <summary>
    /// A starts B (vertical axis): both share the same top, B extends further down.
    /// </summary>
    [Fact]
    public void IntervalRelation_Starts_Y()
    {
        var topLeft = PdfPointTestExtensions.OriginTopLeft();
        var a = new PdfRectangle(topLeft, topLeft.MoveLeft(100).MoveDown(100));
        var b = new PdfRectangle(topLeft, topLeft.MoveLeft(100).MoveDown(200));

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.Starts, res);
        Assert.Equal(IntervalRelations.StartsI, resInverse);
    }

    /// <summary>
    /// A during B.
    /// <para>--------|____A____|---------</para>
    /// <para>-----|_______B________|-----</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_During_X()
    {
        var a = new PdfRectangle(new PdfPoint(20, 0), new PdfPoint(80, 0));
        var b = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(100, 0));

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.During, res);
        Assert.Equal(IntervalRelations.DuringI, resInverse);
    }

    /// <summary>
    /// A during B (vertical axis).
    /// </summary>
    [Fact]
    public void IntervalRelation_During_Y()
    {
        var a = new PdfRectangle(new PdfPoint(0, 20), new PdfPoint(0, 80));
        var b = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(0, 100));

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.During, res);
        Assert.Equal(IntervalRelations.DuringI, resInverse);
    }

    /// <summary>
    /// A finishes B.
    /// <para>-----------------|____A____|</para>
    /// <para>-----------|_______B_______|</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Finishes_X()
    {
        var topRight = PdfPointTestExtensions.OriginTopLeft().MoveLeft(400);
        var a = new PdfRectangle(topRight.MoveX(-100), topRight);
        var b = new PdfRectangle(topRight.MoveX(-200), topRight);

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.Finishes, res);
        Assert.Equal(IntervalRelations.FinishesI, resInverse);
    }

    /// <summary>
    /// A finishes B (vertical axis): both share the same bottom, B starts higher up.
    /// </summary>
    [Fact]
    public void IntervalRelation_Finishes_Y()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(20).MoveDown(20);
        var b = PdfPointTestExtensions.BoxAtTopLeft(40);

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.Finishes, res);
        Assert.Equal(IntervalRelations.FinishesI, resInverse);
    }
}
/// <summary>
/// Small helpers used by the layout tests to build and shift points and rectangles.
/// </summary>
internal static class PdfPointTestExtensions
{
    /// <summary>
    /// Returns the point (0, 800), used by the tests as the top-left corner of the page.
    /// </summary>
    internal static PdfPoint OriginTopLeft()
    {
        return new PdfPoint(0, 800);
    }

    /// <summary>
    /// Shifts the point horizontally by <paramref name="dist"/>.
    /// <para>NOTE(review): despite the name, the non-negative distance is passed unchanged to MoveX,
    /// which presumably increases X (i.e. moves towards the right of the page) - confirm and consider renaming.</para>
    /// </summary>
    /// <param name="it">The point to shift.</param>
    /// <param name="dist">The distance to shift by. Must not be negative.</param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception>
    internal static PdfPoint MoveLeft(this PdfPoint it, double dist)
    {
        EnsureNotNegative(dist);
        return it.MoveX(dist);
    }

    /// <summary>
    /// Shifts the point vertically by minus <paramref name="dist"/>.
    /// </summary>
    /// <param name="it">The point to shift.</param>
    /// <param name="dist">The distance to shift by. Must not be negative.</param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception>
    internal static PdfPoint MoveDown(this PdfPoint it, double dist)
    {
        EnsureNotNegative(dist);
        return it.MoveY(-dist);
    }

    /// <summary>
    /// Builds a square whose first corner is <see cref="OriginTopLeft"/> and whose opposite corner is
    /// that point shifted by <paramref name="length"/> horizontally and vertically.
    /// </summary>
    /// <param name="length">The side length of the square. Must not be negative.</param>
    internal static PdfRectangle BoxAtTopLeft(double length = 10d)
    {
        return new PdfRectangle(OriginTopLeft(), OriginTopLeft().MoveLeft(length).MoveDown(length));
    }

    /// <summary>
    /// Returns a copy of the rectangle with both defining corners shifted horizontally by <paramref name="dist"/>
    /// (same direction as the point overload of MoveLeft).
    /// </summary>
    /// <param name="start">The rectangle to shift.</param>
    /// <param name="dist">The distance to shift by. Must not be negative.</param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception>
    internal static PdfRectangle MoveLeft(this PdfRectangle start, double dist)
    {
        EnsureNotNegative(dist);
        return new PdfRectangle(start.BottomLeft.MoveLeft(dist), start.TopRight.MoveLeft(dist));
    }

    /// <summary>
    /// Returns a copy of the rectangle with both defining corners shifted down by <paramref name="dist"/>.
    /// </summary>
    /// <param name="start">The rectangle to shift.</param>
    /// <param name="dist">The distance to shift by. Must not be negative.</param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception>
    internal static PdfRectangle MoveDown(this PdfRectangle start, double dist)
    {
        EnsureNotNegative(dist);
        return new PdfRectangle(start.BottomLeft.MoveDown(dist), start.TopRight.MoveDown(dist));
    }

    /// <summary>
    /// Shared guard for the Move* helpers: zero is accepted, negative distances are rejected.
    /// </summary>
    private static void EnsureNotNegative(double dist)
    {
        if (dist < 0)
        {
            // The previous message concatenated the parameter name and the text without a space.
            throw new ArgumentException(nameof(dist) + " must not be negative", nameof(dist));
        }
    }
}
}

View File

@ -0,0 +1,73 @@
namespace UglyToad.PdfPig.Tests.Dla
{
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis;
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
using UglyToad.PdfPig.Core;
/// <summary>
/// Tests for the row-wise ordering produced by <see cref="UnsupervisedReadingOrderDetector"/>.
/// Blocks are handed to the detector in a shuffled order and the resulting ReadingOrder values are checked.
/// </summary>
public class UnsupervisedReadingOrderTests
{
    /// <summary>
    /// Two blocks on the same row must be ordered left block first, right block second.
    /// </summary>
    [Fact]
    public void ReadingOrderOrdersItemsOnTheSameRowContents()
    {
        TextBlock blockOnLeft = CreateFakeTextBlock(new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(10, 10)));
        TextBlock blockOnRight = CreateFakeTextBlock(new PdfRectangle(new PdfPoint(100, 0), new PdfPoint(110, 10)));

        // We deliberately submit in the wrong order
        var shuffled = new List<TextBlock> { blockOnRight, blockOnLeft };

        var detector = new UnsupervisedReadingOrderDetector(5, UnsupervisedReadingOrderDetector.SpatialReasoningRules.RowWise);
        var ordered = detector.Get(shuffled)
            .OrderBy(block => block.ReadingOrder)
            .ToList();

        Assert.Equal(0, ordered[0].BoundingBox.Left);
        Assert.Equal(100, ordered[1].BoundingBox.Left);
    }

    /// <summary>
    /// A page-like layout (title, two-column rows and a single-column row) built from point-sized blocks
    /// must come out top to bottom and, within a row, left to right.
    /// </summary>
    [Fact]
    public void DocumentTest()
    {
        var title = PointBlock(42.6, 709.06);
        var line1_Left = PointBlock(42.6, 668.86);
        var line1_Right = PointBlock(302.21, 668.86);
        var line2_Left = PointBlock(42.6, 608.26);
        var line2_Taller_Right = PointBlock(302.21, 581.35);
        var line3 = PointBlock(42.6, 515.83);
        var line4_left = PointBlock(42.6, 490.27);
        var line4_right = PointBlock(302.21, 491.59);

        // We deliberately submit in the wrong order
        var shuffled = new List<TextBlock>
        {
            title,
            line4_left,
            line2_Taller_Right,
            line4_right,
            line1_Right,
            line1_Left,
            line3,
            line2_Left
        };

        var detector = new UnsupervisedReadingOrderDetector(5, UnsupervisedReadingOrderDetector.SpatialReasoningRules.RowWise);
        var ordered = detector.Get(shuffled)
            .OrderBy(block => block.ReadingOrder)
            .ToList();

        Assert.Equal(title.BoundingBox, ordered[0].BoundingBox);
        Assert.Equal(line1_Left.BoundingBox, ordered[1].BoundingBox);
        Assert.Equal(line1_Right.BoundingBox, ordered[2].BoundingBox);
        Assert.Equal(line2_Left.BoundingBox, ordered[3].BoundingBox);
        Assert.Equal(line2_Taller_Right.BoundingBox, ordered[4].BoundingBox);
        Assert.Equal(line3.BoundingBox, ordered[5].BoundingBox);
        Assert.Equal(line4_left.BoundingBox, ordered[6].BoundingBox);
        Assert.Equal(line4_right.BoundingBox, ordered[7].BoundingBox);
    }

    /// <summary>
    /// Creates a fake block whose bounding rectangle is built from the same point twice.
    /// </summary>
    private static TextBlock PointBlock(double x, double y)
    {
        return CreateFakeTextBlock(new PdfRectangle(new PdfPoint(x: x, y: y), new PdfPoint(x: x, y: y)));
    }

    /// <summary>
    /// Wraps a single letter "a" with the given bounding box into a word, a line and finally a block.
    /// </summary>
    private static TextBlock CreateFakeTextBlock(PdfRectangle boundingBox)
    {
        // Everything after the baseline points is filler: these values don't matter for the tests
        var letter = new Letter(
            "a",
            boundingBox,
            boundingBox.BottomLeft,
            boundingBox.BottomRight,
            10, 1, null, TextRenderingMode.NeitherClip, null, null, 0, 0);

        var word = new Word(new[] { letter });
        var line = new TextLine(new[] { word });

        return new TextBlock(new[] { line });
    }
}
}