mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 03:34:52 +08:00
Fix NearestNeighbourWordExtractor for rotated text
This commit is contained in:
@@ -73,51 +73,31 @@
|
||||
/// <summary>
|
||||
/// Create a new <see cref="PdfRectangle"/>.
|
||||
/// </summary>
|
||||
public PdfRectangle(PdfPoint point1, PdfPoint point2) : this(point1.X, point1.Y, point2.X, point2.Y) { }
|
||||
/// <param name="bottomLeft">Bottom left point of the rectangle.</param>
|
||||
/// <param name="topRight">Top right point of the rectangle.</param>
|
||||
public PdfRectangle(PdfPoint bottomLeft, PdfPoint topRight) :
|
||||
this(bottomLeft.X, bottomLeft.Y, topRight.X, topRight.Y)
|
||||
{ }
|
||||
|
||||
/// <summary>
/// Create a new <see cref="PdfRectangle"/> from 16-bit integer coordinates.
/// The values are widened to <see cref="double"/> and forwarded to the
/// four-coordinate <see cref="double"/> constructor.
/// </summary>
/// <param name="x1">X coordinate of the rectangle's bottom left point.</param>
/// <param name="y1">Y coordinate of the rectangle's bottom left point.</param>
/// <param name="x2">X coordinate of the rectangle's top right point.</param>
/// <param name="y2">Y coordinate of the rectangle's top right point.</param>
public PdfRectangle(short x1, short y1, short x2, short y2)
    : this((double)x1, (double)y1, (double)x2, (double)y2)
{
}
|
||||
|
||||
/// <summary>
|
||||
/// Create a new <see cref="PdfRectangle"/>.
|
||||
/// </summary>
|
||||
public PdfRectangle(double x1, double y1, double x2, double y2)
|
||||
{
|
||||
double bottom;
|
||||
double top;
|
||||
|
||||
if (y1 <= y2)
|
||||
{
|
||||
bottom = y1;
|
||||
top = y2;
|
||||
}
|
||||
else
|
||||
{
|
||||
bottom = y2;
|
||||
top = y1;
|
||||
}
|
||||
|
||||
double left;
|
||||
double right;
|
||||
if (x1 <= x2)
|
||||
{
|
||||
left = x1;
|
||||
right = x2;
|
||||
}
|
||||
else
|
||||
{
|
||||
left = x2;
|
||||
right = x1;
|
||||
}
|
||||
|
||||
TopLeft = new PdfPoint(left, top);
|
||||
TopRight = new PdfPoint(right, top);
|
||||
|
||||
BottomLeft = new PdfPoint(left, bottom);
|
||||
BottomRight = new PdfPoint(right, bottom);
|
||||
}
|
||||
/// <param name="x1">Bottom left point's x coordinate of the rectangle.</param>
|
||||
/// <param name="y1">Bottom left point's y coordinate of the rectangle.</param>
|
||||
/// <param name="x2">Top right point's x coordinate of the rectangle.</param>
|
||||
/// <param name="y2">Top right point's y coordinate of the rectangle.</param>
|
||||
public PdfRectangle(double x1, double y1, double x2, double y2) :
|
||||
this(new PdfPoint(x1, y2), new PdfPoint(x2, y2), new PdfPoint(x1, y1), new PdfPoint(x2, y1))
|
||||
{ }
|
||||
|
||||
/// <summary>
|
||||
/// Create a new <see cref="PdfRectangle"/>.
|
||||
|
@@ -51,10 +51,12 @@
|
||||
|
||||
Text = string.Join(" ", words.Where(s => !string.IsNullOrWhiteSpace(s.Text)).Select(x => x.Text));
|
||||
|
||||
var minX = words.Min(x => x.BoundingBox.Left);
|
||||
var minY = words.Min(x => x.BoundingBox.Bottom);
|
||||
var maxX = words.Max(x => x.BoundingBox.Right);
|
||||
var maxY = words.Max(x => x.BoundingBox.Top);
|
||||
var normalisedBoundingBoxes = words.Select(x => NormaliseRectangle(x.BoundingBox)).ToList();
|
||||
var minX = normalisedBoundingBoxes.Min(x => x.Left);
|
||||
var minY = normalisedBoundingBoxes.Min(x => x.Bottom);
|
||||
var maxX = normalisedBoundingBoxes.Max(x => x.Right);
|
||||
var maxY = normalisedBoundingBoxes.Max(x => x.Top);
|
||||
|
||||
BoundingBox = new PdfRectangle(minX, minY, maxX, maxY);
|
||||
|
||||
if (words.All(x => x.TextDirection == words[0].TextDirection))
|
||||
@@ -67,6 +69,14 @@
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds a new <see cref="PdfRectangle"/> from <paramref name="rectangle"/> in which the
/// smaller of Left/Right and the smaller of Bottom/Top are passed as the first corner and the
/// larger values as the second corner.
/// </summary>
/// <remarks>
/// The helper only reads its argument, so it is declared <c>static</c>; callers inside the
/// class (including lambdas) can use it without capturing the current instance.
/// </remarks>
/// <param name="rectangle">The rectangle whose edges may be in any order.</param>
/// <returns>A rectangle created from the ordered minimum and maximum edge values.</returns>
private static PdfRectangle NormaliseRectangle(PdfRectangle rectangle)
{
    var smallestX = Math.Min(rectangle.Left, rectangle.Right);
    var smallestY = Math.Min(rectangle.Bottom, rectangle.Top);
    var largestX = Math.Max(rectangle.Left, rectangle.Right);
    var largestY = Math.Max(rectangle.Bottom, rectangle.Top);

    return new PdfRectangle(smallestX, smallestY, largestX, largestY);
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override string ToString()
|
||||
{
|
||||
|
@@ -38,9 +38,17 @@
|
||||
.OrderByDescending(x => x.BoundingBox.Bottom)
|
||||
.ThenBy(x => x.BoundingBox.Left).ToList();
|
||||
|
||||
List<Word> words270 = GetWords(
|
||||
letters.Where(l => l.TextDirection == TextDirection.Rotate270),
|
||||
(l1, l2) => Math.Max(l1.GlyphRectangle.Width, l2.GlyphRectangle.Width) * 0.2,
|
||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||
.OrderBy(x => x.BoundingBox.Right)
|
||||
.ThenByDescending(x => x.BoundingBox.Bottom).ToList();
|
||||
wordsH.AddRange(words270);
|
||||
|
||||
List<Word> words180 = GetWords(
|
||||
letters.Where(l => l.TextDirection == TextDirection.Rotate180),
|
||||
(l1, l2) => Math.Max(l1.GlyphRectangle.Width, l2.GlyphRectangle.Width) * 0.2,
|
||||
(l1, l2) => Math.Max(Math.Abs(l1.GlyphRectangle.Width), Math.Abs(l2.GlyphRectangle.Width)) * 0.2,
|
||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||
.OrderBy(x => x.BoundingBox.Top)
|
||||
.ThenByDescending(x => x.BoundingBox.Right).ToList();
|
||||
@@ -48,20 +56,12 @@
|
||||
|
||||
List<Word> words90 = GetWords(
|
||||
letters.Where(l => l.TextDirection == TextDirection.Rotate90),
|
||||
(l1, l2) => Math.Max(l1.GlyphRectangle.Height, l2.GlyphRectangle.Height) * 0.2,
|
||||
(l1, l2) => Math.Max(Math.Abs(l1.GlyphRectangle.Width), Math.Abs(l2.GlyphRectangle.Width)) * 0.2,
|
||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||
.OrderByDescending(x => x.BoundingBox.Left)
|
||||
.ThenBy(x => x.BoundingBox.Top).ToList();
|
||||
wordsH.AddRange(words90);
|
||||
|
||||
List<Word> words270 = GetWords(
|
||||
letters.Where(l => l.TextDirection == TextDirection.Rotate270),
|
||||
(l1, l2) => Math.Max(l1.GlyphRectangle.Height, l2.GlyphRectangle.Height) * 0.2,
|
||||
Distances.Manhattan, MaxDegreeOfParallelism)
|
||||
.OrderBy(x => x.BoundingBox.Right)
|
||||
.ThenByDescending(x => x.BoundingBox.Bottom).ToList();
|
||||
wordsH.AddRange(words270);
|
||||
|
||||
List<Word> wordsU = GetWords(
|
||||
letters.Where(l => l.TextDirection == TextDirection.Unknown),
|
||||
(l1, l2) => Math.Max(l1.GlyphRectangle.Width, l2.GlyphRectangle.Width) * 0.2,
|
||||
@@ -85,7 +85,7 @@
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
private List<Word> GetWords(IEnumerable<Letter> pageLetters,
|
||||
public List<Word> GetWords(IEnumerable<Letter> pageLetters,
|
||||
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
int maxDegreeOfParallelism)
|
||||
{
|
||||
@@ -97,6 +97,7 @@
|
||||
throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction.");
|
||||
}
|
||||
|
||||
// TODO: orderFunc should also take into account the edge relationships found by 'ClusterNearestNeighbours'
|
||||
Func<IEnumerable<Letter>, IReadOnlyList<Letter>> orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();
|
||||
if (textDirection == TextDirection.Rotate180)
|
||||
{
|
||||
|
@@ -2,6 +2,7 @@
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using Core;
|
||||
|
||||
@@ -53,11 +54,53 @@
|
||||
|
||||
Letters = letters;
|
||||
|
||||
var tempTextDirection = letters[0].TextDirection;
|
||||
if (letters.Any(l => l.TextDirection != tempTextDirection))
|
||||
{
|
||||
tempTextDirection = TextDirection.Unknown;
|
||||
}
|
||||
|
||||
(string, PdfRectangle) data;
|
||||
|
||||
switch (tempTextDirection)
|
||||
{
|
||||
case TextDirection.Horizontal:
|
||||
data = GetBoundingBoxH(letters);
|
||||
break;
|
||||
|
||||
case TextDirection.Rotate180:
|
||||
data = GetBoundingBox180(letters);
|
||||
break;
|
||||
|
||||
case TextDirection.Rotate90:
|
||||
data = GetBoundingBox90(letters);
|
||||
break;
|
||||
|
||||
case TextDirection.Rotate270:
|
||||
data = GetBoundingBox270(letters);
|
||||
break;
|
||||
|
||||
case TextDirection.Unknown:
|
||||
default:
|
||||
data = GetBoundingBoxH(letters);
|
||||
break;
|
||||
}
|
||||
|
||||
Text = data.Item1;
|
||||
BoundingBox = data.Item2;
|
||||
|
||||
FontName = letters[0].FontName;
|
||||
TextDirection = tempTextDirection;
|
||||
}
|
||||
|
||||
#region Bounding box
|
||||
private (string, PdfRectangle) GetBoundingBoxH(IReadOnlyList<Letter> letters)
|
||||
{
|
||||
var builder = new StringBuilder();
|
||||
|
||||
var minX = double.MaxValue;
|
||||
var minY = double.MaxValue;
|
||||
var maxX = double.MinValue;
|
||||
var minY = double.MaxValue;
|
||||
var maxY = double.MinValue;
|
||||
|
||||
for (var i = 0; i < letters.Count; i++)
|
||||
@@ -65,17 +108,17 @@
|
||||
var letter = letters[i];
|
||||
builder.Append(letter.Value);
|
||||
|
||||
if (letter.Location.X < minX)
|
||||
if (letter.StartBaseLine.X < minX)
|
||||
{
|
||||
minX = letter.Location.X;
|
||||
minX = letter.StartBaseLine.X;
|
||||
}
|
||||
|
||||
if (letter.Location.Y < minY)
|
||||
if (letter.StartBaseLine.Y < minY)
|
||||
{
|
||||
minY = letter.Location.Y;
|
||||
minY = letter.StartBaseLine.Y;
|
||||
}
|
||||
|
||||
var right = letter.Location.X + letter.Width;
|
||||
var right = letter.StartBaseLine.X + letter.GlyphRectangle.Width;
|
||||
if (right > maxX)
|
||||
{
|
||||
maxX = right;
|
||||
@@ -87,13 +130,133 @@
|
||||
}
|
||||
}
|
||||
|
||||
Text = builder.ToString();
|
||||
BoundingBox = new PdfRectangle(minX, minY, maxX, maxY);
|
||||
|
||||
FontName = letters[0].FontName;
|
||||
TextDirection = letters[0].TextDirection;
|
||||
return (builder.ToString(), new PdfRectangle(minX, minY, maxX, maxY));
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the text and the bounding box used for a word whose letters all have the
/// Rotate180 text direction.
/// </summary>
/// <remarks>
/// The text is the concatenation of every letter's value, in list order. The rectangle is
/// created from: the largest baseline start X, the largest baseline start Y, the smallest
/// value of (baseline start X + glyph rectangle width) and the smallest glyph rectangle Top.
/// NOTE(review): this presumably relies on the glyph rectangle width being negative for
/// 180 degree rotated glyphs - confirm against the glyph rectangle implementation.
/// </remarks>
/// <param name="letters">The letters making up the word.</param>
/// <returns>The concatenated text and the computed rectangle.</returns>
private (string, PdfRectangle) GetBoundingBox180(IReadOnlyList<Letter> letters)
{
    var text = new StringBuilder();

    var largestStartX = double.MinValue;
    var largestStartY = double.MinValue;
    var smallestEndX = double.MaxValue;
    var smallestTop = double.MaxValue;

    foreach (var current in letters)
    {
        text.Append(current.Value);

        var start = current.StartBaseLine;
        var glyph = current.GlyphRectangle;

        // Plain comparisons (rather than Math.Max/Math.Min) keep the running value
        // untouched when a coordinate is NaN.
        largestStartX = start.X > largestStartX ? start.X : largestStartX;
        largestStartY = start.Y > largestStartY ? start.Y : largestStartY;

        var endX = start.X + glyph.Width;
        smallestEndX = endX < smallestEndX ? endX : smallestEndX;

        smallestTop = glyph.Top < smallestTop ? glyph.Top : smallestTop;
    }

    var box = new PdfRectangle(largestStartX, largestStartY, smallestEndX, smallestTop);
    return (text.ToString(), box);
}
|
||||
|
||||
/// <summary>
/// Builds the text and the bounding box used for a word whose letters all have the
/// Rotate90 text direction.
/// </summary>
/// <remarks>
/// The text is the concatenation of every letter's value, in list order. Four extremes are
/// tracked: the smallest baseline start X, the smallest baseline end Y, the largest value of
/// (baseline start X - glyph rectangle width) and the largest glyph rectangle Top. They are
/// passed to the four-point <see cref="PdfRectangle"/> constructor in the order
/// (maxX, maxY), (maxX, minY), (minX, maxY), (minX, minY).
/// </remarks>
/// <param name="letters">The letters making up the word.</param>
/// <returns>The concatenated text and the computed rectangle.</returns>
private (string, PdfRectangle) GetBoundingBox90(IReadOnlyList<Letter> letters)
{
    var text = new StringBuilder();

    var lowX = double.MaxValue;
    var highX = double.MinValue;
    var lowY = double.MaxValue;
    var highY = double.MinValue;

    foreach (var current in letters)
    {
        text.Append(current.Value);

        var startX = current.StartBaseLine.X;
        var endY = current.EndBaseLine.Y;
        var glyph = current.GlyphRectangle;

        // Plain comparisons (rather than Math.Max/Math.Min) keep the running value
        // untouched when a coordinate is NaN.
        lowX = startX < lowX ? startX : lowX;
        lowY = endY < lowY ? endY : lowY;

        var shiftedX = startX - glyph.Width;
        highX = shiftedX > highX ? shiftedX : highX;

        highY = glyph.Top > highY ? glyph.Top : highY;
    }

    var box = new PdfRectangle(
        new PdfPoint(highX, highY),
        new PdfPoint(highX, lowY),
        new PdfPoint(lowX, highY),
        new PdfPoint(lowX, lowY));

    return (text.ToString(), box);
}
|
||||
|
||||
/// <summary>
/// Builds the text and the bounding box used for a word whose letters all have the
/// Rotate270 text direction.
/// </summary>
/// <remarks>
/// The text is the concatenation of every letter's value, in list order. Four extremes are
/// tracked: the largest baseline start X, the smallest baseline start Y, the smallest value of
/// (baseline start X - glyph rectangle width) and the largest glyph rectangle Bottom. They are
/// passed to the four-point <see cref="PdfRectangle"/> constructor in the order
/// (minX, minY), (minX, maxY), (maxX, minY), (maxX, maxY).
/// </remarks>
/// <param name="letters">The letters making up the word.</param>
/// <returns>The concatenated text and the computed rectangle.</returns>
private (string, PdfRectangle) GetBoundingBox270(IReadOnlyList<Letter> letters)
{
    var text = new StringBuilder();

    var highX = double.MinValue;
    var lowX = double.MaxValue;
    var lowY = double.MaxValue;
    var highY = double.MinValue;

    foreach (var current in letters)
    {
        text.Append(current.Value);

        var start = current.StartBaseLine;
        var glyph = current.GlyphRectangle;

        // Plain comparisons (rather than Math.Max/Math.Min) keep the running value
        // untouched when a coordinate is NaN.
        highX = start.X > highX ? start.X : highX;
        lowY = start.Y < lowY ? start.Y : lowY;

        var shiftedX = start.X - glyph.Width;
        lowX = shiftedX < lowX ? shiftedX : lowX;

        highY = glyph.Bottom > highY ? glyph.Bottom : highY;
    }

    var box = new PdfRectangle(
        new PdfPoint(lowX, lowY),
        new PdfPoint(lowX, highY),
        new PdfPoint(highX, lowY),
        new PdfPoint(highX, highY));

    return (text.ToString(), box);
}
|
||||
#endregion
|
||||
|
||||
/// <inheritdoc />
|
||||
public override string ToString()
|
||||
{
|
||||
|
Reference in New Issue
Block a user