2020-01-05 09:19:58 +00:00
|
|
|
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
2019-08-10 16:01:27 +01:00
|
|
|
|
{
|
2020-01-10 18:08:33 +00:00
|
|
|
|
using Content;
|
|
|
|
|
|
using Core;
|
2020-01-04 16:38:18 +00:00
|
|
|
|
using System;
|
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
|
using System.Linq;
|
|
|
|
|
|
|
2019-08-10 16:01:27 +01:00
|
|
|
|
/// <summary>
/// A block of text, made of one or more <see cref="TextLine"/>s.
/// </summary>
public class TextBlock
{
    /// <summary>
    /// The separator used between lines in the block when building <see cref="Text"/>.
    /// </summary>
    public readonly string Separator;

    /// <summary>
    /// The text of the block: the text of every line, joined with <see cref="Separator"/>.
    /// </summary>
    public string Text { get; }

    /// <summary>
    /// The text orientation of the block. It is taken from the first line of the block.
    /// </summary>
    public TextOrientation TextOrientation { get; }

    /// <summary>
    /// The rectangle completely containing the block.
    /// </summary>
    public PdfRectangle BoundingBox { get; }

    /// <summary>
    /// The text lines contained in the block.
    /// </summary>
    public IReadOnlyList<TextLine> TextLines { get; }

    /// <summary>
    /// The reading order index. Starts at 0. A value of -1 means the block is not ordered.
    /// </summary>
    public int ReadingOrder { get; private set; }

    /// <summary>
    /// Create a new <see cref="TextBlock"/>.
    /// </summary>
    /// <param name="lines">The lines contained in the block, in the correct order.</param>
    /// <param name="separator">The separator used between lines in the block.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="lines"/> is null.</exception>
    /// <exception cref="ArgumentException">Thrown when <paramref name="lines"/> is empty.</exception>
    public TextBlock(IReadOnlyList<TextLine> lines, string separator = "\n")
    {
        if (lines == null)
        {
            throw new ArgumentNullException(nameof(lines));
        }

        if (lines.Count == 0)
        {
            throw new ArgumentException("Empty lines provided.", nameof(lines));
        }

        Separator = separator;

        // A freshly created block has not been given a reading order yet.
        ReadingOrder = -1;

        TextLines = lines;

        // Join with the configured separator rather than a hard-coded space, so that
        // Text agrees with the Separator exposed by this block.
        // string.Join treats a null separator as an empty string.
        Text = string.Join(separator, lines.Select(x => x.Text));

        // The block's box spans the extreme edges of all of its lines' bounding boxes.
        var minX = lines.Min(x => x.BoundingBox.Left);
        var minY = lines.Min(x => x.BoundingBox.Bottom);
        var maxX = lines.Max(x => x.BoundingBox.Right);
        var maxY = lines.Max(x => x.BoundingBox.Top);
        BoundingBox = new PdfRectangle(minX, minY, maxX, maxY);

        // The first line decides the orientation of the whole block; other lines are not inspected.
        TextOrientation = lines[0].TextOrientation;
    }

    /// <summary>
    /// Sets the <see cref="TextBlock"/>'s reading order.
    /// </summary>
    /// <param name="readingOrder">The reading order index, starting at 0. Use -1 to mark the block as not ordered.</param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="readingOrder"/> is less than -1.</exception>
    public void SetReadingOrder(int readingOrder)
    {
        if (readingOrder < -1)
        {
            throw new ArgumentException("The reading order should be more or equal to -1. A value of -1 means the block is not ordered.", nameof(readingOrder));
        }

        ReadingOrder = readingOrder;
    }

    /// <inheritdoc />
    public override string ToString()
    {
        return Text;
    }
}
|
|
|
|
|
|
}
|