Make DlaOptions an interface, add IWordExtractorOptions, remove GetBlocks(words, options), GetWords(letters, options) and put options in constructors - Fix #424. Tidy up code

This commit is contained in:
BobLD
2022-03-12 13:23:40 +00:00
parent b0a5f4c8d0
commit 05aba1cfe7
17 changed files with 307 additions and 213 deletions

View File

@@ -2,7 +2,6 @@
{ {
using Alto; using Alto;
using Content; using Content;
using Core;
using DocumentLayoutAnalysis; using DocumentLayoutAnalysis;
using System; using System;
using System.Globalization; using System.Globalization;
@@ -147,7 +146,7 @@
altoPage.PrintSpace.TextBlock = blocks; altoPage.PrintSpace.TextBlock = blocks;
altoPage.PrintSpace.Illustrations = page.GetImages().Select(i => ToAltoIllustration(i, page.Height)).ToArray(); altoPage.PrintSpace.Illustrations = page.GetImages().Select(i => ToAltoIllustration(i, page.Height)).ToArray();
if (includePaths) if (includePaths)
{ {
altoPage.PrintSpace.GraphicalElements = page.ExperimentalAccess.Paths altoPage.PrintSpace.GraphicalElements = page.ExperimentalAccess.Paths
@@ -288,7 +287,7 @@
ProcessingSoftware = new AltoDocument.AltoProcessingSoftware ProcessingSoftware = new AltoDocument.AltoProcessingSoftware
{ {
SoftwareName = "PdfPig", SoftwareName = "PdfPig",
SoftwareCreator = @"https://github.com/UglyToad/PdfPig", SoftwareCreator = "https://github.com/UglyToad/PdfPig",
ApplicationDescription = "Read and extract text and other content from PDFs in C# (port of PdfBox)", ApplicationDescription = "Read and extract text and other content from PDFs in C# (port of PdfBox)",
SoftwareVersion = "x.x.xx" SoftwareVersion = "x.x.xx"
}, },

View File

@@ -49,7 +49,7 @@
/// </summary> /// </summary>
/// <param name="document">The document.</param> /// <param name="document">The document.</param>
/// <param name="includePaths">Draw PdfPaths present in the page.</param> /// <param name="includePaths">Draw PdfPaths present in the page.</param>
/// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the /// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the
/// interface to a plain hOCR file.<para>See https://github.com/kba/hocrjs for more information.</para></param> /// interface to a plain hOCR file.<para>See https://github.com/kba/hocrjs for more information.</para></param>
public string Get(PdfDocument document, bool includePaths = false, bool useHocrjs = false) public string Get(PdfDocument document, bool includePaths = false, bool useHocrjs = false)
{ {
@@ -61,10 +61,13 @@
hocr += GetCode(page, includePaths) + "\n"; hocr += GetCode(page, includePaths) + "\n";
} }
if (useHocrjs) hocr += indentChar + indentChar + Hocrjs; if (useHocrjs)
{
hocr += indentChar + indentChar + Hocrjs;
}
hocr += indentChar + "</body>"; hocr += indentChar + "</body>";
hocr = XmlHeader + AddHtmlHeader(hocr); return XmlHeader + AddHtmlHeader(hocr);
return hocr;
} }
/// <summary> /// <summary>
@@ -80,8 +83,8 @@
/// Get the hOCR (HTML) string of the page layout. /// Get the hOCR (HTML) string of the page layout.
/// </summary> /// </summary>
/// <param name="page">The page.</param> /// <param name="page">The page.</param>
/// <param name="imageName">The image name, if any.</param>
/// <param name="includePaths">Draw PdfPaths present in the page.</param> /// <param name="includePaths">Draw PdfPaths present in the page.</param>
/// <param name="imageName">The image name, if any.</param>
/// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the interface to a plain hOCR file.<para>See https://github.com/kba/hocrjs for more information.</para></param> /// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the interface to a plain hOCR file.<para>See https://github.com/kba/hocrjs for more information.</para></param>
public string Get(Page page, bool includePaths = false, string imageName = "unknown", bool useHocrjs = false) public string Get(Page page, bool includePaths = false, string imageName = "unknown", bool useHocrjs = false)
{ {
@@ -89,10 +92,13 @@
hocr += GetCode(page, includePaths, imageName) + "\n"; hocr += GetCode(page, includePaths, imageName) + "\n";
if (useHocrjs) hocr += indentChar + indentChar + Hocrjs; if (useHocrjs)
{
hocr += indentChar + indentChar + Hocrjs;
}
hocr += indentChar + "</body>"; hocr += indentChar + "</body>";
hocr = XmlHeader + AddHtmlHeader(hocr); return XmlHeader + AddHtmlHeader(hocr);
return hocr;
} }
private string GetHead() private string GetHead()
@@ -129,14 +135,14 @@
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para> /// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para>
/// </summary> /// </summary>
/// <param name="page"></param> /// <param name="page"></param>
/// <param name="imageName"></param>
/// <param name="includePaths">Draw PdfPaths present in the page.</param> /// <param name="includePaths">Draw PdfPaths present in the page.</param>
/// <param name="imageName"></param>
private string GetCode(Page page, bool includePaths, string imageName = "unknown") private string GetCode(Page page, bool includePaths, string imageName = "unknown")
{ {
pageCount++; pageCount++;
int level = 2; int level = 2;
string hocr = GetIndent(level) + @"<div class='ocr_page' id='page_" + page.Number.ToString() + string hocr = GetIndent(level) + "<div class='ocr_page' id='page_" + page.Number.ToString() +
"' title='image \"" + imageName + "\"; bbox 0 0 " + "' title='image \"" + imageName + "\"; bbox 0 0 " +
(int)Math.Round(page.Width * scale) + " " + (int)Math.Round(page.Height * scale) + (int)Math.Round(page.Width * scale) + " " + (int)Math.Round(page.Height * scale) +
"; ppageno " + (page.Number - 1) + "\'>"; "; ppageno " + (page.Number - 1) + "\'>";
@@ -156,16 +162,15 @@
var words = page.GetWords(wordExtractor); var words = page.GetWords(wordExtractor);
if (words.Count() > 0) if (words.Any())
{ {
var blocks = pageSegmenter.GetBlocks(words); foreach (var block in pageSegmenter.GetBlocks(words))
foreach (var block in blocks)
{ {
hocr += "\n" + GetCodeArea(block, page.Height, level + 1); hocr += "\n" + GetCodeArea(block, page.Height, level + 1);
} }
} }
hocr += "\n" + GetIndent(level) + @"</div>"; hocr += "\n" + GetIndent(level) + "</div>";
return hocr; return hocr;
} }
@@ -179,7 +184,10 @@
/// <param name="level">The indent level.</param> /// <param name="level">The indent level.</param>
private string GetCode(PdfPath path, double pageHeight, bool subPaths, int level) private string GetCode(PdfPath path, double pageHeight, bool subPaths, int level)
{ {
if (path == null) return string.Empty; if (path == null)
{
return string.Empty;
}
string hocr = string.Empty; string hocr = string.Empty;
@@ -189,7 +197,7 @@
if (bbox.HasValue) if (bbox.HasValue)
{ {
areaCount++; areaCount++;
hocr += GetIndent(level) + @"<div class='ocr_carea' id='block_" + pageCount + "_" hocr += GetIndent(level) + "<div class='ocr_carea' id='block_" + pageCount + "_"
+ areaCount + "' title='" + GetCode(bbox.Value, pageHeight) + "'>\n"; + areaCount + "' title='" + GetCode(bbox.Value, pageHeight) + "'>\n";
foreach (var subPath in path) foreach (var subPath in path)
{ {
@@ -197,11 +205,11 @@
if (subBbox.HasValue) if (subBbox.HasValue)
{ {
pathCount++; pathCount++;
hocr += GetIndent(level + 1) + @"<span class='ocr_linedrawing' id='drawing_" + pageCount + "_" hocr += GetIndent(level + 1) + "<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
+ pathCount + "' title='" + GetCode(subBbox.Value, pageHeight) + "' />\n"; + pathCount + "' title='" + GetCode(subBbox.Value, pageHeight) + "' />\n";
} }
} }
hocr += GetIndent(level) + @"</div>"; hocr += GetIndent(level) + "</div>";
} }
} }
else else
@@ -210,7 +218,7 @@
if (bbox.HasValue) if (bbox.HasValue)
{ {
pathCount++; pathCount++;
hocr += GetIndent(level) + @"<span class='ocr_linedrawing' id='drawing_" + pageCount + "_" hocr += GetIndent(level) + "<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
+ pathCount + "' title='" + GetCode(bbox.Value, pageHeight) + "' />"; + pathCount + "' title='" + GetCode(bbox.Value, pageHeight) + "' />";
} }
} }
@@ -222,7 +230,7 @@
{ {
imageCount++; imageCount++;
var bbox = pdfImage.Bounds; var bbox = pdfImage.Bounds;
return GetIndent(level) + @"<span class='ocr_image' id='image_" + pageCount + "_" return GetIndent(level) + "<span class='ocr_image' id='image_" + pageCount + "_"
+ imageCount + "' title='" + GetCode(bbox, pageHeight) + "' />"; + imageCount + "' title='" + GetCode(bbox, pageHeight) + "' />";
} }
@@ -237,12 +245,11 @@
{ {
areaCount++; areaCount++;
string bbox = GetCode(block.BoundingBox, pageHeight); string hocr = GetIndent(level) + "<div class='ocr_carea' id='block_" + pageCount + "_"
string hocr = GetIndent(level) + @"<div class='ocr_carea' id='block_" + pageCount + "_"
+ areaCount + "' title='" + GetCode(block.BoundingBox, pageHeight) + "'>"; + areaCount + "' title='" + GetCode(block.BoundingBox, pageHeight) + "'>";
hocr += GetCodeParagraph(block, pageHeight, level + 1); // we concider 1 area = 1 block. should change in the future hocr += GetCodeParagraph(block, pageHeight, level + 1); // we concider 1 area = 1 block. should change in the future
hocr += "\n" + GetIndent(level) + @"</div>"; hocr += "\n" + GetIndent(level) + "</div>";
return hocr; return hocr;
} }
@@ -256,14 +263,14 @@
private string GetCodeParagraph(TextBlock block, double pageHeight, int level) private string GetCodeParagraph(TextBlock block, double pageHeight, int level)
{ {
paraCount++; paraCount++;
string hocr = "\n" + GetIndent(level) + @"<p class='ocr_par' id='par_" + pageCount + "_" string hocr = "\n" + GetIndent(level) + "<p class='ocr_par' id='par_" + pageCount + "_"
+ paraCount + "' title='" + GetCode(block.BoundingBox, pageHeight) + "'>"; // lang='eng' + paraCount + "' title='" + GetCode(block.BoundingBox, pageHeight) + "'>"; // lang='eng'
foreach (var line in block.TextLines) foreach (var line in block.TextLines)
{ {
hocr += "\n" + GetCode(line, pageHeight, level + 1); hocr += "\n" + GetCode(line, pageHeight, level + 1);
} }
hocr += "\n" + GetIndent(level) + @"</p>"; hocr += "\n" + GetIndent(level) + "</p>";
return hocr; return hocr;
} }
@@ -285,14 +292,14 @@
double baseLine = (double)line.Words[0].Letters[0].StartBaseLine.Y; double baseLine = (double)line.Words[0].Letters[0].StartBaseLine.Y;
baseLine = (double)line.BoundingBox.Bottom - baseLine; baseLine = (double)line.BoundingBox.Bottom - baseLine;
string hocr = GetIndent(level) + @"<span class='ocr_line' id='line_" + pageCount + "_" + lineCount + "' title='" + string hocr = GetIndent(level) + "<span class='ocr_line' id='line_" + pageCount + "_" + lineCount + "' title='" +
GetCode(line.BoundingBox, pageHeight) + "; baseline " + angle + " 0'>"; //"; x_size 42; x_descenders 5; x_ascenders 12' >"; GetCode(line.BoundingBox, pageHeight) + "; baseline " + angle + " 0'>"; //"; x_size 42; x_descenders 5; x_ascenders 12' >";
foreach (var word in line.Words) foreach (var word in line.Words)
{ {
hocr += "\n" + GetCode(word, pageHeight, level + 1); hocr += "\n" + GetCode(word, pageHeight, level + 1);
} }
hocr += "\n" + GetIndent(level) + @"</span>"; hocr += "\n" + GetIndent(level) + "</span>";
return hocr; return hocr;
} }
@@ -307,7 +314,7 @@
{ {
wordCount++; wordCount++;
string hocr = GetIndent(level) + string hocr = GetIndent(level) +
@"<span class='ocrx_word' id='word_" + pageCount + "_" + wordCount + "<span class='ocrx_word' id='word_" + pageCount + "_" + wordCount +
"' title='" + GetCode(word.BoundingBox, pageHeight) + "; x_wconf " + GetConfidence(word); "' title='" + GetCode(word.BoundingBox, pageHeight) + "; x_wconf " + GetConfidence(word);
hocr += "; x_font " + word.FontName; hocr += "; x_font " + word.FontName;
@@ -343,7 +350,7 @@
var right = (int)Math.Round(rectangle.Right * scale); var right = (int)Math.Round(rectangle.Right * scale);
var bottom = (int)Math.Round((pageHeight - rectangle.Bottom) * scale); var bottom = (int)Math.Round((pageHeight - rectangle.Bottom) * scale);
return @"bbox " + (left > 0 ? left : 0) + " " return "bbox " + (left > 0 ? left : 0) + " "
+ (top > 0 ? top : 0) + " " + (top > 0 ? top : 0) + " "
+ (right > 0 ? right : 0) + " " + (right > 0 ? right : 0) + " "
+ (bottom > 0 ? bottom : 0); + (bottom > 0 ? bottom : 0);

View File

@@ -147,7 +147,7 @@
/// <summary> /// <summary>
/// PageXml Text colour in RGB encoded format /// PageXml Text colour in RGB encoded format
/// <para>(red value) + (256 x green value) + (65536 x blue value).</para> /// <para>(red value) + (256 x green value) + (65536 x blue value).</para>
/// </summary> /// </summary>
private string ToRgbEncoded(IColor color) private string ToRgbEncoded(IColor color)
{ {
@@ -184,7 +184,7 @@
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height))); regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height)));
if (orderedRegions.Any()) if (orderedRegions.Count > 0)
{ {
pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder() pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder()
{ {
@@ -206,7 +206,7 @@
if (includePaths) if (includePaths)
{ {
var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Width, page.Height)); var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Width, page.Height));
if (graphicalElements.Where(g => g != null).Count() > 0) if (graphicalElements.Count(g => g != null) > 0)
{ {
regions.AddRange(graphicalElements.Where(g => g != null)); regions.AddRange(graphicalElements.Where(g => g != null));
} }

View File

@@ -53,7 +53,7 @@
builder.Append("</g></svg>"); builder.Append("</g></svg>");
return builder.ToString(); return builder.ToString();
} }
private static string LetterToSvg(Letter l, double height, XmlDocument doc) private static string LetterToSvg(Letter l, double height, XmlDocument doc)
{ {
string fontFamily = GetFontFamily(l.FontName, out string style, out string weight); string fontFamily = GetFontFamily(l.FontName, out string style, out string weight);
@@ -69,7 +69,7 @@
var x = Math.Round(l.StartBaseLine.X, Rounding); var x = Math.Round(l.StartBaseLine.X, Rounding);
var y = Math.Round(height - l.StartBaseLine.Y, Rounding); var y = Math.Round(height - l.StartBaseLine.Y, Rounding);
return $"<text x='{x}' y='{y}'{rotation} font-family='{fontFamily}' font-style='{style}' font-weight='{weight}' {fontSize} fill='{ColorToSvg(l.Color)}'>{safeValue}</text>" return $"<text x='{x}' y='{y}'{rotation} font-family='{fontFamily}' font-style='{style}' font-weight='{weight}' {fontSize} fill='{ColorToSvg(l.Color)}'>{safeValue}</text>"
+ Environment.NewLine; + Environment.NewLine;
} }
@@ -123,7 +123,11 @@
} }
} }
if (Fonts.ContainsKey(fontName)) fontName = Fonts[fontName]; if (Fonts.ContainsKey(fontName))
{
fontName = Fonts[fontName];
}
return fontName; return fontName;
} }
@@ -136,7 +140,11 @@
private static string ColorToSvg(IColor color) private static string ColorToSvg(IColor color)
{ {
if (color == null) return ""; if (color == null)
{
return string.Empty;
}
var (r, g, b) = color.ToRGBValues(); var (r, g, b) = color.ToRGBValues();
return $"rgb({Math.Ceiling(r * 255)},{Math.Ceiling(g * 255)},{Math.Ceiling(b * 255)})"; return $"rgb({Math.Ceiling(r * 255)},{Math.Ceiling(g * 255)},{Math.Ceiling(b * 255)})";
} }

View File

@@ -1,16 +1,15 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{ {
/// <summary> /// <summary>
/// Abstract class that stores options that configure the operation of methods of the document layout analysis algorithm. /// Interface that stores options that configure the operation of methods of the document layout analysis algorithm.
/// </summary> /// </summary>
public abstract class DlaOptions public interface IDlaOptions
{ {
/// <summary> /// <summary>
/// Gets or sets the maximum number of concurrent tasks enabled. /// Gets or sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value. /// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para> /// If it is -1, there is no limit on the number of concurrently running operations.</para>
/// <para>Default value is -1.</para>
/// </summary> /// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1; int MaxDegreeOfParallelism { get; set; }
} }
} }

View File

@@ -11,47 +11,67 @@
/// </summary> /// </summary>
public class DefaultPageSegmenter : IPageSegmenter public class DefaultPageSegmenter : IPageSegmenter
{ {
private readonly DefaultPageSegmenterOptions options;
/// <summary> /// <summary>
/// Create an instance of default page segmenter, <see cref="DefaultPageSegmenter"/>. /// Create an instance of default page segmenter, <see cref="DefaultPageSegmenter"/>.
/// </summary> /// </summary>
public static DefaultPageSegmenter Instance { get; } = new DefaultPageSegmenter(); public static DefaultPageSegmenter Instance { get; } = new DefaultPageSegmenter();
/// <summary> /// <summary>
/// Get the blocks using default options values. /// Create an instance of default page segmenter using default options values.
/// </summary> /// </summary>
/// <param name="words">The page's words to generate text blocks for.</param> public DefaultPageSegmenter() : this(new DefaultPageSegmenterOptions())
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{ {
return GetBlocks(words, new DefaultPageSegmenterOptions()); }
/// <summary>
/// Create using options values.
/// </summary>
/// <param name="options">The <see cref="DefaultPageSegmenterOptions"/> to use.</param>
/// <exception cref="ArgumentNullException"></exception>
public DefaultPageSegmenter(DefaultPageSegmenterOptions options)
{
this.options = options ?? throw new ArgumentNullException(nameof(options));
} }
/// <summary> /// <summary>
/// Get the text blocks using options. /// Get the text blocks using options.
/// </summary> /// </summary>
/// <param name="words">The page's words to generate text blocks for.</param> /// <param name="words">The page's words to generate text blocks for.</param>
/// <param name="options">The <see cref="DefaultPageSegmenterOptions"/> to use.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the default method.</returns> /// <returns>The <see cref="TextBlock"/>s generated by the default method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options) public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{ {
if (options is DefaultPageSegmenterOptions dOptions) if (words?.Any() != true)
{ {
if (words?.Any() != true) return EmptyArray<TextBlock>.Instance;
{ }
return EmptyArray<TextBlock>.Instance;
}
return new List<TextBlock>() { new TextBlock(new XYLeaf(words).GetLines(dOptions.WordSeparator), dOptions.LineSeparator) }; return new List<TextBlock>() { new TextBlock(new XYLeaf(words).GetLines(options.WordSeparator), options.LineSeparator) };
}
else
{
throw new ArgumentException("Options provided must be of type " + nameof(DefaultPageSegmenterOptions) + ".", nameof(options));
}
} }
/// <summary> /// <summary>
/// Default page segmenter options. /// Default page segmenter options.
/// </summary> /// </summary>
public class DefaultPageSegmenterOptions : PageSegmenterOptions public class DefaultPageSegmenterOptions : IPageSegmenterOptions
{ } {
/// <summary>
/// <inheritdoc/>
/// Default value is -1.
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
/// <summary>
/// <inheritdoc/>
/// <para>Default value is ' ' (space).</para>
/// </summary>
public string WordSeparator { get; set; } = " ";
/// <summary>
/// <inheritdoc/>
/// <para>Default value is '\n' (new line).</para>
/// </summary>
public string LineSeparator { get; set; } = "\n";
}
} }
} }

View File

@@ -17,48 +17,49 @@
/// </summary> /// </summary>
public class DocstrumBoundingBoxes : IPageSegmenter public class DocstrumBoundingBoxes : IPageSegmenter
{ {
private readonly DocstrumBoundingBoxesOptions options;
/// <summary> /// <summary>
/// Create an instance of Docstrum for bounding boxes page segmenter, <see cref="DocstrumBoundingBoxes"/>. /// Create an instance of Docstrum for bounding boxes page segmenter, <see cref="DocstrumBoundingBoxes"/>.
/// </summary> /// </summary>
public static DocstrumBoundingBoxes Instance { get; } = new DocstrumBoundingBoxes(); public static DocstrumBoundingBoxes Instance { get; } = new DocstrumBoundingBoxes();
/// <summary> /// <summary>
/// Get the blocks using default options values. /// Create an instance of Docstrum for bounding boxes page segmenter using default options values.
/// </summary>
public DocstrumBoundingBoxes() : this(new DocstrumBoundingBoxesOptions())
{
}
/// <summary>
/// Create an instance of Docstrum for bounding boxes page segmenter using options values.
/// </summary>
/// <param name="options">The <see cref="DocstrumBoundingBoxesOptions"/> to use.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="options"/> is null.</exception>
public DocstrumBoundingBoxes(DocstrumBoundingBoxesOptions options)
{
this.options = options ?? throw new ArgumentNullException(nameof(options));
}
/// <summary>
/// Get the blocks.
/// </summary> /// </summary>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param> /// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns> /// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words) public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{ {
return GetBlocks(words, new DocstrumBoundingBoxesOptions()); if (words?.Any() != true)
}
/// <summary>
/// Get the blocks using options values.
/// </summary>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="options">The <see cref="DocstrumBoundingBoxesOptions"/> to use.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
{
if (options is DocstrumBoundingBoxesOptions dbbOptions)
{ {
if (words?.Any() != true) return EmptyArray<TextBlock>.Instance;
{ }
return EmptyArray<TextBlock>.Instance;
}
return GetBlocks(words.ToList(), return GetBlocks(words.ToList(),
dbbOptions.WithinLineBounds, dbbOptions.WithinLineMultiplier, dbbOptions.WithinLineBinSize, options.WithinLineBounds, options.WithinLineMultiplier, options.WithinLineBinSize,
dbbOptions.BetweenLineBounds, dbbOptions.BetweenLineMultiplier, dbbOptions.BetweenLineBinSize, options.BetweenLineBounds, options.BetweenLineMultiplier, options.BetweenLineBinSize,
dbbOptions.AngularDifferenceBounds, options.AngularDifferenceBounds,
dbbOptions.Epsilon, options.Epsilon,
dbbOptions.WordSeparator, dbbOptions.LineSeparator, options.WordSeparator, options.LineSeparator,
dbbOptions.MaxDegreeOfParallelism); options.MaxDegreeOfParallelism);
}
else
{
throw new ArgumentException("Options provided must be of type " + nameof(DocstrumBoundingBoxesOptions) + ".", nameof(options));
}
} }
/// <summary> /// <summary>
@@ -612,8 +613,26 @@
/// <summary> /// <summary>
/// Docstrum bounding boxes page segmenter options. /// Docstrum bounding boxes page segmenter options.
/// </summary> /// </summary>
public class DocstrumBoundingBoxesOptions : PageSegmenterOptions public class DocstrumBoundingBoxesOptions : IPageSegmenterOptions
{ {
/// <summary>
/// <inheritdoc/>
/// Default value is -1.
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
/// <summary>
/// <inheritdoc/>
/// <para>Default value is ' ' (space).</para>
/// </summary>
public string WordSeparator { get; set; } = " ";
/// <summary>
/// <inheritdoc/>
/// <para>Default value is '\n' (new line).</para>
/// </summary>
public string LineSeparator { get; set; } = "\n";
/// <summary> /// <summary>
/// Precision when testing equalities. /// Precision when testing equalities.
/// <para>Default value is 1e-3.</para> /// <para>Default value is 1e-3.</para>
@@ -640,7 +659,6 @@
/// </summary> /// </summary>
public int WithinLineBinSize { get; set; } = 10; public int WithinLineBinSize { get; set; } = 10;
/// <summary> /// <summary>
/// Angle bounds for words to be considered as neighbours on separate lines. /// Angle bounds for words to be considered as neighbours on separate lines.
/// <para>Default value is 45 ≤ θ ≤ 135.</para> /// <para>Default value is 45 ≤ θ ≤ 135.</para>

View File

@@ -10,18 +10,10 @@
public interface IPageSegmenter public interface IPageSegmenter
{ {
/// <summary> /// <summary>
/// Get the blocks using default options values. /// Get the blocks.
/// </summary> /// </summary>
/// <param name="words">The page's words to generate text blocks for.</param> /// <param name="words">The page's words to generate text blocks for.</param>
/// <returns>A list of text blocks from this approach.</returns> /// <returns>A list of text blocks from this approach.</returns>
IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words); IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words);
/// <summary>
/// Get the text blocks using options.
/// </summary>
/// <param name="words">The page's words to generate text blocks for.</param>
/// <param name="options"></param>
/// <returns>A list of text blocks from this approach.</returns>
IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options);
} }
} }

View File

@@ -1,20 +1,18 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
{ {
/// <summary> /// <summary>
/// Abstract page segmenter options. /// Page segmenter options interface.
/// </summary> /// </summary>
public abstract class PageSegmenterOptions : DlaOptions public interface IPageSegmenterOptions : IDlaOptions
{ {
/// <summary> /// <summary>
/// Separator used between words when building lines. /// Separator used between words when building lines.
/// <para>Default value is ' ' (space).</para>
/// </summary> /// </summary>
public string WordSeparator { get; set; } = " "; string WordSeparator { get; set; }
/// <summary> /// <summary>
/// Separator used between lines when building paragraphs. /// Separator used between lines when building paragraphs.
/// <para>Default value is '\n' (new line).</para>
/// </summary> /// </summary>
public string LineSeparator { get; set; } = "\n"; string LineSeparator { get; set; }
} }
} }

View File

@@ -15,47 +15,48 @@
/// </summary> /// </summary>
public class RecursiveXYCut : IPageSegmenter public class RecursiveXYCut : IPageSegmenter
{ {
private readonly RecursiveXYCutOptions options;
/// <summary> /// <summary>
/// Create an instance of Recursive X-Y Cut page segmenter, <see cref="RecursiveXYCut"/>. /// Create an instance of Recursive X-Y Cut page segmenter, <see cref="RecursiveXYCut"/>.
/// </summary> /// </summary>
public static RecursiveXYCut Instance { get; } = new RecursiveXYCut(); public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();
/// <summary> /// <summary>
/// Get the blocks using default options values. /// Create an instance of Recursive X-Y Cut page segmenter using default options values.
/// </summary>
public RecursiveXYCut() : this(new RecursiveXYCutOptions())
{
}
/// <summary>
/// Create an instance of Recursive X-Y Cut page segmenter using options values.
/// </summary>
/// <param name="options">The <see cref="RecursiveXYCutOptions"/> to use.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="options"/> is null.</exception>
public RecursiveXYCut(RecursiveXYCutOptions options)
{
this.options = options ?? throw new ArgumentNullException(nameof(options));
}
/// <summary>
/// Get the blocks.
/// </summary> /// </summary>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param> /// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns> /// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words) public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{ {
return GetBlocks(words, new RecursiveXYCutOptions()); if (words?.Any() != true)
}
/// <summary>
/// Get the blocks using options values.
/// </summary>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="options">The <see cref="RecursiveXYCutOptions"/> to use.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
{
if (options is RecursiveXYCutOptions ryxcOptions)
{ {
if (words?.Any() != true) return EmptyArray<TextBlock>.Instance;
{ }
return EmptyArray<TextBlock>.Instance;
}
return GetBlocks(words, return GetBlocks(words,
ryxcOptions.MinimumWidth, options.MinimumWidth,
ryxcOptions.DominantFontWidthFunc, options.DominantFontWidthFunc,
ryxcOptions.DominantFontHeightFunc, options.DominantFontHeightFunc,
ryxcOptions.WordSeparator, options.WordSeparator,
ryxcOptions.LineSeparator); options.LineSeparator);
}
else
{
throw new ArgumentException("Options provided must be of type " + nameof(RecursiveXYCutOptions) + ".", nameof(options));
}
} }
/// <summary> /// <summary>
@@ -92,7 +93,7 @@
if (leaves.Count > 0) if (leaves.Count > 0)
{ {
return leaves.Select(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator)).ToList(); return leaves.ConvertAll(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator));
} }
} }
@@ -183,7 +184,11 @@
} }
} }
} }
if (i == wordsCount - 1) projectionProfile.Add(currentProjection);
if (i == wordsCount - 1)
{
projectionProfile.Add(currentProjection);
}
} }
var newLeavesEnums = projectionProfile.Select(p => leaf.Words.Where(w => var newLeavesEnums = projectionProfile.Select(p => leaf.Words.Where(w =>
@@ -271,7 +276,11 @@
} }
} }
} }
if (i == wordsCount - 1) projectionProfile.Add(currentProjection);
if (i == wordsCount - 1)
{
projectionProfile.Add(currentProjection);
}
} }
if (projectionProfile.Count == 1) if (projectionProfile.Count == 1)
@@ -329,8 +338,26 @@
/// <summary> /// <summary>
/// Recursive X-Y cut page segmenter options. /// Recursive X-Y cut page segmenter options.
/// </summary> /// </summary>
public class RecursiveXYCutOptions : PageSegmenterOptions public class RecursiveXYCutOptions : IPageSegmenterOptions
{ {
/// <summary>
/// <inheritdoc/>
/// Default value is -1.
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
/// <summary>
/// <inheritdoc/>
/// <para>Default value is ' ' (space).</para>
/// </summary>
public string WordSeparator { get; set; } = " ";
/// <summary>
/// <inheritdoc/>
/// <para>Default value is '\n' (new line).</para>
/// </summary>
public string LineSeparator { get; set; } = "\n";
/// <summary> /// <summary>
/// The minimum width for a block. /// The minimum width for a block.
/// <para>Default value is 1.</para> /// <para>Default value is 1.</para>

View File

@@ -31,7 +31,6 @@
public XYNode(params XYNode[] children) public XYNode(params XYNode[] children)
: this(children?.ToList()) : this(children?.ToList())
{ {
} }
/// <summary> /// <summary>
@@ -40,12 +39,12 @@
/// <param name="children">The node's children.</param> /// <param name="children">The node's children.</param>
public XYNode(IEnumerable<XYNode> children) public XYNode(IEnumerable<XYNode> children)
{ {
if (children != null && children.Count() != 0) if (children?.Any() == true)
{ {
Children = children.ToArray(); Children = children.ToArray();
BoundingBox = new PdfRectangle(children.Min(b => b.BoundingBox.Left), BoundingBox = new PdfRectangle(children.Min(b => b.BoundingBox.Left),
children.Min(b => b.BoundingBox.Bottom), children.Min(b => b.BoundingBox.Bottom),
children.Max(b => b.BoundingBox.Right), children.Max(b => b.BoundingBox.Right),
children.Max(b => b.BoundingBox.Top)); children.Max(b => b.BoundingBox.Top));
} }
else else
@@ -87,7 +86,11 @@
private void RecursiveCount(IEnumerable<XYNode> children, ref int count) private void RecursiveCount(IEnumerable<XYNode> children, ref int count)
{ {
if (children.Count() == 0) return; if (!children.Any())
{
return;
}
foreach (XYNode node in children.Where(x => x.IsLeaf)) foreach (XYNode node in children.Where(x => x.IsLeaf))
{ {
count += node.CountWords(); count += node.CountWords();
@@ -101,7 +104,11 @@
private void RecursiveGetLeaves(IEnumerable<XYNode> children, ref List<XYLeaf> leaves, int level) private void RecursiveGetLeaves(IEnumerable<XYNode> children, ref List<XYLeaf> leaves, int level)
{ {
if (children.Count() == 0) return; if (!children.Any())
{
return;
}
bool isVerticalCut = level % 2 == 0; bool isVerticalCut = level % 2 == 0;
foreach (XYLeaf node in children.Where(x => x.IsLeaf)) foreach (XYLeaf node in children.Where(x => x.IsLeaf))

View File

@@ -0,0 +1,9 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor
{
/// <summary>
/// Word extractor options interface.
/// </summary>
/// <remarks>
/// Declares no members of its own: it only extends <see cref="IDlaOptions"/>,
/// giving word extractor option classes a common, more specific type to implement.
/// </remarks>
public interface IWordExtractorOptions : IDlaOptions
{
}
}

View File

@@ -13,77 +13,78 @@
/// </summary> /// </summary>
public class NearestNeighbourWordExtractor : IWordExtractor public class NearestNeighbourWordExtractor : IWordExtractor
{ {
private readonly NearestNeighbourWordExtractorOptions options;
/// <summary> /// <summary>
/// Create an instance of Nearest Neighbour Word Extractor, <see cref="NearestNeighbourWordExtractor"/>. /// Create an instance of Nearest Neighbour Word Extractor, <see cref="NearestNeighbourWordExtractor"/>.
/// </summary> /// </summary>
public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor(); public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
/// <summary> /// <summary>
/// Get the words using default options values. /// Create an instance of Nearest Neighbour Word Extractor using default options values.
/// </summary>
public NearestNeighbourWordExtractor() : this(new NearestNeighbourWordExtractorOptions())
{
}
/// <summary>
/// Create an instance of Nearest Neighbour Word Extractor using options values.
/// </summary>
/// <param name="options">The <see cref="NearestNeighbourWordExtractorOptions"/> to use.</param>
/// <exception cref="ArgumentNullException"></exception>
public NearestNeighbourWordExtractor(NearestNeighbourWordExtractorOptions options)
{
this.options = options ?? throw new ArgumentNullException(nameof(options));
}
/// <summary>
/// Get the words.
/// </summary> /// </summary>
/// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param> /// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param>
/// <returns>The <see cref="Word"/>s generated by the nearest neighbour method.</returns> /// <returns>The <see cref="Word"/>s generated by the nearest neighbour method.</returns>
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters) public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
{ {
return GetWords(letters, new NearestNeighbourWordExtractorOptions()); if (letters == null || letters.Count == 0)
}
/// <summary>
/// Get the words using options values.
/// </summary>
/// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param>
/// <param name="options">The <see cref="NearestNeighbourWordExtractorOptions"/> to use.</param>
/// <returns>The <see cref="Word"/>s generated by the nearest neighbour method.</returns>
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters, DlaOptions options)
{
if (options is NearestNeighbourWordExtractorOptions nnOptions)
{ {
if (letters == null || letters.Count == 0) return EmptyArray<Word>.Instance;
{ }
return EmptyArray<Word>.Instance;
}
if (nnOptions.GroupByOrientation) if (options.GroupByOrientation)
{ {
// axis aligned // axis aligned
List<Word> words = GetWords( List<Word> words = GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(), letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot, options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism); options.Filter, options.MaxDegreeOfParallelism);
words.AddRange(GetWords( words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(), letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot, options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism)); options.Filter, options.MaxDegreeOfParallelism));
words.AddRange(GetWords( words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(), letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot, options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism)); options.Filter, options.MaxDegreeOfParallelism));
words.AddRange(GetWords( words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(), letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot, options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism)); options.Filter, options.MaxDegreeOfParallelism));
// not axis aligned // not axis aligned
words.AddRange(GetWords( words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(), letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot, options.MaximumDistance, options.DistanceMeasure, options.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism)); options.Filter, options.MaxDegreeOfParallelism));
return words; return words;
}
else
{
return GetWords(letters,
nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism);
}
} }
else else
{ {
throw new ArgumentException("Options provided must be of type " + nameof(NearestNeighbourWordExtractorOptions) + ".", nameof(options)); return GetWords(letters,
options.MaximumDistance, options.DistanceMeasure, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism);
} }
} }
@@ -107,7 +108,10 @@
Func<Letter, bool> filterPivotFunction, Func<Letter, bool> filterPivotFunction,
Func<Letter, Letter, bool> filterFunction, int maxDegreeOfParallelism) Func<Letter, Letter, bool> filterFunction, int maxDegreeOfParallelism)
{ {
if (letters == null || letters.Count == 0) return new List<Word>(); if (letters == null || letters.Count == 0)
{
return new List<Word>();
}
var groupedLetters = Clustering.NearestNeighbours(letters, var groupedLetters = Clustering.NearestNeighbours(letters,
distMeasure, maxDistanceFunction, distMeasure, maxDistanceFunction,
@@ -128,11 +132,17 @@
/// <summary> /// <summary>
/// Nearest neighbour word extractor options. /// Nearest neighbour word extractor options.
/// </summary> /// </summary>
public class NearestNeighbourWordExtractorOptions : DlaOptions public class NearestNeighbourWordExtractorOptions : IWordExtractorOptions
{ {
/// <summary>
/// <inheritdoc/>
/// Default value is -1.
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
/// <summary> /// <summary>
/// The maximum distance between two letters (start and end base line points) within the same word, as a function of the two letters. /// The maximum distance between two letters (start and end base line points) within the same word, as a function of the two letters.
/// If the distance between the two letters is greater than this maximum, they will belong to different words. /// <para>If the distance between the two letters is greater than this maximum, they will belong to different words.</para>
/// <para>Default value is 20% of the Max(Width, PointSize) of both letters. If <see cref="TextOrientation"/> is Other, this distance is doubled.</para> /// <para>Default value is 20% of the Max(Width, PointSize) of both letters. If <see cref="TextOrientation"/> is Other, this distance is doubled.</para>
/// </summary> /// </summary>
public Func<Letter, Letter, double> MaximumDistance { get; set; } = (l1, l2) => public Func<Letter, Letter, double> MaximumDistance { get; set; } = (l1, l2) =>
@@ -159,15 +169,15 @@
/// <summary> /// <summary>
/// The distance measure used between two letters (start and end base line points) with axis aligned <see cref="TextOrientation"/>. /// The distance measure used between two letters (start and end base line points) with axis aligned <see cref="TextOrientation"/>.
/// Only used if GroupByOrientation is set to true. /// <para>Only used if <see cref="GroupByOrientation"/> is set to <c>true</c>.</para>
/// <para>Default value is the Manhattan distance.</para> /// <para>Default value is the Manhattan distance.</para>
/// </summary> /// </summary>
public Func<PdfPoint, PdfPoint, double> DistanceMeasureAA { get; set; } = Distances.Manhattan; public Func<PdfPoint, PdfPoint, double> DistanceMeasureAA { get; set; } = Distances.Manhattan;
/// <summary> /// <summary>
/// Function used to filter out connection between letters, e.g. check if the letters have the same color. /// Function used to filter out connection between letters, e.g. check if the letters have the same color.
/// If the function returns false, letters will belong to different words. /// If the function returns <c>false</c>, letters will belong to different words.
/// <para>Default value checks whether the neighbour is a white space or not. If it is the case, it returns false.</para> /// <para>Default value checks whether the neighbour is a white space or not. If it is the case, it returns <c>false</c>.</para>
/// </summary> /// </summary>
public Func<Letter, Letter, bool> Filter { get; set; } = (_, l2) => !string.IsNullOrWhiteSpace(l2.Value); public Func<Letter, Letter, bool> Filter { get; set; } = (_, l2) => !string.IsNullOrWhiteSpace(l2.Value);
@@ -178,9 +188,9 @@
public Func<Letter, bool> FilterPivot { get; set; } = l => !string.IsNullOrWhiteSpace(l.Value); public Func<Letter, bool> FilterPivot { get; set; } = l => !string.IsNullOrWhiteSpace(l.Value);
/// <summary> /// <summary>
/// If true, letters will be grouped by <see cref="TextOrientation"/> before processing. /// If <c>true</c>, letters will be grouped by <see cref="TextOrientation"/> before processing.
/// The DistanceMeasureAA will be used on axis aligned letters, and the DistanceMeasure on others. /// The <see cref="DistanceMeasureAA"/> will be used on axis aligned letters, and the <see cref="DistanceMeasure"/> on others.
/// If false, DistanceMeasure will be used for all letters and DistanceMeasureAA won't be used. /// <para>If <c>false</c>, <see cref="DistanceMeasure"/> will be used for all letters, and <see cref="DistanceMeasureAA"/> won't be used.</para>
/// <para>Default value is true.</para> /// <para>Default value is true.</para>
/// </summary> /// </summary>
public bool GroupByOrientation { get; set; } = true; public bool GroupByOrientation { get; set; } = true;

View File

@@ -88,7 +88,7 @@
{ {
var page = document.GetPage(1); var page = document.GetPage(1);
var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters); var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);
var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(words, options); var blocks = new DocstrumBoundingBoxes(options).GetBlocks(words);
Assert.Equal(expected.Length, blocks.Count); Assert.Equal(expected.Length, blocks.Count);
var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X) var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X)

View File

@@ -40,7 +40,7 @@
var page = document.GetPage(1); var page = document.GetPage(1);
var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters); var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);
var options = new RecursiveXYCut.RecursiveXYCutOptions() { MinimumWidth = page.Width / 3.0, LineSeparator = " " }; var options = new RecursiveXYCut.RecursiveXYCutOptions() { MinimumWidth = page.Width / 3.0, LineSeparator = " " };
var blocks = RecursiveXYCut.Instance.GetBlocks(words, options); var blocks = new RecursiveXYCut(options).GetBlocks(words);
Assert.Equal(expected.Length, blocks.Count); Assert.Equal(expected.Length, blocks.Count);
var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X) var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X)

View File

@@ -43,7 +43,7 @@
DefaultWordExtractor.Instance, DefaultWordExtractor.Instance,
RecursiveXYCut.Instance, RecursiveXYCut.Instance,
UnsupervisedReadingOrderDetector.Instance); UnsupervisedReadingOrderDetector.Instance);
var xml = GetXml(pageXmlTextExporter); var xml = GetXml(pageXmlTextExporter);
Assert.Contains("<ReadingOrder>", xml); Assert.Contains("<ReadingOrder>", xml);
Assert.Contains("</OrderedGroup>", xml); Assert.Contains("</OrderedGroup>", xml);
@@ -69,7 +69,7 @@
public void ContainsExpectedText() public void ContainsExpectedText()
{ {
var xml = GetXml(); var xml = GetXml();
Assert.Contains(@"2006 Swedish Touring Car Championship", xml); Assert.Contains("2006 Swedish Touring Car Championship", xml);
// the coords for that text // the coords for that text
Assert.Contains(@"<Coords points=""35,77 35,62 397,62 397,77"" />", xml); Assert.Contains(@"<Coords points=""35,77 35,62 397,62 397,77"" />", xml);
} }

View File

@@ -79,10 +79,10 @@
public int NumberOfImages => Content.NumberOfImages; public int NumberOfImages => Content.NumberOfImages;
/// <summary> /// <summary>
/// The parsed graphics state operations in the content stream for this page. /// The parsed graphics state operations in the content stream for this page.
/// </summary> /// </summary>
public IReadOnlyList<IGraphicsStateOperation> Operations => Content.GraphicsStateOperations; public IReadOnlyList<IGraphicsStateOperation> Operations => Content.GraphicsStateOperations;
/// <summary> /// <summary>
/// Access to members whose future locations within the API will change without warning. /// Access to members whose future locations within the API will change without warning.
/// </summary> /// </summary>
@@ -97,7 +97,7 @@
{ {
throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative."); throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative.");
} }
Dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary)); Dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary));
Number = number; Number = number;