Make DlaOptions an interface, add IWordExtractorOptions, remove GetBlocks(words, options), GetWords(letters, options) and put options in constructors - Fix #424. Tidy up code

This commit is contained in:
BobLD
2022-03-12 13:23:40 +00:00
parent b0a5f4c8d0
commit 05aba1cfe7
17 changed files with 307 additions and 213 deletions

View File

@@ -2,7 +2,6 @@
{
using Alto;
using Content;
using Core;
using DocumentLayoutAnalysis;
using System;
using System.Globalization;
@@ -147,7 +146,7 @@
altoPage.PrintSpace.TextBlock = blocks;
altoPage.PrintSpace.Illustrations = page.GetImages().Select(i => ToAltoIllustration(i, page.Height)).ToArray();
if (includePaths)
{
altoPage.PrintSpace.GraphicalElements = page.ExperimentalAccess.Paths
@@ -288,7 +287,7 @@
ProcessingSoftware = new AltoDocument.AltoProcessingSoftware
{
SoftwareName = "PdfPig",
SoftwareCreator = @"https://github.com/UglyToad/PdfPig",
SoftwareCreator = "https://github.com/UglyToad/PdfPig",
ApplicationDescription = "Read and extract text and other content from PDFs in C# (port of PdfBox)",
SoftwareVersion = "x.x.xx"
},

View File

@@ -49,7 +49,7 @@
/// </summary>
/// <param name="document">The document.</param>
/// <param name="includePaths">Draw PdfPaths present in the page.</param>
/// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the
/// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the
/// interface to a plain hOCR file.<para>See https://github.com/kba/hocrjs for more information.</para></param>
public string Get(PdfDocument document, bool includePaths = false, bool useHocrjs = false)
{
@@ -61,10 +61,13 @@
hocr += GetCode(page, includePaths) + "\n";
}
if (useHocrjs) hocr += indentChar + indentChar + Hocrjs;
if (useHocrjs)
{
hocr += indentChar + indentChar + Hocrjs;
}
hocr += indentChar + "</body>";
hocr = XmlHeader + AddHtmlHeader(hocr);
return hocr;
return XmlHeader + AddHtmlHeader(hocr);
}
/// <summary>
@@ -80,8 +83,8 @@
/// Get the hOCR (HTML) string of the page layout.
/// </summary>
/// <param name="page">The page.</param>
/// <param name="imageName">The image name, if any.</param>
/// <param name="includePaths">Draw PdfPaths present in the page.</param>
/// <param name="imageName">The image name, if any.</param>
/// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the interface to a plain hOCR file.<para>See https://github.com/kba/hocrjs for more information.</para></param>
public string Get(Page page, bool includePaths = false, string imageName = "unknown", bool useHocrjs = false)
{
@@ -89,10 +92,13 @@
hocr += GetCode(page, includePaths, imageName) + "\n";
if (useHocrjs) hocr += indentChar + indentChar + Hocrjs;
if (useHocrjs)
{
hocr += indentChar + indentChar + Hocrjs;
}
hocr += indentChar + "</body>";
hocr = XmlHeader + AddHtmlHeader(hocr);
return hocr;
return XmlHeader + AddHtmlHeader(hocr);
}
private string GetHead()
@@ -129,14 +135,14 @@
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para>
/// </summary>
/// <param name="page"></param>
/// <param name="imageName"></param>
/// <param name="includePaths">Draw PdfPaths present in the page.</param>
/// <param name="imageName"></param>
private string GetCode(Page page, bool includePaths, string imageName = "unknown")
{
pageCount++;
int level = 2;
string hocr = GetIndent(level) + @"<div class='ocr_page' id='page_" + page.Number.ToString() +
string hocr = GetIndent(level) + "<div class='ocr_page' id='page_" + page.Number.ToString() +
"' title='image \"" + imageName + "\"; bbox 0 0 " +
(int)Math.Round(page.Width * scale) + " " + (int)Math.Round(page.Height * scale) +
"; ppageno " + (page.Number - 1) + "\'>";
@@ -156,16 +162,15 @@
var words = page.GetWords(wordExtractor);
if (words.Count() > 0)
if (words.Any())
{
var blocks = pageSegmenter.GetBlocks(words);
foreach (var block in blocks)
foreach (var block in pageSegmenter.GetBlocks(words))
{
hocr += "\n" + GetCodeArea(block, page.Height, level + 1);
}
}
hocr += "\n" + GetIndent(level) + @"</div>";
hocr += "\n" + GetIndent(level) + "</div>";
return hocr;
}
@@ -179,7 +184,10 @@
/// <param name="level">The indent level.</param>
private string GetCode(PdfPath path, double pageHeight, bool subPaths, int level)
{
if (path == null) return string.Empty;
if (path == null)
{
return string.Empty;
}
string hocr = string.Empty;
@@ -189,7 +197,7 @@
if (bbox.HasValue)
{
areaCount++;
hocr += GetIndent(level) + @"<div class='ocr_carea' id='block_" + pageCount + "_"
hocr += GetIndent(level) + "<div class='ocr_carea' id='block_" + pageCount + "_"
+ areaCount + "' title='" + GetCode(bbox.Value, pageHeight) + "'>\n";
foreach (var subPath in path)
{
@@ -197,11 +205,11 @@
if (subBbox.HasValue)
{
pathCount++;
hocr += GetIndent(level + 1) + @"<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
hocr += GetIndent(level + 1) + "<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
+ pathCount + "' title='" + GetCode(subBbox.Value, pageHeight) + "' />\n";
}
}
hocr += GetIndent(level) + @"</div>";
hocr += GetIndent(level) + "</div>";
}
}
else
@@ -210,7 +218,7 @@
if (bbox.HasValue)
{
pathCount++;
hocr += GetIndent(level) + @"<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
hocr += GetIndent(level) + "<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
+ pathCount + "' title='" + GetCode(bbox.Value, pageHeight) + "' />";
}
}
@@ -222,7 +230,7 @@
{
imageCount++;
var bbox = pdfImage.Bounds;
return GetIndent(level) + @"<span class='ocr_image' id='image_" + pageCount + "_"
return GetIndent(level) + "<span class='ocr_image' id='image_" + pageCount + "_"
+ imageCount + "' title='" + GetCode(bbox, pageHeight) + "' />";
}
@@ -237,12 +245,11 @@
{
areaCount++;
string bbox = GetCode(block.BoundingBox, pageHeight);
string hocr = GetIndent(level) + @"<div class='ocr_carea' id='block_" + pageCount + "_"
string hocr = GetIndent(level) + "<div class='ocr_carea' id='block_" + pageCount + "_"
+ areaCount + "' title='" + GetCode(block.BoundingBox, pageHeight) + "'>";
hocr += GetCodeParagraph(block, pageHeight, level + 1); // we concider 1 area = 1 block. should change in the future
hocr += "\n" + GetIndent(level) + @"</div>";
hocr += "\n" + GetIndent(level) + "</div>";
return hocr;
}
@@ -256,14 +263,14 @@
private string GetCodeParagraph(TextBlock block, double pageHeight, int level)
{
paraCount++;
string hocr = "\n" + GetIndent(level) + @"<p class='ocr_par' id='par_" + pageCount + "_"
string hocr = "\n" + GetIndent(level) + "<p class='ocr_par' id='par_" + pageCount + "_"
+ paraCount + "' title='" + GetCode(block.BoundingBox, pageHeight) + "'>"; // lang='eng'
foreach (var line in block.TextLines)
{
hocr += "\n" + GetCode(line, pageHeight, level + 1);
}
hocr += "\n" + GetIndent(level) + @"</p>";
hocr += "\n" + GetIndent(level) + "</p>";
return hocr;
}
@@ -285,14 +292,14 @@
double baseLine = (double)line.Words[0].Letters[0].StartBaseLine.Y;
baseLine = (double)line.BoundingBox.Bottom - baseLine;
string hocr = GetIndent(level) + @"<span class='ocr_line' id='line_" + pageCount + "_" + lineCount + "' title='" +
string hocr = GetIndent(level) + "<span class='ocr_line' id='line_" + pageCount + "_" + lineCount + "' title='" +
GetCode(line.BoundingBox, pageHeight) + "; baseline " + angle + " 0'>"; //"; x_size 42; x_descenders 5; x_ascenders 12' >";
foreach (var word in line.Words)
{
hocr += "\n" + GetCode(word, pageHeight, level + 1);
}
hocr += "\n" + GetIndent(level) + @"</span>";
hocr += "\n" + GetIndent(level) + "</span>";
return hocr;
}
@@ -307,7 +314,7 @@
{
wordCount++;
string hocr = GetIndent(level) +
@"<span class='ocrx_word' id='word_" + pageCount + "_" + wordCount +
"<span class='ocrx_word' id='word_" + pageCount + "_" + wordCount +
"' title='" + GetCode(word.BoundingBox, pageHeight) + "; x_wconf " + GetConfidence(word);
hocr += "; x_font " + word.FontName;
@@ -343,7 +350,7 @@
var right = (int)Math.Round(rectangle.Right * scale);
var bottom = (int)Math.Round((pageHeight - rectangle.Bottom) * scale);
return @"bbox " + (left > 0 ? left : 0) + " "
return "bbox " + (left > 0 ? left : 0) + " "
+ (top > 0 ? top : 0) + " "
+ (right > 0 ? right : 0) + " "
+ (bottom > 0 ? bottom : 0);

View File

@@ -147,7 +147,7 @@
/// <summary>
/// PageXml Text colour in RGB encoded format
/// <para>(red value) + (256 x green value) + (65536 x blue value).</para>
/// <para>(red value) + (256 x green value) + (65536 x blue value).</para>
/// </summary>
private string ToRgbEncoded(IColor color)
{
@@ -184,7 +184,7 @@
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height)));
if (orderedRegions.Any())
if (orderedRegions.Count > 0)
{
pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder()
{
@@ -206,7 +206,7 @@
if (includePaths)
{
var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Width, page.Height));
if (graphicalElements.Where(g => g != null).Count() > 0)
if (graphicalElements.Count(g => g != null) > 0)
{
regions.AddRange(graphicalElements.Where(g => g != null));
}

View File

@@ -53,7 +53,7 @@
builder.Append("</g></svg>");
return builder.ToString();
}
private static string LetterToSvg(Letter l, double height, XmlDocument doc)
{
string fontFamily = GetFontFamily(l.FontName, out string style, out string weight);
@@ -69,7 +69,7 @@
var x = Math.Round(l.StartBaseLine.X, Rounding);
var y = Math.Round(height - l.StartBaseLine.Y, Rounding);
return $"<text x='{x}' y='{y}'{rotation} font-family='{fontFamily}' font-style='{style}' font-weight='{weight}' {fontSize} fill='{ColorToSvg(l.Color)}'>{safeValue}</text>"
return $"<text x='{x}' y='{y}'{rotation} font-family='{fontFamily}' font-style='{style}' font-weight='{weight}' {fontSize} fill='{ColorToSvg(l.Color)}'>{safeValue}</text>"
+ Environment.NewLine;
}
@@ -123,7 +123,11 @@
}
}
if (Fonts.ContainsKey(fontName)) fontName = Fonts[fontName];
if (Fonts.ContainsKey(fontName))
{
fontName = Fonts[fontName];
}
return fontName;
}
@@ -136,7 +140,11 @@
private static string ColorToSvg(IColor color)
{
if (color == null) return "";
if (color == null)
{
return string.Empty;
}
var (r, g, b) = color.ToRGBValues();
return $"rgb({Math.Ceiling(r * 255)},{Math.Ceiling(g * 255)},{Math.Ceiling(b * 255)})";
}

View File

@@ -1,16 +1,15 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// Abstract class that stores options that configure the operation of methods of the document layout analysis algorithm.
/// Interface that stores options that configure the operation of methods of the document layout analysis algorithm.
/// </summary>
public abstract class DlaOptions
public interface IDlaOptions
{
/// <summary>
/// Gets or sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para>
/// <para>The interface itself defines no default value; the implementing option classes initialise it to -1.</para>
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
int MaxDegreeOfParallelism { get; set; }
}
}

View File

@@ -11,47 +11,67 @@
/// </summary>
public class DefaultPageSegmenter : IPageSegmenter
{
private readonly DefaultPageSegmenterOptions options;
/// <summary>
/// Create an instance of default page segmenter, <see cref="DefaultPageSegmenter"/>.
/// </summary>
public static DefaultPageSegmenter Instance { get; } = new DefaultPageSegmenter();
/// <summary>
/// Get the blocks using default options values.
/// Create an instance of default page segmenter using default options values.
/// </summary>
/// <param name="words">The page's words to generate text blocks for.</param>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
public DefaultPageSegmenter() : this(new DefaultPageSegmenterOptions())
{
return GetBlocks(words, new DefaultPageSegmenterOptions());
}
/// <summary>
/// Create an instance of default page segmenter using the supplied options values.
/// </summary>
/// <param name="options">The <see cref="DefaultPageSegmenterOptions"/> to use.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="options"/> is <c>null</c>.</exception>
public DefaultPageSegmenter(DefaultPageSegmenterOptions options)
{
this.options = options ?? throw new ArgumentNullException(nameof(options));
}
/// <summary>
/// Get the text blocks using options.
/// </summary>
/// <param name="words">The page's words to generate text blocks for.</param>
/// <param name="options">The <see cref="DefaultPageSegmenterOptions"/> to use.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the default method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{
if (options is DefaultPageSegmenterOptions dOptions)
if (words?.Any() != true)
{
if (words?.Any() != true)
{
return EmptyArray<TextBlock>.Instance;
}
return EmptyArray<TextBlock>.Instance;
}
return new List<TextBlock>() { new TextBlock(new XYLeaf(words).GetLines(dOptions.WordSeparator), dOptions.LineSeparator) };
}
else
{
throw new ArgumentException("Options provided must be of type " + nameof(DefaultPageSegmenterOptions) + ".", nameof(options));
}
return new List<TextBlock>() { new TextBlock(new XYLeaf(words).GetLines(options.WordSeparator), options.LineSeparator) };
}
/// <summary>
/// Default page segmenter options.
/// </summary>
public class DefaultPageSegmenterOptions : PageSegmenterOptions
{ }
public class DefaultPageSegmenterOptions : IPageSegmenterOptions
{
/// <summary>
/// <inheritdoc/>
/// <para>Default value is -1.</para>
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
/// <summary>
/// <inheritdoc/>
/// <para>Default value is ' ' (space).</para>
/// </summary>
public string WordSeparator { get; set; } = " ";
/// <summary>
/// <inheritdoc/>
/// <para>Default value is '\n' (new line).</para>
/// </summary>
public string LineSeparator { get; set; } = "\n";
}
}
}

View File

@@ -17,48 +17,49 @@
/// </summary>
public class DocstrumBoundingBoxes : IPageSegmenter
{
private readonly DocstrumBoundingBoxesOptions options;
/// <summary>
/// Create an instance of Docstrum for bounding boxes page segmenter, <see cref="DocstrumBoundingBoxes"/>.
/// </summary>
public static DocstrumBoundingBoxes Instance { get; } = new DocstrumBoundingBoxes();
/// <summary>
/// Get the blocks using default options values.
/// Create an instance of Docstrum for bounding boxes page segmenter using default options values.
/// </summary>
public DocstrumBoundingBoxes() : this(new DocstrumBoundingBoxesOptions())
{
}
/// <summary>
/// Create an instance of Docstrum for bounding boxes page segmenter using options values.
/// </summary>
/// <param name="options">The <see cref="DocstrumBoundingBoxesOptions"/> to use.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="options"/> is <c>null</c>.</exception>
public DocstrumBoundingBoxes(DocstrumBoundingBoxesOptions options)
{
this.options = options ?? throw new ArgumentNullException(nameof(options));
}
/// <summary>
/// Get the blocks.
/// </summary>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{
return GetBlocks(words, new DocstrumBoundingBoxesOptions());
}
/// <summary>
/// Get the blocks using options values.
/// </summary>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="options">The <see cref="DocstrumBoundingBoxesOptions"/> to use.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
{
if (options is DocstrumBoundingBoxesOptions dbbOptions)
if (words?.Any() != true)
{
if (words?.Any() != true)
{
return EmptyArray<TextBlock>.Instance;
}
return EmptyArray<TextBlock>.Instance;
}
return GetBlocks(words.ToList(),
dbbOptions.WithinLineBounds, dbbOptions.WithinLineMultiplier, dbbOptions.WithinLineBinSize,
dbbOptions.BetweenLineBounds, dbbOptions.BetweenLineMultiplier, dbbOptions.BetweenLineBinSize,
dbbOptions.AngularDifferenceBounds,
dbbOptions.Epsilon,
dbbOptions.WordSeparator, dbbOptions.LineSeparator,
dbbOptions.MaxDegreeOfParallelism);
}
else
{
throw new ArgumentException("Options provided must be of type " + nameof(DocstrumBoundingBoxesOptions) + ".", nameof(options));
}
return GetBlocks(words.ToList(),
options.WithinLineBounds, options.WithinLineMultiplier, options.WithinLineBinSize,
options.BetweenLineBounds, options.BetweenLineMultiplier, options.BetweenLineBinSize,
options.AngularDifferenceBounds,
options.Epsilon,
options.WordSeparator, options.LineSeparator,
options.MaxDegreeOfParallelism);
}
/// <summary>
@@ -612,8 +613,26 @@
/// <summary>
/// Docstrum bounding boxes page segmenter options.
/// </summary>
public class DocstrumBoundingBoxesOptions : PageSegmenterOptions
public class DocstrumBoundingBoxesOptions : IPageSegmenterOptions
{
/// <summary>
/// <inheritdoc/>
/// <para>Default value is -1.</para>
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
/// <summary>
/// <inheritdoc/>
/// <para>Default value is ' ' (space).</para>
/// </summary>
public string WordSeparator { get; set; } = " ";
/// <summary>
/// <inheritdoc/>
/// <para>Default value is '\n' (new line).</para>
/// </summary>
public string LineSeparator { get; set; } = "\n";
/// <summary>
/// Precision when testing equalities.
/// <para>Default value is 1e-3.</para>
@@ -640,7 +659,6 @@
/// </summary>
public int WithinLineBinSize { get; set; } = 10;
/// <summary>
/// Angle bounds for words to be considered as neighbours on separate lines.
/// <para>Default value is 45 ≤ θ ≤ 135.</para>

View File

@@ -10,18 +10,10 @@
public interface IPageSegmenter
{
/// <summary>
/// Get the blocks using default options values.
/// Get the blocks.
/// </summary>
/// <param name="words">The page's words to generate text blocks for.</param>
/// <returns>A list of text blocks from this approach.</returns>
IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words);
/// <summary>
/// Get the text blocks using options.
/// </summary>
/// <param name="words">The page's words to generate text blocks for.</param>
/// <param name="options"></param>
/// <returns>A list of text blocks from this approach.</returns>
IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options);
}
}

View File

@@ -1,20 +1,18 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
{
/// <summary>
/// Abstract page segmenter options.
/// Page segmenter options interface.
/// </summary>
public abstract class PageSegmenterOptions : DlaOptions
public interface IPageSegmenterOptions : IDlaOptions
{
/// <summary>
/// Separator used between words when building lines.
/// <para>Default value is ' ' (space).</para>
/// </summary>
public string WordSeparator { get; set; } = " ";
string WordSeparator { get; set; }
/// <summary>
/// Separator used between lines when building paragraphs.
/// <para>Default value is '\n' (new line).</para>
/// </summary>
public string LineSeparator { get; set; } = "\n";
string LineSeparator { get; set; }
}
}

View File

@@ -15,47 +15,48 @@
/// </summary>
public class RecursiveXYCut : IPageSegmenter
{
private readonly RecursiveXYCutOptions options;
/// <summary>
/// Create an instance of Recursive X-Y Cut page segmenter, <see cref="RecursiveXYCut"/>.
/// </summary>
public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();
/// <summary>
/// Get the blocks using default options values.
/// Create an instance of Recursive X-Y Cut page segmenter using default options values.
/// </summary>
public RecursiveXYCut() : this(new RecursiveXYCutOptions())
{
}
/// <summary>
/// Create an instance of Recursive X-Y Cut page segmenter using options values.
/// </summary>
/// <param name="options">The <see cref="RecursiveXYCutOptions"/> to use.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="options"/> is <c>null</c>.</exception>
public RecursiveXYCut(RecursiveXYCutOptions options)
{
this.options = options ?? throw new ArgumentNullException(nameof(options));
}
/// <summary>
/// Get the blocks.
/// </summary>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{
return GetBlocks(words, new RecursiveXYCutOptions());
}
/// <summary>
/// Get the blocks using options values.
/// </summary>
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="options">The <see cref="RecursiveXYCutOptions"/> to use.</param>
/// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
{
if (options is RecursiveXYCutOptions ryxcOptions)
if (words?.Any() != true)
{
if (words?.Any() != true)
{
return EmptyArray<TextBlock>.Instance;
}
return EmptyArray<TextBlock>.Instance;
}
return GetBlocks(words,
ryxcOptions.MinimumWidth,
ryxcOptions.DominantFontWidthFunc,
ryxcOptions.DominantFontHeightFunc,
ryxcOptions.WordSeparator,
ryxcOptions.LineSeparator);
}
else
{
throw new ArgumentException("Options provided must be of type " + nameof(RecursiveXYCutOptions) + ".", nameof(options));
}
return GetBlocks(words,
options.MinimumWidth,
options.DominantFontWidthFunc,
options.DominantFontHeightFunc,
options.WordSeparator,
options.LineSeparator);
}
/// <summary>
@@ -92,7 +93,7 @@
if (leaves.Count > 0)
{
return leaves.Select(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator)).ToList();
return leaves.ConvertAll(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator));
}
}
@@ -183,7 +184,11 @@
}
}
}
if (i == wordsCount - 1) projectionProfile.Add(currentProjection);
if (i == wordsCount - 1)
{
projectionProfile.Add(currentProjection);
}
}
var newLeavesEnums = projectionProfile.Select(p => leaf.Words.Where(w =>
@@ -271,7 +276,11 @@
}
}
}
if (i == wordsCount - 1) projectionProfile.Add(currentProjection);
if (i == wordsCount - 1)
{
projectionProfile.Add(currentProjection);
}
}
if (projectionProfile.Count == 1)
@@ -329,8 +338,26 @@
/// <summary>
/// Recursive X-Y cut page segmenter options.
/// </summary>
public class RecursiveXYCutOptions : PageSegmenterOptions
public class RecursiveXYCutOptions : IPageSegmenterOptions
{
/// <summary>
/// <inheritdoc/>
/// <para>Default value is -1.</para>
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
/// <summary>
/// <inheritdoc/>
/// <para>Default value is ' ' (space).</para>
/// </summary>
public string WordSeparator { get; set; } = " ";
/// <summary>
/// <inheritdoc/>
/// <para>Default value is '\n' (new line).</para>
/// </summary>
public string LineSeparator { get; set; } = "\n";
/// <summary>
/// The minimum width for a block.
/// <para>Default value is 1.</para>

View File

@@ -31,7 +31,6 @@
public XYNode(params XYNode[] children)
: this(children?.ToList())
{
}
/// <summary>
@@ -40,12 +39,12 @@
/// <param name="children">The node's children.</param>
public XYNode(IEnumerable<XYNode> children)
{
if (children != null && children.Count() != 0)
if (children?.Any() == true)
{
Children = children.ToArray();
BoundingBox = new PdfRectangle(children.Min(b => b.BoundingBox.Left),
children.Min(b => b.BoundingBox.Bottom),
children.Max(b => b.BoundingBox.Right),
BoundingBox = new PdfRectangle(children.Min(b => b.BoundingBox.Left),
children.Min(b => b.BoundingBox.Bottom),
children.Max(b => b.BoundingBox.Right),
children.Max(b => b.BoundingBox.Top));
}
else
@@ -87,7 +86,11 @@
private void RecursiveCount(IEnumerable<XYNode> children, ref int count)
{
if (children.Count() == 0) return;
if (!children.Any())
{
return;
}
foreach (XYNode node in children.Where(x => x.IsLeaf))
{
count += node.CountWords();
@@ -101,7 +104,11 @@
private void RecursiveGetLeaves(IEnumerable<XYNode> children, ref List<XYLeaf> leaves, int level)
{
if (children.Count() == 0) return;
if (!children.Any())
{
return;
}
bool isVerticalCut = level % 2 == 0;
foreach (XYLeaf node in children.Where(x => x.IsLeaf))

View File

@@ -0,0 +1,9 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor
{
/// <summary>
/// Word extractor options interface.
/// <para>Declares no members of its own; it only extends <see cref="IDlaOptions"/>.</para>
/// </summary>
public interface IWordExtractorOptions : IDlaOptions
{
}
}

View File

@@ -13,77 +13,78 @@
/// </summary>
public class NearestNeighbourWordExtractor : IWordExtractor
{
private readonly NearestNeighbourWordExtractorOptions options;
/// <summary>
/// Create an instance of Nearest Neighbour Word Extractor, <see cref="NearestNeighbourWordExtractor"/>.
/// </summary>
public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
/// <summary>
/// Get the words using default options values.
/// Create an instance of Nearest Neighbour Word Extractor using default options values.
/// </summary>
public NearestNeighbourWordExtractor() : this(new NearestNeighbourWordExtractorOptions())
{
}
/// <summary>
/// Create an instance of Nearest Neighbour Word Extractor using options values.
/// </summary>
/// <param name="options">The <see cref="NearestNeighbourWordExtractorOptions"/> to use.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="options"/> is <c>null</c>.</exception>
public NearestNeighbourWordExtractor(NearestNeighbourWordExtractorOptions options)
{
this.options = options ?? throw new ArgumentNullException(nameof(options));
}
/// <summary>
/// Get the words.
/// </summary>
/// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param>
/// <returns>The <see cref="Word"/>s generated by the nearest neighbour method.</returns>
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
{
return GetWords(letters, new NearestNeighbourWordExtractorOptions());
}
/// <summary>
/// Get the words using options values.
/// </summary>
/// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param>
/// <param name="options">The <see cref="NearestNeighbourWordExtractorOptions"/> to use.</param>
/// <returns>The <see cref="Word"/>s generated by the nearest neighbour method.</returns>
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters, DlaOptions options)
{
if (options is NearestNeighbourWordExtractorOptions nnOptions)
if (letters == null || letters.Count == 0)
{
if (letters == null || letters.Count == 0)
{
return EmptyArray<Word>.Instance;
}
return EmptyArray<Word>.Instance;
}
if (nnOptions.GroupByOrientation)
{
// axis aligned
List<Word> words = GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism);
if (options.GroupByOrientation)
{
// axis aligned
List<Word> words = GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism);
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
// not axis aligned
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
// not axis aligned
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
options.MaximumDistance, options.DistanceMeasure, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
return words;
}
else
{
return GetWords(letters,
nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot,
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism);
}
return words;
}
else
{
throw new ArgumentException("Options provided must be of type " + nameof(NearestNeighbourWordExtractorOptions) + ".", nameof(options));
return GetWords(letters,
options.MaximumDistance, options.DistanceMeasure, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism);
}
}
@@ -107,7 +108,10 @@
Func<Letter, bool> filterPivotFunction,
Func<Letter, Letter, bool> filterFunction, int maxDegreeOfParallelism)
{
if (letters == null || letters.Count == 0) return new List<Word>();
if (letters == null || letters.Count == 0)
{
return new List<Word>();
}
var groupedLetters = Clustering.NearestNeighbours(letters,
distMeasure, maxDistanceFunction,
@@ -128,11 +132,17 @@
/// <summary>
/// Nearest neighbour word extractor options.
/// </summary>
public class NearestNeighbourWordExtractorOptions : DlaOptions
public class NearestNeighbourWordExtractorOptions : IWordExtractorOptions
{
/// <summary>
/// <inheritdoc/>
/// <para>Default value is -1.</para>
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = -1;
/// <summary>
/// The maximum distance between two letters (start and end base line points) within the same word, as a function of the two letters.
/// If the distance between the two letters is greater than this maximum, they will belong to different words.
/// <para>If the distance between the two letters is greater than this maximum, they will belong to different words.</para>
/// <para>Default value is 20% of the Max(Width, PointSize) of both letters. If <see cref="TextOrientation"/> is Other, this distance is doubled.</para>
/// </summary>
public Func<Letter, Letter, double> MaximumDistance { get; set; } = (l1, l2) =>
@@ -159,15 +169,15 @@
/// <summary>
/// The distance measure used between two letters (start and end base line points) with axis aligned <see cref="TextOrientation"/>.
/// Only used if GroupByOrientation is set to true.
/// <para>Only used if <see cref="GroupByOrientation"/> is set to <c>true</c>.</para>
/// <para>Default value is the Manhattan distance.</para>
/// </summary>
public Func<PdfPoint, PdfPoint, double> DistanceMeasureAA { get; set; } = Distances.Manhattan;
/// <summary>
/// Function used to filter out connection between letters, e.g. check if the letters have the same color.
/// If the function returns false, letters will belong to different words.
/// <para>Default value checks whether the neighbour is a white space or not. If it is the case, it returns false.</para>
/// If the function returns <c>false</c>, letters will belong to different words.
/// <para>Default value checks whether the neighbour is a white space or not. If it is the case, it returns <c>false</c>.</para>
/// </summary>
public Func<Letter, Letter, bool> Filter { get; set; } = (_, l2) => !string.IsNullOrWhiteSpace(l2.Value);
@@ -178,9 +188,9 @@
public Func<Letter, bool> FilterPivot { get; set; } = l => !string.IsNullOrWhiteSpace(l.Value);
/// <summary>
/// If true, letters will be grouped by <see cref="TextOrientation"/> before processing.
/// The DistanceMeasureAA will be used on axis aligned letters, and the DistanceMeasure on others.
/// If false, DistanceMeasure will be used for all letters and DistanceMeasureAA won't be used.
/// If <c>true</c>, letters will be grouped by <see cref="TextOrientation"/> before processing.
/// The <see cref="DistanceMeasureAA"/> will be used on axis aligned letters, and the <see cref="DistanceMeasure"/> on others.
/// <para>If <c>false</c>, <see cref="DistanceMeasure"/> will be used for all letters, and <see cref="DistanceMeasureAA"/> won't be used.</para>
/// <para>Default value is true.</para>
/// </summary>
public bool GroupByOrientation { get; set; } = true;

View File

@@ -88,7 +88,7 @@
{
var page = document.GetPage(1);
var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);
var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(words, options);
var blocks = new DocstrumBoundingBoxes(options).GetBlocks(words);
Assert.Equal(expected.Length, blocks.Count);
var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X)

View File

@@ -40,7 +40,7 @@
var page = document.GetPage(1);
var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);
var options = new RecursiveXYCut.RecursiveXYCutOptions() { MinimumWidth = page.Width / 3.0, LineSeparator = " " };
var blocks = RecursiveXYCut.Instance.GetBlocks(words, options);
var blocks = new RecursiveXYCut(options).GetBlocks(words);
Assert.Equal(expected.Length, blocks.Count);
var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X)

View File

@@ -43,7 +43,7 @@
DefaultWordExtractor.Instance,
RecursiveXYCut.Instance,
UnsupervisedReadingOrderDetector.Instance);
var xml = GetXml(pageXmlTextExporter);
var xml = GetXml(pageXmlTextExporter);
Assert.Contains("<ReadingOrder>", xml);
Assert.Contains("</OrderedGroup>", xml);
@@ -69,7 +69,7 @@
public void ContainsExpectedText()
{
var xml = GetXml();
Assert.Contains(@"2006 Swedish Touring Car Championship", xml);
Assert.Contains("2006 Swedish Touring Car Championship", xml);
// the coords for that text
Assert.Contains(@"<Coords points=""35,77 35,62 397,62 397,77"" />", xml);
}

View File

@@ -79,10 +79,10 @@
public int NumberOfImages => Content.NumberOfImages;
/// <summary>
/// The parsed graphics state operations in the content stream for this page.
/// The parsed graphics state operations in the content stream for this page.
/// </summary>
public IReadOnlyList<IGraphicsStateOperation> Operations => Content.GraphicsStateOperations;
/// <summary>
/// Access to members whose future locations within the API will change without warning.
/// </summary>
@@ -97,7 +97,7 @@
{
throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative.");
}
Dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary));
Number = number;