mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-01-18 19:51:24 +08:00
Make DlaOptions an interface (IDlaOptions), add IWordExtractorOptions, remove the GetBlocks(words, options) and GetWords(letters, options) overloads, and pass options through constructors instead - Fix #424. Tidy up code
This commit is contained in:
@@ -2,7 +2,6 @@
|
||||
{
|
||||
using Alto;
|
||||
using Content;
|
||||
using Core;
|
||||
using DocumentLayoutAnalysis;
|
||||
using System;
|
||||
using System.Globalization;
|
||||
@@ -147,7 +146,7 @@
|
||||
altoPage.PrintSpace.TextBlock = blocks;
|
||||
|
||||
altoPage.PrintSpace.Illustrations = page.GetImages().Select(i => ToAltoIllustration(i, page.Height)).ToArray();
|
||||
|
||||
|
||||
if (includePaths)
|
||||
{
|
||||
altoPage.PrintSpace.GraphicalElements = page.ExperimentalAccess.Paths
|
||||
@@ -288,7 +287,7 @@
|
||||
ProcessingSoftware = new AltoDocument.AltoProcessingSoftware
|
||||
{
|
||||
SoftwareName = "PdfPig",
|
||||
SoftwareCreator = @"https://github.com/UglyToad/PdfPig",
|
||||
SoftwareCreator = "https://github.com/UglyToad/PdfPig",
|
||||
ApplicationDescription = "Read and extract text and other content from PDFs in C# (port of PdfBox)",
|
||||
SoftwareVersion = "x.x.xx"
|
||||
},
|
||||
|
||||
@@ -49,7 +49,7 @@
|
||||
/// </summary>
|
||||
/// <param name="document">The document.</param>
|
||||
/// <param name="includePaths">Draw PdfPaths present in the page.</param>
|
||||
/// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the
|
||||
/// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the
|
||||
/// interface to a plain hOCR file.<para>See https://github.com/kba/hocrjs for more information.</para></param>
|
||||
public string Get(PdfDocument document, bool includePaths = false, bool useHocrjs = false)
|
||||
{
|
||||
@@ -61,10 +61,13 @@
|
||||
hocr += GetCode(page, includePaths) + "\n";
|
||||
}
|
||||
|
||||
if (useHocrjs) hocr += indentChar + indentChar + Hocrjs;
|
||||
if (useHocrjs)
|
||||
{
|
||||
hocr += indentChar + indentChar + Hocrjs;
|
||||
}
|
||||
|
||||
hocr += indentChar + "</body>";
|
||||
hocr = XmlHeader + AddHtmlHeader(hocr);
|
||||
return hocr;
|
||||
return XmlHeader + AddHtmlHeader(hocr);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -80,8 +83,8 @@
|
||||
/// Get the hOCR (HTML) string of the page layout.
|
||||
/// </summary>
|
||||
/// <param name="page">The page.</param>
|
||||
/// <param name="imageName">The image name, if any.</param>
|
||||
/// <param name="includePaths">Draw PdfPaths present in the page.</param>
|
||||
/// <param name="imageName">The image name, if any.</param>
|
||||
/// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the interface to a plain hOCR file.<para>See https://github.com/kba/hocrjs for more information.</para></param>
|
||||
public string Get(Page page, bool includePaths = false, string imageName = "unknown", bool useHocrjs = false)
|
||||
{
|
||||
@@ -89,10 +92,13 @@
|
||||
|
||||
hocr += GetCode(page, includePaths, imageName) + "\n";
|
||||
|
||||
if (useHocrjs) hocr += indentChar + indentChar + Hocrjs;
|
||||
if (useHocrjs)
|
||||
{
|
||||
hocr += indentChar + indentChar + Hocrjs;
|
||||
}
|
||||
|
||||
hocr += indentChar + "</body>";
|
||||
hocr = XmlHeader + AddHtmlHeader(hocr);
|
||||
return hocr;
|
||||
return XmlHeader + AddHtmlHeader(hocr);
|
||||
}
|
||||
|
||||
private string GetHead()
|
||||
@@ -129,14 +135,14 @@
|
||||
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para>
|
||||
/// </summary>
|
||||
/// <param name="page"></param>
|
||||
/// <param name="imageName"></param>
|
||||
/// <param name="includePaths">Draw PdfPaths present in the page.</param>
|
||||
/// <param name="imageName"></param>
|
||||
private string GetCode(Page page, bool includePaths, string imageName = "unknown")
|
||||
{
|
||||
pageCount++;
|
||||
int level = 2;
|
||||
|
||||
string hocr = GetIndent(level) + @"<div class='ocr_page' id='page_" + page.Number.ToString() +
|
||||
string hocr = GetIndent(level) + "<div class='ocr_page' id='page_" + page.Number.ToString() +
|
||||
"' title='image \"" + imageName + "\"; bbox 0 0 " +
|
||||
(int)Math.Round(page.Width * scale) + " " + (int)Math.Round(page.Height * scale) +
|
||||
"; ppageno " + (page.Number - 1) + "\'>";
|
||||
@@ -156,16 +162,15 @@
|
||||
|
||||
var words = page.GetWords(wordExtractor);
|
||||
|
||||
if (words.Count() > 0)
|
||||
if (words.Any())
|
||||
{
|
||||
var blocks = pageSegmenter.GetBlocks(words);
|
||||
foreach (var block in blocks)
|
||||
foreach (var block in pageSegmenter.GetBlocks(words))
|
||||
{
|
||||
hocr += "\n" + GetCodeArea(block, page.Height, level + 1);
|
||||
}
|
||||
}
|
||||
|
||||
hocr += "\n" + GetIndent(level) + @"</div>";
|
||||
hocr += "\n" + GetIndent(level) + "</div>";
|
||||
return hocr;
|
||||
}
|
||||
|
||||
@@ -179,7 +184,10 @@
|
||||
/// <param name="level">The indent level.</param>
|
||||
private string GetCode(PdfPath path, double pageHeight, bool subPaths, int level)
|
||||
{
|
||||
if (path == null) return string.Empty;
|
||||
if (path == null)
|
||||
{
|
||||
return string.Empty;
|
||||
}
|
||||
|
||||
string hocr = string.Empty;
|
||||
|
||||
@@ -189,7 +197,7 @@
|
||||
if (bbox.HasValue)
|
||||
{
|
||||
areaCount++;
|
||||
hocr += GetIndent(level) + @"<div class='ocr_carea' id='block_" + pageCount + "_"
|
||||
hocr += GetIndent(level) + "<div class='ocr_carea' id='block_" + pageCount + "_"
|
||||
+ areaCount + "' title='" + GetCode(bbox.Value, pageHeight) + "'>\n";
|
||||
foreach (var subPath in path)
|
||||
{
|
||||
@@ -197,11 +205,11 @@
|
||||
if (subBbox.HasValue)
|
||||
{
|
||||
pathCount++;
|
||||
hocr += GetIndent(level + 1) + @"<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
|
||||
hocr += GetIndent(level + 1) + "<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
|
||||
+ pathCount + "' title='" + GetCode(subBbox.Value, pageHeight) + "' />\n";
|
||||
}
|
||||
}
|
||||
hocr += GetIndent(level) + @"</div>";
|
||||
hocr += GetIndent(level) + "</div>";
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -210,7 +218,7 @@
|
||||
if (bbox.HasValue)
|
||||
{
|
||||
pathCount++;
|
||||
hocr += GetIndent(level) + @"<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
|
||||
hocr += GetIndent(level) + "<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
|
||||
+ pathCount + "' title='" + GetCode(bbox.Value, pageHeight) + "' />";
|
||||
}
|
||||
}
|
||||
@@ -222,7 +230,7 @@
|
||||
{
|
||||
imageCount++;
|
||||
var bbox = pdfImage.Bounds;
|
||||
return GetIndent(level) + @"<span class='ocr_image' id='image_" + pageCount + "_"
|
||||
return GetIndent(level) + "<span class='ocr_image' id='image_" + pageCount + "_"
|
||||
+ imageCount + "' title='" + GetCode(bbox, pageHeight) + "' />";
|
||||
}
|
||||
|
||||
@@ -237,12 +245,11 @@
|
||||
{
|
||||
areaCount++;
|
||||
|
||||
string bbox = GetCode(block.BoundingBox, pageHeight);
|
||||
string hocr = GetIndent(level) + @"<div class='ocr_carea' id='block_" + pageCount + "_"
|
||||
string hocr = GetIndent(level) + "<div class='ocr_carea' id='block_" + pageCount + "_"
|
||||
+ areaCount + "' title='" + GetCode(block.BoundingBox, pageHeight) + "'>";
|
||||
|
||||
hocr += GetCodeParagraph(block, pageHeight, level + 1); // we consider 1 area = 1 block. should change in the future
|
||||
hocr += "\n" + GetIndent(level) + @"</div>";
|
||||
hocr += "\n" + GetIndent(level) + "</div>";
|
||||
return hocr;
|
||||
}
|
||||
|
||||
@@ -256,14 +263,14 @@
|
||||
private string GetCodeParagraph(TextBlock block, double pageHeight, int level)
|
||||
{
|
||||
paraCount++;
|
||||
string hocr = "\n" + GetIndent(level) + @"<p class='ocr_par' id='par_" + pageCount + "_"
|
||||
string hocr = "\n" + GetIndent(level) + "<p class='ocr_par' id='par_" + pageCount + "_"
|
||||
+ paraCount + "' title='" + GetCode(block.BoundingBox, pageHeight) + "'>"; // lang='eng'
|
||||
|
||||
foreach (var line in block.TextLines)
|
||||
{
|
||||
hocr += "\n" + GetCode(line, pageHeight, level + 1);
|
||||
}
|
||||
hocr += "\n" + GetIndent(level) + @"</p>";
|
||||
hocr += "\n" + GetIndent(level) + "</p>";
|
||||
|
||||
return hocr;
|
||||
}
|
||||
@@ -285,14 +292,14 @@
|
||||
double baseLine = (double)line.Words[0].Letters[0].StartBaseLine.Y;
|
||||
baseLine = (double)line.BoundingBox.Bottom - baseLine;
|
||||
|
||||
string hocr = GetIndent(level) + @"<span class='ocr_line' id='line_" + pageCount + "_" + lineCount + "' title='" +
|
||||
string hocr = GetIndent(level) + "<span class='ocr_line' id='line_" + pageCount + "_" + lineCount + "' title='" +
|
||||
GetCode(line.BoundingBox, pageHeight) + "; baseline " + angle + " 0'>"; //"; x_size 42; x_descenders 5; x_ascenders 12' >";
|
||||
|
||||
foreach (var word in line.Words)
|
||||
{
|
||||
hocr += "\n" + GetCode(word, pageHeight, level + 1);
|
||||
}
|
||||
hocr += "\n" + GetIndent(level) + @"</span>";
|
||||
hocr += "\n" + GetIndent(level) + "</span>";
|
||||
return hocr;
|
||||
}
|
||||
|
||||
@@ -307,7 +314,7 @@
|
||||
{
|
||||
wordCount++;
|
||||
string hocr = GetIndent(level) +
|
||||
@"<span class='ocrx_word' id='word_" + pageCount + "_" + wordCount +
|
||||
"<span class='ocrx_word' id='word_" + pageCount + "_" + wordCount +
|
||||
"' title='" + GetCode(word.BoundingBox, pageHeight) + "; x_wconf " + GetConfidence(word);
|
||||
|
||||
hocr += "; x_font " + word.FontName;
|
||||
@@ -343,7 +350,7 @@
|
||||
var right = (int)Math.Round(rectangle.Right * scale);
|
||||
var bottom = (int)Math.Round((pageHeight - rectangle.Bottom) * scale);
|
||||
|
||||
return @"bbox " + (left > 0 ? left : 0) + " "
|
||||
return "bbox " + (left > 0 ? left : 0) + " "
|
||||
+ (top > 0 ? top : 0) + " "
|
||||
+ (right > 0 ? right : 0) + " "
|
||||
+ (bottom > 0 ? bottom : 0);
|
||||
|
||||
@@ -147,7 +147,7 @@
|
||||
|
||||
/// <summary>
|
||||
/// PageXml Text colour in RGB encoded format
|
||||
/// <para>(red value) + (256 x green value) + (65536 x blue value).</para>
|
||||
/// <para>(red value) + (256 x green value) + (65536 x blue value).</para>
|
||||
/// </summary>
|
||||
private string ToRgbEncoded(IColor color)
|
||||
{
|
||||
@@ -184,7 +184,7 @@
|
||||
|
||||
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height)));
|
||||
|
||||
if (orderedRegions.Any())
|
||||
if (orderedRegions.Count > 0)
|
||||
{
|
||||
pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder()
|
||||
{
|
||||
@@ -206,7 +206,7 @@
|
||||
if (includePaths)
|
||||
{
|
||||
var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Width, page.Height));
|
||||
if (graphicalElements.Where(g => g != null).Count() > 0)
|
||||
if (graphicalElements.Count(g => g != null) > 0)
|
||||
{
|
||||
regions.AddRange(graphicalElements.Where(g => g != null));
|
||||
}
|
||||
|
||||
@@ -53,7 +53,7 @@
|
||||
builder.Append("</g></svg>");
|
||||
return builder.ToString();
|
||||
}
|
||||
|
||||
|
||||
private static string LetterToSvg(Letter l, double height, XmlDocument doc)
|
||||
{
|
||||
string fontFamily = GetFontFamily(l.FontName, out string style, out string weight);
|
||||
@@ -69,7 +69,7 @@
|
||||
var x = Math.Round(l.StartBaseLine.X, Rounding);
|
||||
var y = Math.Round(height - l.StartBaseLine.Y, Rounding);
|
||||
|
||||
return $"<text x='{x}' y='{y}'{rotation} font-family='{fontFamily}' font-style='{style}' font-weight='{weight}' {fontSize} fill='{ColorToSvg(l.Color)}'>{safeValue}</text>"
|
||||
return $"<text x='{x}' y='{y}'{rotation} font-family='{fontFamily}' font-style='{style}' font-weight='{weight}' {fontSize} fill='{ColorToSvg(l.Color)}'>{safeValue}</text>"
|
||||
+ Environment.NewLine;
|
||||
}
|
||||
|
||||
@@ -123,7 +123,11 @@
|
||||
}
|
||||
}
|
||||
|
||||
if (Fonts.ContainsKey(fontName)) fontName = Fonts[fontName];
|
||||
if (Fonts.ContainsKey(fontName))
|
||||
{
|
||||
fontName = Fonts[fontName];
|
||||
}
|
||||
|
||||
return fontName;
|
||||
}
|
||||
|
||||
@@ -136,7 +140,11 @@
|
||||
|
||||
private static string ColorToSvg(IColor color)
|
||||
{
|
||||
if (color == null) return "";
|
||||
if (color == null)
|
||||
{
|
||||
return string.Empty;
|
||||
}
|
||||
|
||||
var (r, g, b) = color.ToRGBValues();
|
||||
return $"rgb({Math.Ceiling(r * 255)},{Math.Ceiling(g * 255)},{Math.Ceiling(b * 255)})";
|
||||
}
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
/// <summary>
|
||||
/// Abstract class that stores options that configure the operation of methods of the document layout analysis algorithm.
|
||||
/// Interface that stores options that configure the operation of methods of the document layout analysis algorithm.
|
||||
/// </summary>
|
||||
public abstract class DlaOptions
|
||||
public interface IDlaOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets or sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para>
|
||||
/// <para>Default value is -1.</para>
|
||||
/// </summary>
|
||||
public int MaxDegreeOfParallelism { get; set; } = -1;
|
||||
int MaxDegreeOfParallelism { get; set; }
|
||||
}
|
||||
}
|
||||
@@ -11,47 +11,67 @@
|
||||
/// </summary>
|
||||
public class DefaultPageSegmenter : IPageSegmenter
|
||||
{
|
||||
private readonly DefaultPageSegmenterOptions options;
|
||||
|
||||
/// <summary>
|
||||
/// Create an instance of default page segmenter, <see cref="DefaultPageSegmenter"/>.
|
||||
/// </summary>
|
||||
public static DefaultPageSegmenter Instance { get; } = new DefaultPageSegmenter();
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks using default options values.
|
||||
/// Create an instance of default page segmenter using default options values.
|
||||
/// </summary>
|
||||
/// <param name="words">The page's words to generate text blocks for.</param>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
|
||||
public DefaultPageSegmenter() : this(new DefaultPageSegmenterOptions())
|
||||
{
|
||||
return GetBlocks(words, new DefaultPageSegmenterOptions());
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create an instance of default page segmenter using options values.
|
||||
/// </summary>
|
||||
/// <param name="options">The <see cref="DefaultPageSegmenterOptions"/> to use.</param>
|
||||
/// <exception cref="ArgumentNullException"></exception>
|
||||
public DefaultPageSegmenter(DefaultPageSegmenterOptions options)
|
||||
{
|
||||
this.options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the text blocks using options.
|
||||
/// </summary>
|
||||
/// <param name="words">The page's words to generate text blocks for.</param>
|
||||
/// <param name="options">The <see cref="DefaultPageSegmenterOptions"/> to use.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the default method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
|
||||
{
|
||||
if (options is DefaultPageSegmenterOptions dOptions)
|
||||
if (words?.Any() != true)
|
||||
{
|
||||
if (words?.Any() != true)
|
||||
{
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
|
||||
return new List<TextBlock>() { new TextBlock(new XYLeaf(words).GetLines(dOptions.WordSeparator), dOptions.LineSeparator) };
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentException("Options provided must be of type " + nameof(DefaultPageSegmenterOptions) + ".", nameof(options));
|
||||
}
|
||||
return new List<TextBlock>() { new TextBlock(new XYLeaf(words).GetLines(options.WordSeparator), options.LineSeparator) };
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Default page segmenter options.
|
||||
/// </summary>
|
||||
public class DefaultPageSegmenterOptions : PageSegmenterOptions
|
||||
{ }
|
||||
public class DefaultPageSegmenterOptions : IPageSegmenterOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// <inheritdoc/>
|
||||
/// Default value is -1.
|
||||
/// </summary>
|
||||
public int MaxDegreeOfParallelism { get; set; } = -1;
|
||||
|
||||
/// <summary>
|
||||
/// <inheritdoc/>
|
||||
/// <para>Default value is ' ' (space).</para>
|
||||
/// </summary>
|
||||
public string WordSeparator { get; set; } = " ";
|
||||
|
||||
/// <summary>
|
||||
/// <inheritdoc/>
|
||||
/// <para>Default value is '\n' (new line).</para>
|
||||
/// </summary>
|
||||
public string LineSeparator { get; set; } = "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -17,48 +17,49 @@
|
||||
/// </summary>
|
||||
public class DocstrumBoundingBoxes : IPageSegmenter
|
||||
{
|
||||
private readonly DocstrumBoundingBoxesOptions options;
|
||||
|
||||
/// <summary>
|
||||
/// Create an instance of Docstrum for bounding boxes page segmenter, <see cref="DocstrumBoundingBoxes"/>.
|
||||
/// </summary>
|
||||
public static DocstrumBoundingBoxes Instance { get; } = new DocstrumBoundingBoxes();
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks using default options values.
|
||||
/// Create an instance of Docstrum for bounding boxes page segmenter using default options values.
|
||||
/// </summary>
|
||||
public DocstrumBoundingBoxes() : this(new DocstrumBoundingBoxesOptions())
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create an instance of Docstrum for bounding boxes page segmenter using options values.
|
||||
/// </summary>
|
||||
/// <param name="options">The <see cref="DocstrumBoundingBoxesOptions"/> to use.</param>
|
||||
/// <exception cref="ArgumentNullException"></exception>
|
||||
public DocstrumBoundingBoxes(DocstrumBoundingBoxesOptions options)
|
||||
{
|
||||
this.options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks.
|
||||
/// </summary>
|
||||
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
|
||||
{
|
||||
return GetBlocks(words, new DocstrumBoundingBoxesOptions());
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks using options values.
|
||||
/// </summary>
|
||||
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <param name="options">The <see cref="DocstrumBoundingBoxesOptions"/> to use.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
|
||||
{
|
||||
if (options is DocstrumBoundingBoxesOptions dbbOptions)
|
||||
if (words?.Any() != true)
|
||||
{
|
||||
if (words?.Any() != true)
|
||||
{
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
|
||||
return GetBlocks(words.ToList(),
|
||||
dbbOptions.WithinLineBounds, dbbOptions.WithinLineMultiplier, dbbOptions.WithinLineBinSize,
|
||||
dbbOptions.BetweenLineBounds, dbbOptions.BetweenLineMultiplier, dbbOptions.BetweenLineBinSize,
|
||||
dbbOptions.AngularDifferenceBounds,
|
||||
dbbOptions.Epsilon,
|
||||
dbbOptions.WordSeparator, dbbOptions.LineSeparator,
|
||||
dbbOptions.MaxDegreeOfParallelism);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentException("Options provided must be of type " + nameof(DocstrumBoundingBoxesOptions) + ".", nameof(options));
|
||||
}
|
||||
return GetBlocks(words.ToList(),
|
||||
options.WithinLineBounds, options.WithinLineMultiplier, options.WithinLineBinSize,
|
||||
options.BetweenLineBounds, options.BetweenLineMultiplier, options.BetweenLineBinSize,
|
||||
options.AngularDifferenceBounds,
|
||||
options.Epsilon,
|
||||
options.WordSeparator, options.LineSeparator,
|
||||
options.MaxDegreeOfParallelism);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -612,8 +613,26 @@
|
||||
/// <summary>
|
||||
/// Docstrum bounding boxes page segmenter options.
|
||||
/// </summary>
|
||||
public class DocstrumBoundingBoxesOptions : PageSegmenterOptions
|
||||
public class DocstrumBoundingBoxesOptions : IPageSegmenterOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// <inheritdoc/>
|
||||
/// Default value is -1.
|
||||
/// </summary>
|
||||
public int MaxDegreeOfParallelism { get; set; } = -1;
|
||||
|
||||
/// <summary>
|
||||
/// <inheritdoc/>
|
||||
/// <para>Default value is ' ' (space).</para>
|
||||
/// </summary>
|
||||
public string WordSeparator { get; set; } = " ";
|
||||
|
||||
/// <summary>
|
||||
/// <inheritdoc/>
|
||||
/// <para>Default value is '\n' (new line).</para>
|
||||
/// </summary>
|
||||
public string LineSeparator { get; set; } = "\n";
|
||||
|
||||
/// <summary>
|
||||
/// Precision when testing equalities.
|
||||
/// <para>Default value is 1e-3.</para>
|
||||
@@ -640,7 +659,6 @@
|
||||
/// </summary>
|
||||
public int WithinLineBinSize { get; set; } = 10;
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Angle bounds for words to be considered as neighbours on separate lines.
|
||||
/// <para>Default value is 45 ≤ θ ≤ 135.</para>
|
||||
|
||||
@@ -10,18 +10,10 @@
|
||||
public interface IPageSegmenter
|
||||
{
|
||||
/// <summary>
|
||||
/// Get the blocks using default options values.
|
||||
/// Get the blocks.
|
||||
/// </summary>
|
||||
/// <param name="words">The page's words to generate text blocks for.</param>
|
||||
/// <returns>A list of text blocks from this approach.</returns>
|
||||
IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words);
|
||||
|
||||
/// <summary>
|
||||
/// Get the text blocks using options.
|
||||
/// </summary>
|
||||
/// <param name="words">The page's words to generate text blocks for.</param>
|
||||
/// <param name="options"></param>
|
||||
/// <returns>A list of text blocks from this approach.</returns>
|
||||
IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter
|
||||
{
|
||||
/// <summary>
|
||||
/// Abstract page segmenter options.
|
||||
/// Page segmenter options interface.
|
||||
/// </summary>
|
||||
public abstract class PageSegmenterOptions : DlaOptions
|
||||
public interface IPageSegmenterOptions : IDlaOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// Separator used between words when building lines.
|
||||
/// <para>Default value is ' ' (space).</para>
|
||||
/// </summary>
|
||||
public string WordSeparator { get; set; } = " ";
|
||||
string WordSeparator { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Separator used between lines when building paragraphs.
|
||||
/// <para>Default value is '\n' (new line).</para>
|
||||
/// </summary>
|
||||
public string LineSeparator { get; set; } = "\n";
|
||||
string LineSeparator { get; set; }
|
||||
}
|
||||
}
|
||||
@@ -15,47 +15,48 @@
|
||||
/// </summary>
|
||||
public class RecursiveXYCut : IPageSegmenter
|
||||
{
|
||||
private readonly RecursiveXYCutOptions options;
|
||||
|
||||
/// <summary>
|
||||
/// Create an instance of Recursive X-Y Cut page segmenter, <see cref="RecursiveXYCut"/>.
|
||||
/// </summary>
|
||||
public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks using default options values.
|
||||
/// Create an instance of Recursive X-Y Cut page segmenter using default options values.
|
||||
/// </summary>
|
||||
public RecursiveXYCut() : this(new RecursiveXYCutOptions())
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create an instance of Recursive X-Y Cut page segmenter using options values.
|
||||
/// </summary>
|
||||
/// <param name="options">The <see cref="RecursiveXYCutOptions"/> to use.</param>
|
||||
/// <exception cref="ArgumentNullException"></exception>
|
||||
public RecursiveXYCut(RecursiveXYCutOptions options)
|
||||
{
|
||||
this.options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks.
|
||||
/// </summary>
|
||||
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
|
||||
{
|
||||
return GetBlocks(words, new RecursiveXYCutOptions());
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks using options values.
|
||||
/// </summary>
|
||||
/// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <param name="options">The <see cref="RecursiveXYCutOptions"/> to use.</param>
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
|
||||
{
|
||||
if (options is RecursiveXYCutOptions ryxcOptions)
|
||||
if (words?.Any() != true)
|
||||
{
|
||||
if (words?.Any() != true)
|
||||
{
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
|
||||
return GetBlocks(words,
|
||||
ryxcOptions.MinimumWidth,
|
||||
ryxcOptions.DominantFontWidthFunc,
|
||||
ryxcOptions.DominantFontHeightFunc,
|
||||
ryxcOptions.WordSeparator,
|
||||
ryxcOptions.LineSeparator);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentException("Options provided must be of type " + nameof(RecursiveXYCutOptions) + ".", nameof(options));
|
||||
}
|
||||
return GetBlocks(words,
|
||||
options.MinimumWidth,
|
||||
options.DominantFontWidthFunc,
|
||||
options.DominantFontHeightFunc,
|
||||
options.WordSeparator,
|
||||
options.LineSeparator);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -92,7 +93,7 @@
|
||||
|
||||
if (leaves.Count > 0)
|
||||
{
|
||||
return leaves.Select(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator)).ToList();
|
||||
return leaves.ConvertAll(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -183,7 +184,11 @@
|
||||
}
|
||||
}
|
||||
}
|
||||
if (i == wordsCount - 1) projectionProfile.Add(currentProjection);
|
||||
|
||||
if (i == wordsCount - 1)
|
||||
{
|
||||
projectionProfile.Add(currentProjection);
|
||||
}
|
||||
}
|
||||
|
||||
var newLeavesEnums = projectionProfile.Select(p => leaf.Words.Where(w =>
|
||||
@@ -271,7 +276,11 @@
|
||||
}
|
||||
}
|
||||
}
|
||||
if (i == wordsCount - 1) projectionProfile.Add(currentProjection);
|
||||
|
||||
if (i == wordsCount - 1)
|
||||
{
|
||||
projectionProfile.Add(currentProjection);
|
||||
}
|
||||
}
|
||||
|
||||
if (projectionProfile.Count == 1)
|
||||
@@ -329,8 +338,26 @@
|
||||
/// <summary>
|
||||
/// Recursive X-Y cut page segmenter options.
|
||||
/// </summary>
|
||||
public class RecursiveXYCutOptions : PageSegmenterOptions
|
||||
public class RecursiveXYCutOptions : IPageSegmenterOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// <inheritdoc/>
|
||||
/// Default value is -1.
|
||||
/// </summary>
|
||||
public int MaxDegreeOfParallelism { get; set; } = -1;
|
||||
|
||||
/// <summary>
|
||||
/// <inheritdoc/>
|
||||
/// <para>Default value is ' ' (space).</para>
|
||||
/// </summary>
|
||||
public string WordSeparator { get; set; } = " ";
|
||||
|
||||
/// <summary>
|
||||
/// <inheritdoc/>
|
||||
/// <para>Default value is '\n' (new line).</para>
|
||||
/// </summary>
|
||||
public string LineSeparator { get; set; } = "\n";
|
||||
|
||||
/// <summary>
|
||||
/// The minimum width for a block.
|
||||
/// <para>Default value is 1.</para>
|
||||
|
||||
@@ -31,7 +31,6 @@
|
||||
public XYNode(params XYNode[] children)
|
||||
: this(children?.ToList())
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -40,12 +39,12 @@
|
||||
/// <param name="children">The node's children.</param>
|
||||
public XYNode(IEnumerable<XYNode> children)
|
||||
{
|
||||
if (children != null && children.Count() != 0)
|
||||
if (children?.Any() == true)
|
||||
{
|
||||
Children = children.ToArray();
|
||||
BoundingBox = new PdfRectangle(children.Min(b => b.BoundingBox.Left),
|
||||
children.Min(b => b.BoundingBox.Bottom),
|
||||
children.Max(b => b.BoundingBox.Right),
|
||||
BoundingBox = new PdfRectangle(children.Min(b => b.BoundingBox.Left),
|
||||
children.Min(b => b.BoundingBox.Bottom),
|
||||
children.Max(b => b.BoundingBox.Right),
|
||||
children.Max(b => b.BoundingBox.Top));
|
||||
}
|
||||
else
|
||||
@@ -87,7 +86,11 @@
|
||||
|
||||
private void RecursiveCount(IEnumerable<XYNode> children, ref int count)
|
||||
{
|
||||
if (children.Count() == 0) return;
|
||||
if (!children.Any())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
foreach (XYNode node in children.Where(x => x.IsLeaf))
|
||||
{
|
||||
count += node.CountWords();
|
||||
@@ -101,7 +104,11 @@
|
||||
|
||||
private void RecursiveGetLeaves(IEnumerable<XYNode> children, ref List<XYLeaf> leaves, int level)
|
||||
{
|
||||
if (children.Count() == 0) return;
|
||||
if (!children.Any())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
bool isVerticalCut = level % 2 == 0;
|
||||
|
||||
foreach (XYLeaf node in children.Where(x => x.IsLeaf))
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor
|
||||
{
|
||||
/// <summary>
|
||||
/// Word extractor options interface.
|
||||
/// </summary>
|
||||
public interface IWordExtractorOptions : IDlaOptions
|
||||
{
|
||||
}
|
||||
}
|
||||
@@ -13,77 +13,78 @@
|
||||
/// </summary>
|
||||
public class NearestNeighbourWordExtractor : IWordExtractor
|
||||
{
|
||||
private readonly NearestNeighbourWordExtractorOptions options;
|
||||
|
||||
/// <summary>
|
||||
/// Create an instance of Nearest Neighbour Word Extractor, <see cref="NearestNeighbourWordExtractor"/>.
|
||||
/// </summary>
|
||||
public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor();
|
||||
|
||||
/// <summary>
|
||||
/// Get the words using default options values.
|
||||
/// Create an instance of Nearest Neighbour Word Extractor using default options values.
|
||||
/// </summary>
|
||||
public NearestNeighbourWordExtractor() : this(new NearestNeighbourWordExtractorOptions())
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create an instance of Nearest Neighbour Word Extractor using options values.
|
||||
/// </summary>
|
||||
/// <param name="options">The <see cref="NearestNeighbourWordExtractorOptions"/> to use.</param>
|
||||
/// <exception cref="ArgumentNullException"></exception>
|
||||
public NearestNeighbourWordExtractor(NearestNeighbourWordExtractorOptions options)
|
||||
{
|
||||
this.options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the words.
|
||||
/// </summary>
|
||||
/// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param>
|
||||
/// <returns>The <see cref="Word"/>s generated by the nearest neighbour method.</returns>
|
||||
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
|
||||
{
|
||||
return GetWords(letters, new NearestNeighbourWordExtractorOptions());
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the words using options values.
|
||||
/// </summary>
|
||||
/// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param>
|
||||
/// <param name="options">The <see cref="NearestNeighbourWordExtractorOptions"/> to use.</param>
|
||||
/// <returns>The <see cref="Word"/>s generated by the nearest neighbour method.</returns>
|
||||
public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters, DlaOptions options)
|
||||
{
|
||||
if (options is NearestNeighbourWordExtractorOptions nnOptions)
|
||||
if (letters == null || letters.Count == 0)
|
||||
{
|
||||
if (letters == null || letters.Count == 0)
|
||||
{
|
||||
return EmptyArray<Word>.Instance;
|
||||
}
|
||||
return EmptyArray<Word>.Instance;
|
||||
}
|
||||
|
||||
if (nnOptions.GroupByOrientation)
|
||||
{
|
||||
// axis aligned
|
||||
List<Word> words = GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism);
|
||||
if (options.GroupByOrientation)
|
||||
{
|
||||
// axis aligned
|
||||
List<Word> words = GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism);
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
|
||||
// not axis aligned
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism));
|
||||
// not axis aligned
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasure, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
|
||||
return words;
|
||||
}
|
||||
else
|
||||
{
|
||||
return GetWords(letters,
|
||||
nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot,
|
||||
nnOptions.Filter, nnOptions.MaxDegreeOfParallelism);
|
||||
}
|
||||
return words;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentException("Options provided must be of type " + nameof(NearestNeighbourWordExtractorOptions) + ".", nameof(options));
|
||||
return GetWords(letters,
|
||||
options.MaximumDistance, options.DistanceMeasure, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,7 +108,10 @@
|
||||
Func<Letter, bool> filterPivotFunction,
|
||||
Func<Letter, Letter, bool> filterFunction, int maxDegreeOfParallelism)
|
||||
{
|
||||
if (letters == null || letters.Count == 0) return new List<Word>();
|
||||
if (letters == null || letters.Count == 0)
|
||||
{
|
||||
return new List<Word>();
|
||||
}
|
||||
|
||||
var groupedLetters = Clustering.NearestNeighbours(letters,
|
||||
distMeasure, maxDistanceFunction,
|
||||
@@ -128,11 +132,17 @@
|
||||
/// <summary>
|
||||
/// Nearest neighbour word extractor options.
|
||||
/// </summary>
|
||||
public class NearestNeighbourWordExtractorOptions : DlaOptions
|
||||
public class NearestNeighbourWordExtractorOptions : IWordExtractorOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// <inheritdoc/>
|
||||
/// Default value is -1.
|
||||
/// </summary>
|
||||
public int MaxDegreeOfParallelism { get; set; } = -1;
|
||||
|
||||
/// <summary>
|
||||
/// The maximum distance between two letters (start and end base line points) within the same word, as a function of the two letters.
|
||||
/// If the distance between the two letters is greater than this maximum, they will belong to different words.
|
||||
/// <para>If the distance between the two letters is greater than this maximum, they will belong to different words.</para>
|
||||
/// <para>Default value is 20% of the Max(Width, PointSize) of both letters. If <see cref="TextOrientation"/> is Other, this distance is doubled.</para>
|
||||
/// </summary>
|
||||
public Func<Letter, Letter, double> MaximumDistance { get; set; } = (l1, l2) =>
|
||||
@@ -159,15 +169,15 @@
|
||||
|
||||
/// <summary>
|
||||
/// The distance measure used between two letters (start and end base line points) with axis aligned <see cref="TextOrientation"/>.
|
||||
/// Only used if GroupByOrientation is set to true.
|
||||
/// <para>Only used if <see cref="GroupByOrientation"/> is set to <c>true</c>.</para>
|
||||
/// <para>Default value is the Manhattan distance.</para>
|
||||
/// </summary>
|
||||
public Func<PdfPoint, PdfPoint, double> DistanceMeasureAA { get; set; } = Distances.Manhattan;
|
||||
|
||||
/// <summary>
|
||||
/// Function used to filter out connection between letters, e.g. check if the letters have the same color.
|
||||
/// If the function returns false, letters will belong to different words.
|
||||
/// <para>Default value checks whether the neighbour is a white space or not. If it is the case, it returns false.</para>
|
||||
/// If the function returns <c>false</c>, letters will belong to different words.
|
||||
/// <para>Default value checks whether the neighbour is a white space or not. If it is the case, it returns <c>false</c>.</para>
|
||||
/// </summary>
|
||||
public Func<Letter, Letter, bool> Filter { get; set; } = (_, l2) => !string.IsNullOrWhiteSpace(l2.Value);
|
||||
|
||||
@@ -178,9 +188,9 @@
|
||||
public Func<Letter, bool> FilterPivot { get; set; } = l => !string.IsNullOrWhiteSpace(l.Value);
|
||||
|
||||
/// <summary>
|
||||
/// If true, letters will be grouped by <see cref="TextOrientation"/> before processing.
|
||||
/// The DistanceMeasureAA will be used on axis aligned letters, and the DistanceMeasure on others.
|
||||
/// If false, DistanceMeasure will be used for all letters and DistanceMeasureAA won't be used.
|
||||
/// If <c>true</c>, letters will be grouped by <see cref="TextOrientation"/> before processing.
|
||||
/// The <see cref="DistanceMeasureAA"/> will be used on axis aligned letters, and the <see cref="DistanceMeasure"/> on others.
|
||||
/// <para>If <c>false</c>, <see cref="DistanceMeasure"/> will be used for all letters, and <see cref="DistanceMeasureAA"/> won't be used.</para>
|
||||
/// <para>Default value is true.</para>
|
||||
/// </summary>
|
||||
public bool GroupByOrientation { get; set; } = true;
|
||||
|
||||
@@ -88,7 +88,7 @@
|
||||
{
|
||||
var page = document.GetPage(1);
|
||||
var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);
|
||||
var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(words, options);
|
||||
var blocks = new DocstrumBoundingBoxes(options).GetBlocks(words);
|
||||
|
||||
Assert.Equal(expected.Length, blocks.Count);
|
||||
var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X)
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
var page = document.GetPage(1);
|
||||
var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);
|
||||
var options = new RecursiveXYCut.RecursiveXYCutOptions() { MinimumWidth = page.Width / 3.0, LineSeparator = " " };
|
||||
var blocks = RecursiveXYCut.Instance.GetBlocks(words, options);
|
||||
var blocks = new RecursiveXYCut(options).GetBlocks(words);
|
||||
|
||||
Assert.Equal(expected.Length, blocks.Count);
|
||||
var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X)
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
DefaultWordExtractor.Instance,
|
||||
RecursiveXYCut.Instance,
|
||||
UnsupervisedReadingOrderDetector.Instance);
|
||||
var xml = GetXml(pageXmlTextExporter);
|
||||
var xml = GetXml(pageXmlTextExporter);
|
||||
|
||||
Assert.Contains("<ReadingOrder>", xml);
|
||||
Assert.Contains("</OrderedGroup>", xml);
|
||||
@@ -69,7 +69,7 @@
|
||||
public void ContainsExpectedText()
|
||||
{
|
||||
var xml = GetXml();
|
||||
Assert.Contains(@"2006 Swedish Touring Car Championship", xml);
|
||||
Assert.Contains("2006 Swedish Touring Car Championship", xml);
|
||||
// the coords for that text
|
||||
Assert.Contains(@"<Coords points=""35,77 35,62 397,62 397,77"" />", xml);
|
||||
}
|
||||
|
||||
@@ -79,10 +79,10 @@
|
||||
public int NumberOfImages => Content.NumberOfImages;
|
||||
|
||||
/// <summary>
|
||||
/// The parsed graphics state operations in the content stream for this page.
|
||||
/// The parsed graphics state operations in the content stream for this page.
|
||||
/// </summary>
|
||||
public IReadOnlyList<IGraphicsStateOperation> Operations => Content.GraphicsStateOperations;
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Access to members whose future locations within the API will change without warning.
|
||||
/// </summary>
|
||||
@@ -97,7 +97,7 @@
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative.");
|
||||
}
|
||||
|
||||
|
||||
Dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary));
|
||||
|
||||
Number = number;
|
||||
|
||||
Reference in New Issue
Block a user