From 05aba1cfe7366f633f35e26c43ca246327b6955a Mon Sep 17 00:00:00 2001 From: BobLD Date: Sat, 12 Mar 2022 13:23:40 +0000 Subject: [PATCH] Make DlaOptions an interface, add IWordExtractorOptions, remove GetBlocks(words, options), GetWords(letters, options) and put options in constructors - Fix #424. Tidy up code --- .../Export/AltoXmlTextExporter.cs | 5 +- .../Export/HOcrTextExporter.cs | 65 +++++---- .../Export/PageXmlTextExporter.cs | 6 +- .../Export/SvgTextExporter.cs | 16 ++- .../{DlaOptions.cs => IDlaOptions.cs} | 7 +- .../PageSegmenter/DefaultPageSegmenter.cs | 58 +++++--- .../PageSegmenter/DocstrumBoundingBoxes.cs | 80 ++++++----- .../PageSegmenter/IPageSegmenter.cs | 10 +- ...terOptions.cs => IPageSegmenterOptions.cs} | 10 +- .../PageSegmenter/RecursiveXYCut.cs | 91 ++++++++----- .../PageSegmenter/XYNode.cs | 21 ++- .../WordExtractor/IWordExtractorOptions.cs | 9 ++ .../NearestNeighbourWordExtractor.cs | 128 ++++++++++-------- .../Dla/DocstrumBoundingBoxesTests.cs | 2 +- .../Dla/RecursiveXYCutTests.cs | 2 +- .../Integration/PageXmlTextExporterTests.cs | 4 +- src/UglyToad.PdfPig/Content/Page.cs | 6 +- 17 files changed, 307 insertions(+), 213 deletions(-) rename src/UglyToad.PdfPig.DocumentLayoutAnalysis/{DlaOptions.cs => IDlaOptions.cs} (61%) rename src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/{PageSegmenterOptions.cs => IPageSegmenterOptions.cs} (50%) create mode 100644 src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/IWordExtractorOptions.cs diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs index 6e826e77..acbe5827 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs @@ -2,7 +2,6 @@ { using Alto; using Content; - using Core; using DocumentLayoutAnalysis; using System; using System.Globalization; @@ -147,7 +146,7 @@ altoPage.PrintSpace.TextBlock = blocks; altoPage.PrintSpace.Illustrations = page.GetImages().Select(i => ToAltoIllustration(i, page.Height)).ToArray(); - + if (includePaths) { altoPage.PrintSpace.GraphicalElements = page.ExperimentalAccess.Paths @@ -288,7 +287,7 @@ ProcessingSoftware = new AltoDocument.AltoProcessingSoftware { SoftwareName = "PdfPig", - SoftwareCreator = @"https://github.com/UglyToad/PdfPig", + SoftwareCreator = "https://github.com/UglyToad/PdfPig", ApplicationDescription = "Read and extract text and other content from PDFs in C# (port of PdfBox)", SoftwareVersion = "x.x.xx" }, diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs index 4f77dfc2..40cc53dc 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs @@ -49,7 +49,7 @@ /// /// The document. /// Draw PdfPaths present in the page. - /// Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the + /// Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the /// interface to a plain hOCR file.See https://github.com/kba/hocrjs for more information. public string Get(PdfDocument document, bool includePaths = false, bool useHocrjs = false) { @@ -61,10 +61,13 @@ hocr += GetCode(page, includePaths) + "\n"; } - if (useHocrjs) hocr += indentChar + indentChar + Hocrjs; + if (useHocrjs) + { + hocr += indentChar + indentChar + Hocrjs; + } + hocr += indentChar + ""; - hocr = XmlHeader + AddHtmlHeader(hocr); - return hocr; + return XmlHeader + AddHtmlHeader(hocr); } /// @@ -80,8 +83,8 @@ /// Get the hOCR (HTML) string of the page layout. /// /// The page. - /// The image name, if any. /// Draw PdfPaths present in the page. + /// The image name, if any. /// Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the interface to a plain hOCR file.See https://github.com/kba/hocrjs for more information. public string Get(Page page, bool includePaths = false, string imageName = "unknown", bool useHocrjs = false) { @@ -89,10 +92,13 @@ hocr += GetCode(page, includePaths, imageName) + "\n"; - if (useHocrjs) hocr += indentChar + indentChar + Hocrjs; + if (useHocrjs) + { + hocr += indentChar + indentChar + Hocrjs; + } + hocr += indentChar + ""; - hocr = XmlHeader + AddHtmlHeader(hocr); - return hocr; + return XmlHeader + AddHtmlHeader(hocr); } private string GetHead() @@ -129,14 +135,14 @@ /// http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page /// /// - /// /// Draw PdfPaths present in the page. + /// private string GetCode(Page page, bool includePaths, string imageName = "unknown") { pageCount++; int level = 2; - string hocr = GetIndent(level) + @"
"; @@ -156,16 +162,15 @@ var words = page.GetWords(wordExtractor); - if (words.Count() > 0) + if (words.Any()) { - var blocks = pageSegmenter.GetBlocks(words); - foreach (var block in blocks) + foreach (var block in pageSegmenter.GetBlocks(words)) { hocr += "\n" + GetCodeArea(block, page.Height, level + 1); } } - hocr += "\n" + GetIndent(level) + @"
"; + hocr += "\n" + GetIndent(level) + ""; return hocr; } @@ -179,7 +184,10 @@ /// The indent level. private string GetCode(PdfPath path, double pageHeight, bool subPaths, int level) { - if (path == null) return string.Empty; + if (path == null) + { + return string.Empty; + } string hocr = string.Empty; @@ -189,7 +197,7 @@ if (bbox.HasValue) { areaCount++; - hocr += GetIndent(level) + @"
\n"; foreach (var subPath in path) { @@ -197,11 +205,11 @@ if (subBbox.HasValue) { pathCount++; - hocr += GetIndent(level + 1) + @"\n"; } } - hocr += GetIndent(level) + @"
"; + hocr += GetIndent(level) + ""; } } else @@ -210,7 +218,7 @@ if (bbox.HasValue) { pathCount++; - hocr += GetIndent(level) + @""; } } @@ -222,7 +230,7 @@ { imageCount++; var bbox = pdfImage.Bounds; - return GetIndent(level) + @""; } @@ -237,12 +245,11 @@ { areaCount++; - string bbox = GetCode(block.BoundingBox, pageHeight); - string hocr = GetIndent(level) + @"
"; hocr += GetCodeParagraph(block, pageHeight, level + 1); // we concider 1 area = 1 block. should change in the future - hocr += "\n" + GetIndent(level) + @"
"; + hocr += "\n" + GetIndent(level) + ""; return hocr; } @@ -256,14 +263,14 @@ private string GetCodeParagraph(TextBlock block, double pageHeight, int level) { paraCount++; - string hocr = "\n" + GetIndent(level) + @"

"; // lang='eng' foreach (var line in block.TextLines) { hocr += "\n" + GetCode(line, pageHeight, level + 1); } - hocr += "\n" + GetIndent(level) + @"

"; + hocr += "\n" + GetIndent(level) + "

"; return hocr; } @@ -285,14 +292,14 @@ double baseLine = (double)line.Words[0].Letters[0].StartBaseLine.Y; baseLine = (double)line.BoundingBox.Bottom - baseLine; - string hocr = GetIndent(level) + @""; //"; x_size 42; x_descenders 5; x_ascenders 12' >"; foreach (var word in line.Words) { hocr += "\n" + GetCode(word, pageHeight, level + 1); } - hocr += "\n" + GetIndent(level) + @""; + hocr += "\n" + GetIndent(level) + "
"; return hocr; } @@ -307,7 +314,7 @@ { wordCount++; string hocr = GetIndent(level) + - @"{safeValue}" + return $"{safeValue}" + Environment.NewLine; } @@ -123,7 +123,11 @@ } } - if (Fonts.ContainsKey(fontName)) fontName = Fonts[fontName]; + if (Fonts.ContainsKey(fontName)) + { + fontName = Fonts[fontName]; + } + return fontName; } @@ -136,7 +140,11 @@ private static string ColorToSvg(IColor color) { - if (color == null) return ""; + if (color == null) + { + return string.Empty; + } + var (r, g, b) = color.ToRGBValues(); return $"rgb({Math.Ceiling(r * 255)},{Math.Ceiling(g * 255)},{Math.Ceiling(b * 255)})"; } diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DlaOptions.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/IDlaOptions.cs similarity index 61% rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/DlaOptions.cs rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/IDlaOptions.cs index 39537029..b8700add 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DlaOptions.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/IDlaOptions.cs @@ -1,16 +1,15 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis { /// - /// Abstract class that stores options that configure the operation of methods of the document layout analysis algorithm. + /// Interface that stores options that configure the operation of methods of the document layout analysis algorithm. /// - public abstract class DlaOptions + public interface IDlaOptions { /// /// Gets or sets the maximum number of concurrent tasks enabled. /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. - /// Default value is -1. /// - public int MaxDegreeOfParallelism { get; set; } = -1; + int MaxDegreeOfParallelism { get; set; } } } diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs index 7f847c50..4be72182 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs @@ -11,47 +11,67 @@ /// public class DefaultPageSegmenter : IPageSegmenter { + private readonly DefaultPageSegmenterOptions options; + /// /// Create an instance of default page segmenter, . /// public static DefaultPageSegmenter Instance { get; } = new DefaultPageSegmenter(); /// - /// Get the blocks using default options values. + /// using default options values. /// - /// The page's words to generate text blocks for. - public IReadOnlyList GetBlocks(IEnumerable words) + public DefaultPageSegmenter() : this(new DefaultPageSegmenterOptions()) { - return GetBlocks(words, new DefaultPageSegmenterOptions()); + } + + /// + /// Create using options values. + /// + /// The to use. + /// + public DefaultPageSegmenter(DefaultPageSegmenterOptions options) + { + this.options = options ?? throw new ArgumentNullException(nameof(options)); } /// /// Get the text blocks using options. /// /// The page's words to generate text blocks for. - /// The to use. /// The s generated by the default method. - public IReadOnlyList GetBlocks(IEnumerable words, DlaOptions options) + public IReadOnlyList GetBlocks(IEnumerable words) { - if (options is DefaultPageSegmenterOptions dOptions) + if (words?.Any() != true) { - if (words?.Any() != true) - { - return EmptyArray.Instance; - } + return EmptyArray.Instance; + } - return new List() { new TextBlock(new XYLeaf(words).GetLines(dOptions.WordSeparator), dOptions.LineSeparator) }; - } - else - { - throw new ArgumentException("Options provided must be of type " + nameof(DefaultPageSegmenterOptions) + ".", nameof(options)); - } + return new List() { new TextBlock(new XYLeaf(words).GetLines(options.WordSeparator), options.LineSeparator) }; } /// /// Default page segmenter options. /// - public class DefaultPageSegmenterOptions : PageSegmenterOptions - { } + public class DefaultPageSegmenterOptions : IPageSegmenterOptions + { + /// + /// + /// Default value is -1. + /// + public int MaxDegreeOfParallelism { get; set; } = -1; + + /// + /// + /// Default value is ' ' (space). + /// + public string WordSeparator { get; set; } = " "; + + /// + /// + /// Default value is '\n' (new line). + /// + public string LineSeparator { get; set; } = "\n"; + } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs index 6ca7ebf1..52823191 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs @@ -17,48 +17,49 @@ /// public class DocstrumBoundingBoxes : IPageSegmenter { + private readonly DocstrumBoundingBoxesOptions options; + /// /// Create an instance of Docstrum for bounding boxes page segmenter, . /// public static DocstrumBoundingBoxes Instance { get; } = new DocstrumBoundingBoxes(); /// - /// Get the blocks using default options values. + /// Create an instance of Docstrum for bounding boxes page segmenter using default options values. + /// + public DocstrumBoundingBoxes() : this(new DocstrumBoundingBoxesOptions()) + { + } + + /// + /// Create an instance of Docstrum for bounding boxes page segmenter using options values. + /// + /// The to use. + /// + public DocstrumBoundingBoxes(DocstrumBoundingBoxesOptions options) + { + this.options = options ?? throw new ArgumentNullException(nameof(options)); + } + + /// + /// Get the blocks. /// /// The page's words to segment into s. /// The s generated by the document spectrum method. public IReadOnlyList GetBlocks(IEnumerable words) { - return GetBlocks(words, new DocstrumBoundingBoxesOptions()); - } - - /// - /// Get the blocks using options values. - /// - /// The page's words to segment into s. - /// The to use. - /// The s generated by the document spectrum method. - public IReadOnlyList GetBlocks(IEnumerable words, DlaOptions options) - { - if (options is DocstrumBoundingBoxesOptions dbbOptions) + if (words?.Any() != true) { - if (words?.Any() != true) - { - return EmptyArray.Instance; - } + return EmptyArray.Instance; + } - return GetBlocks(words.ToList(), - dbbOptions.WithinLineBounds, dbbOptions.WithinLineMultiplier, dbbOptions.WithinLineBinSize, - dbbOptions.BetweenLineBounds, dbbOptions.BetweenLineMultiplier, dbbOptions.BetweenLineBinSize, - dbbOptions.AngularDifferenceBounds, - dbbOptions.Epsilon, - dbbOptions.WordSeparator, dbbOptions.LineSeparator, - dbbOptions.MaxDegreeOfParallelism); - } - else - { - throw new ArgumentException("Options provided must be of type " + nameof(DocstrumBoundingBoxesOptions) + ".", nameof(options)); - } + return GetBlocks(words.ToList(), + options.WithinLineBounds, options.WithinLineMultiplier, options.WithinLineBinSize, + options.BetweenLineBounds, options.BetweenLineMultiplier, options.BetweenLineBinSize, + options.AngularDifferenceBounds, + options.Epsilon, + options.WordSeparator, options.LineSeparator, + options.MaxDegreeOfParallelism); } /// @@ -612,8 +613,26 @@ /// /// Docstrum bounding boxes page segmenter options. /// - public class DocstrumBoundingBoxesOptions : PageSegmenterOptions + public class DocstrumBoundingBoxesOptions : IPageSegmenterOptions { + /// + /// + /// Default value is -1. + /// + public int MaxDegreeOfParallelism { get; set; } = -1; + + /// + /// + /// Default value is ' ' (space). + /// + public string WordSeparator { get; set; } = " "; + + /// + /// + /// Default value is '\n' (new line). + /// + public string LineSeparator { get; set; } = "\n"; + /// /// Precision when testing equalities. /// Default value is 1e-3. @@ -640,7 +659,6 @@ /// public int WithinLineBinSize { get; set; } = 10; - /// /// Angle bounds for words to be considered as neighbours on separate lines. /// Default value is 45 ≤ θ ≤ 135. diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenter.cs index 26daa85a..117ac6d7 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenter.cs @@ -10,18 +10,10 @@ public interface IPageSegmenter { /// - /// Get the blocks using default options values. + /// Get the blocks. /// /// The page's words to generate text blocks for. /// A list of text blocks from this approach. IReadOnlyList GetBlocks(IEnumerable words); - - /// - /// Get the text blocks using options. - /// - /// The page's words to generate text blocks for. - /// - /// A list of text blocks from this approach. - IReadOnlyList GetBlocks(IEnumerable words, DlaOptions options); } } diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/PageSegmenterOptions.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenterOptions.cs similarity index 50% rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/PageSegmenterOptions.cs rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenterOptions.cs index aeb9eb5d..483f7c45 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/PageSegmenterOptions.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/IPageSegmenterOptions.cs @@ -1,20 +1,18 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter { /// - /// Abstract page segmenter options. + /// Page segmenter options interface. /// - public abstract class PageSegmenterOptions : DlaOptions + public interface IPageSegmenterOptions : IDlaOptions { /// /// Separator used between words when building lines. - /// Default value is ' ' (space). /// - public string WordSeparator { get; set; } = " "; + string WordSeparator { get; set; } /// /// Separator used between lines when building paragraphs. - /// Default value is '\n' (new line). /// - public string LineSeparator { get; set; } = "\n"; + string LineSeparator { get; set; } } } diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs index 96601614..b09a9265 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs @@ -15,47 +15,48 @@ /// public class RecursiveXYCut : IPageSegmenter { + private readonly RecursiveXYCutOptions options; + /// /// Create an instance of Recursive X-Y Cut page segmenter, . /// public static RecursiveXYCut Instance { get; } = new RecursiveXYCut(); /// - /// Get the blocks using default options values. + /// Create an instance of Recursive X-Y Cut page segmenter using default options values. + /// + public RecursiveXYCut() : this(new RecursiveXYCutOptions()) + { + } + + /// + /// Create an instance of Recursive X-Y Cut page segmenter using options values. + /// + /// The to use. + /// + public RecursiveXYCut(RecursiveXYCutOptions options) + { + this.options = options ?? throw new ArgumentNullException(nameof(options)); + } + + /// + /// Get the blocks. /// /// The page's words to segment into s. /// The s generated by the Recursive X-Y cut method. public IReadOnlyList GetBlocks(IEnumerable words) { - return GetBlocks(words, new RecursiveXYCutOptions()); - } - - /// - /// Get the blocks using options values. - /// - /// The page's words to segment into s. - /// The to use. - /// The s generated by the Recursive X-Y cut method. - public IReadOnlyList GetBlocks(IEnumerable words, DlaOptions options) - { - if (options is RecursiveXYCutOptions ryxcOptions) + if (words?.Any() != true) { - if (words?.Any() != true) - { - return EmptyArray.Instance; - } + return EmptyArray.Instance; + } - return GetBlocks(words, - ryxcOptions.MinimumWidth, - ryxcOptions.DominantFontWidthFunc, - ryxcOptions.DominantFontHeightFunc, - ryxcOptions.WordSeparator, - ryxcOptions.LineSeparator); - } - else - { - throw new ArgumentException("Options provided must be of type " + nameof(RecursiveXYCutOptions) + ".", nameof(options)); - } + return GetBlocks(words, + options.MinimumWidth, + options.DominantFontWidthFunc, + options.DominantFontHeightFunc, + options.WordSeparator, + options.LineSeparator); } /// @@ -92,7 +93,7 @@ if (leaves.Count > 0) { - return leaves.Select(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator)).ToList(); + return leaves.ConvertAll(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator)); } } @@ -183,7 +184,11 @@ } } } - if (i == wordsCount - 1) projectionProfile.Add(currentProjection); + + if (i == wordsCount - 1) + { + projectionProfile.Add(currentProjection); + } } var newLeavesEnums = projectionProfile.Select(p => leaf.Words.Where(w => @@ -271,7 +276,11 @@ } } } - if (i == wordsCount - 1) projectionProfile.Add(currentProjection); + + if (i == wordsCount - 1) + { + projectionProfile.Add(currentProjection); + } } if (projectionProfile.Count == 1) @@ -329,8 +338,26 @@ /// /// Recursive X-Y cut page segmenter options. /// - public class RecursiveXYCutOptions : PageSegmenterOptions + public class RecursiveXYCutOptions : IPageSegmenterOptions { + /// + /// + /// Default value is -1. + /// + public int MaxDegreeOfParallelism { get; set; } = -1; + + /// + /// + /// Default value is ' ' (space). + /// + public string WordSeparator { get; set; } = " "; + + /// + /// + /// Default value is '\n' (new line). + /// + public string LineSeparator { get; set; } = "\n"; + /// /// The minimum width for a block. /// Default value is 1. diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYNode.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYNode.cs index 4303acca..9484e598 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYNode.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/XYNode.cs @@ -31,7 +31,6 @@ public XYNode(params XYNode[] children) : this(children?.ToList()) { - } /// @@ -40,12 +39,12 @@ /// The node's children. public XYNode(IEnumerable children) { - if (children != null && children.Count() != 0) + if (children?.Any() == true) { Children = children.ToArray(); - BoundingBox = new PdfRectangle(children.Min(b => b.BoundingBox.Left), - children.Min(b => b.BoundingBox.Bottom), - children.Max(b => b.BoundingBox.Right), + BoundingBox = new PdfRectangle(children.Min(b => b.BoundingBox.Left), + children.Min(b => b.BoundingBox.Bottom), + children.Max(b => b.BoundingBox.Right), children.Max(b => b.BoundingBox.Top)); } else @@ -87,7 +86,11 @@ private void RecursiveCount(IEnumerable children, ref int count) { - if (children.Count() == 0) return; + if (!children.Any()) + { + return; + } + foreach (XYNode node in children.Where(x => x.IsLeaf)) { count += node.CountWords(); @@ -101,7 +104,11 @@ private void RecursiveGetLeaves(IEnumerable children, ref List leaves, int level) { - if (children.Count() == 0) return; + if (!children.Any()) + { + return; + } + bool isVerticalCut = level % 2 == 0; foreach (XYLeaf node in children.Where(x => x.IsLeaf)) diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/IWordExtractorOptions.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/IWordExtractorOptions.cs new file mode 100644 index 00000000..b35825c0 --- /dev/null +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/IWordExtractorOptions.cs @@ -0,0 +1,9 @@ +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor +{ + /// + /// Word extractor options interface. + /// + public interface IWordExtractorOptions : IDlaOptions + { + } +} diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs index 2c181231..45fabc4f 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs @@ -13,77 +13,78 @@ /// public class NearestNeighbourWordExtractor : IWordExtractor { + private readonly NearestNeighbourWordExtractorOptions options; + /// /// Create an instance of Nearest Neighbour Word Extractor, . /// public static NearestNeighbourWordExtractor Instance { get; } = new NearestNeighbourWordExtractor(); /// - /// Get the words using default options values. + /// Create an instance of Nearest Neighbour Word Extractor using default options values. + /// + public NearestNeighbourWordExtractor() : this(new NearestNeighbourWordExtractorOptions()) + { + } + + /// + /// Create an instance of Nearest Neighbour Word Extractor using options values. + /// + /// The to use. + /// + public NearestNeighbourWordExtractor(NearestNeighbourWordExtractorOptions options) + { + this.options = options ?? throw new ArgumentNullException(nameof(options)); + } + + /// + /// Get the words. /// /// The page's letters to group into s. /// The s generated by the nearest neighbour method. public IEnumerable GetWords(IReadOnlyList letters) { - return GetWords(letters, new NearestNeighbourWordExtractorOptions()); - } - - /// - /// Get the words using options values. - /// - /// The page's letters to group into s. - /// The to use. - /// The s generated by the nearest neighbour method. - public IEnumerable GetWords(IReadOnlyList letters, DlaOptions options) - { - if (options is NearestNeighbourWordExtractorOptions nnOptions) + if (letters == null || letters.Count == 0) { - if (letters == null || letters.Count == 0) - { - return EmptyArray.Instance; - } + return EmptyArray.Instance; + } - if (nnOptions.GroupByOrientation) - { - // axis aligned - List words = GetWords( - letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(), - nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot, - nnOptions.Filter, nnOptions.MaxDegreeOfParallelism); + if (options.GroupByOrientation) + { + // axis aligned + List words = GetWords( + letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(), + options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot, + options.Filter, options.MaxDegreeOfParallelism); - words.AddRange(GetWords( - letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(), - nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot, - nnOptions.Filter, nnOptions.MaxDegreeOfParallelism)); + words.AddRange(GetWords( + letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(), + options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot, + options.Filter, options.MaxDegreeOfParallelism)); - words.AddRange(GetWords( - letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(), - nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot, - nnOptions.Filter, nnOptions.MaxDegreeOfParallelism)); + words.AddRange(GetWords( + letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(), + options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot, + options.Filter, options.MaxDegreeOfParallelism)); - words.AddRange(GetWords( - letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(), - nnOptions.MaximumDistance, nnOptions.DistanceMeasureAA, nnOptions.FilterPivot, - nnOptions.Filter, nnOptions.MaxDegreeOfParallelism)); + words.AddRange(GetWords( + letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(), + options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot, + options.Filter, options.MaxDegreeOfParallelism)); - // not axis aligned - words.AddRange(GetWords( - letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(), - nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot, - nnOptions.Filter, nnOptions.MaxDegreeOfParallelism)); + // not axis aligned + words.AddRange(GetWords( + letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(), + options.MaximumDistance, options.DistanceMeasure, options.FilterPivot, + options.Filter, options.MaxDegreeOfParallelism)); - return words; - } - else - { - return GetWords(letters, - nnOptions.MaximumDistance, nnOptions.DistanceMeasure, nnOptions.FilterPivot, - nnOptions.Filter, nnOptions.MaxDegreeOfParallelism); - } + return words; } else { - throw new ArgumentException("Options provided must be of type " + nameof(NearestNeighbourWordExtractorOptions) + ".", nameof(options)); + return GetWords(letters, + options.MaximumDistance, options.DistanceMeasure, options.FilterPivot, + options.Filter, options.MaxDegreeOfParallelism); } } @@ -107,7 +108,10 @@ Func filterPivotFunction, Func filterFunction, int maxDegreeOfParallelism) { - if (letters == null || letters.Count == 0) return new List(); + if (letters == null || letters.Count == 0) + { + return new List(); + } var groupedLetters = Clustering.NearestNeighbours(letters, distMeasure, maxDistanceFunction, @@ -128,11 +132,17 @@ /// /// Nearest neighbour word extractor options. /// - public class NearestNeighbourWordExtractorOptions : DlaOptions + public class NearestNeighbourWordExtractorOptions : IWordExtractorOptions { + /// + /// + /// Default value is -1. + /// + public int MaxDegreeOfParallelism { get; set; } = -1; + /// /// The maximum distance between two letters (start and end base line points) within the same word, as a function of the two letters. - /// If the distance between the two letters is greater than this maximum, they will belong to different words. + /// If the distance between the two letters is greater than this maximum, they will belong to different words. /// Default value is 20% of the Max(Width, PointSize) of both letters. If is Other, this distance is doubled. /// public Func MaximumDistance { get; set; } = (l1, l2) => @@ -159,15 +169,15 @@ /// /// The distance measure used between two letters (start and end base line points) with axis aligned . - /// Only used if GroupByOrientation is set to true. + /// Only used if is set to true. /// Default value is the Manhattan distance. /// public Func DistanceMeasureAA { get; set; } = Distances.Manhattan; /// /// Function used to filter out connection between letters, e.g. check if the letters have the same color. - /// If the function returns false, letters will belong to different words. - /// Default value checks whether the neighbour is a white space or not. If it is the case, it returns false. + /// If the function returns false, letters will belong to different words. + /// Default value checks whether the neighbour is a white space or not. If it is the case, it returns false. /// public Func Filter { get; set; } = (_, l2) => !string.IsNullOrWhiteSpace(l2.Value); @@ -178,9 +188,9 @@ public Func FilterPivot { get; set; } = l => !string.IsNullOrWhiteSpace(l.Value); /// - /// If true, letters will be grouped by before processing. - /// The DistanceMeasureAA will be used on axis aligned letters, and the DistanceMeasure on others. - /// If false, DistanceMeasure will be used for all letters and DistanceMeasureAA won't be used. + /// If true, letters will be grouped by before processing. + /// The will be used on axis aligned letters, and the on others. + /// If false, will be used for all letters, and won't be used. /// Default value is true. /// public bool GroupByOrientation { get; set; } = true; diff --git a/src/UglyToad.PdfPig.Tests/Dla/DocstrumBoundingBoxesTests.cs b/src/UglyToad.PdfPig.Tests/Dla/DocstrumBoundingBoxesTests.cs index b1c48000..fb01007a 100644 --- a/src/UglyToad.PdfPig.Tests/Dla/DocstrumBoundingBoxesTests.cs +++ b/src/UglyToad.PdfPig.Tests/Dla/DocstrumBoundingBoxesTests.cs @@ -88,7 +88,7 @@ { var page = document.GetPage(1); var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters); - var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(words, options); + var blocks = new DocstrumBoundingBoxes(options).GetBlocks(words); Assert.Equal(expected.Length, blocks.Count); var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X) diff --git a/src/UglyToad.PdfPig.Tests/Dla/RecursiveXYCutTests.cs b/src/UglyToad.PdfPig.Tests/Dla/RecursiveXYCutTests.cs index 619356a8..69f9906f 100644 --- a/src/UglyToad.PdfPig.Tests/Dla/RecursiveXYCutTests.cs +++ b/src/UglyToad.PdfPig.Tests/Dla/RecursiveXYCutTests.cs @@ -40,7 +40,7 @@ var page = document.GetPage(1); var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters); var options = new RecursiveXYCut.RecursiveXYCutOptions() { MinimumWidth = page.Width / 3.0, LineSeparator = " " }; - var blocks = RecursiveXYCut.Instance.GetBlocks(words, options); + var blocks = new RecursiveXYCut(options).GetBlocks(words); Assert.Equal(expected.Length, blocks.Count); var orderedBlocks = blocks.OrderBy(b => b.BoundingBox.BottomLeft.X) diff --git a/src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs b/src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs index 519a932c..1860f24e 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs @@ -43,7 +43,7 @@ DefaultWordExtractor.Instance, RecursiveXYCut.Instance, UnsupervisedReadingOrderDetector.Instance); - var xml = GetXml(pageXmlTextExporter); + var xml = GetXml(pageXmlTextExporter); Assert.Contains("", xml); Assert.Contains("", xml); @@ -69,7 +69,7 @@ public void ContainsExpectedText() { var xml = GetXml(); - Assert.Contains(@"2006 Swedish Touring Car Championship", xml); + Assert.Contains("2006 Swedish Touring Car Championship", xml); // the coords for that text Assert.Contains(@"", xml); } diff --git a/src/UglyToad.PdfPig/Content/Page.cs b/src/UglyToad.PdfPig/Content/Page.cs index 1b3ac8ab..31acdd2b 100644 --- a/src/UglyToad.PdfPig/Content/Page.cs +++ b/src/UglyToad.PdfPig/Content/Page.cs @@ -79,10 +79,10 @@ public int NumberOfImages => Content.NumberOfImages; /// - /// The parsed graphics state operations in the content stream for this page. + /// The parsed graphics state operations in the content stream for this page. /// public IReadOnlyList Operations => Content.GraphicsStateOperations; - + /// /// Access to members whose future locations within the API will change without warning. /// @@ -97,7 +97,7 @@ { throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative."); } - + Dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary)); Number = number;