Support for hORC, AtloXml and PageXml output formats

Tested with:
- 'hocrjs' for hORC (see https://unpkg.com/hocrjs)
- 'PAGE Viewer' for hORC, AtloXml and PageXml (see http://www.primaresearch.org/tools/PAGEViewer)
This commit is contained in:
BobLd
2019-10-07 15:19:30 +01:00
parent c3da10055b
commit 93313118e9
8 changed files with 15547 additions and 1 deletions

View File

@@ -177,6 +177,49 @@
{
return letter.PointSize;
}
/// <summary>
/// Get the hOCR (html) string of the page layout.
/// <para>This is considered experimental because it needs more testing.</para>
/// </summary>
/// <param name="wordExtractor">The word extractor to use to generate words.</param>
/// <param name="pageSegmenter">The page segmenter to use.</param>
/// <param name="indent">Indent character to use.</param>
/// <param name="drawPaths">Draw <see cref="PdfPath"/>s present in the page.</param>
/// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the interface to a plain hOCR file.<para>See https://github.com/kba/hocrjs for more information.</para></param>
public string GetHOCR(IWordExtractor wordExtractor, DocumentLayoutAnalysis.IPageSegmenter pageSegmenter, string indent = "\t", bool drawPaths = false, bool useHocrjs = false)
{
var hocr = new Export.HOcrTextExporter(wordExtractor, pageSegmenter, 2, indent);
return hocr.Get(page, drawPaths, useHocrjs: useHocrjs);
}
/// <summary>
/// Get the Alto (xml) string of the page layout.
/// <para>This is considered experimental because it needs more testing.</para>
/// </summary>
/// <param name="wordExtractor">The word extractor to use to generate words.</param>
/// <param name="pageSegmenter">The page segmenter to use.</param>
/// <param name="indent">Indent character to use.</param>
/// <param name="drawPaths">Draw <see cref="PdfPath"/>s present in the page.</param>
public string GetAltoXml(IWordExtractor wordExtractor, DocumentLayoutAnalysis.IPageSegmenter pageSegmenter, string indent = "\t", bool drawPaths = false)
{
var alto = new Export.AltoXmlTextExporter(wordExtractor, pageSegmenter, 2, indent);
return alto.Get(page, drawPaths);
}
/// <summary>
/// Get the PageXml (xml) string of the page layout.
/// <para>This is considered experimental because it needs more testing.</para>
/// </summary>
/// <param name="wordExtractor">The word extractor to use to generate words.</param>
/// <param name="pageSegmenter">The page segmenter to use.</param>
/// <param name="indent">Indent character to use.</param>
/// <param name="drawPaths">Draw <see cref="PdfPath"/>s present in the page.</param>
public string GetPageXml(IWordExtractor wordExtractor, DocumentLayoutAnalysis.IPageSegmenter pageSegmenter, string indent = "\t", bool drawPaths = false)
{
var pageXml = new Export.PageXmlTextExporter(wordExtractor, pageSegmenter, 2, indent);
return pageXml.Get(page, drawPaths);
}
}
}
}

View File

@@ -0,0 +1,29 @@
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Util;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// Default Page Segmenter. All words are included in one block.
/// </summary>
public class DefaultPageSegmenter : IPageSegmenter
{
/// <summary>
/// Create an instance of default page segmenter, <see cref="DefaultPageSegmenter"/>.
/// </summary>
public static DefaultPageSegmenter Instance { get; } = new DefaultPageSegmenter();
/// <summary>
/// Get the blocks.
/// </summary>
/// <param name="pageWords">The words in the page.</param>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
{
if (pageWords.Count() == 0) return EmptyArray<TextBlock>.Instance;
return new List<TextBlock>() { new TextBlock(new XYLeaf(pageWords).GetLines()) };
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,345 @@
using System;
using System.Linq;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis;
using UglyToad.PdfPig.Geometry;
using UglyToad.PdfPig.Util;
namespace UglyToad.PdfPig.Export
{
/// <summary>
/// hOCR v1.2 (HTML) text exporter.
/// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
/// </summary>
internal class HOcrTextExporter : ITextExporter
{
private const string xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n";
private const string hocrjs = "<script src='https://unpkg.com/hocrjs'></script>\n";
private IPageSegmenter pageSegmenter;
private IWordExtractor wordExtractor;
private decimal scale;
private string indentChar;
private int pageCount = 0;
private int areaCount = 0;
private int lineCount = 0;
private int wordCount = 0;
private int pathCount = 0;
private int paraCount = 0;
/// <summary>
/// hOCR v1.2 (HTML)
/// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
/// </summary>
/// <param name="wordExtractor"></param>
/// <param name="pageSegmenter"></param>
/// <param name="scale"></param>
/// <param name="indent">Indent character.</param>
public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1.0, string indent = "\t")
{
this.wordExtractor = wordExtractor;
this.pageSegmenter = pageSegmenter;
this.scale = (decimal)scale;
this.indentChar = indent;
}
/// <summary>
/// Get the hORC (HTML) string of the page layout.
/// </summary>
/// <param name="document">The document.</param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
/// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the
/// interface to a plain hOCR file.<para>See https://github.com/kba/hocrjs for more information.</para></param>
public string Get(PdfDocument document, bool includePaths = false, bool useHocrjs = false)
{
string hocr = GetHead() + indentChar + "<body>\n";
for (var i = 0; i < document.NumberOfPages; i++)
{
var page = document.GetPage(i + 1);
hocr += GetCode(page, includePaths) + "\n";
}
if (useHocrjs) hocr += indentChar + indentChar + hocrjs;
hocr += indentChar + "</body>";
hocr = xmlHeader + AddHtmlHeader(hocr);
return hocr;
}
/// <summary>
/// Get the hORC (HTML) string of the page layout. Excludes <see cref="PdfPath"/>s.
/// </summary>
/// <param name="page">The page.</param>
/// <returns></returns>
public string Get(Page page)
{
return Get(page, false);
}
/// <summary>
/// Get the hORC (HTML) string of the page layout.
/// </summary>
/// <param name="page">The page.</param>
/// <param name="imageName">The image name, if any.</param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
/// <param name="useHocrjs">Will add a reference to the 'hocrjs' script just before the closing 'body' tag, adding the interface to a plain hOCR file.<para>See https://github.com/kba/hocrjs for more information.</para></param>
public string Get(Page page, bool includePaths = false, string imageName = "unknown", bool useHocrjs = false)
{
string hocr = GetHead() + indentChar + "<body>\n";
hocr += GetCode(page, includePaths, imageName) + "\n";
if (useHocrjs) hocr += indentChar + indentChar + hocrjs;
hocr += indentChar + "</body>";
hocr = xmlHeader + AddHtmlHeader(hocr);
return hocr;
}
private string GetHead()
{
return indentChar + "<head>" +
"\n" + indentChar + indentChar + "<title></title>" +
"\n" + indentChar + indentChar + "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />" +
"\n" + indentChar + indentChar + "<meta name='ocr-system' content='" + pageSegmenter.GetType().Name + "|" + wordExtractor.GetType().Name + "' />" +
"\n" + indentChar + indentChar + "<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocr_linedrawing' />" +
"\n" + indentChar + "</head>\n";
}
private string AddHtmlHeader(string content)
{
return "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n" + content + "\n</html>";
}
/// <summary>
/// Get indent string from level.
/// </summary>
/// <param name="level">The indent level.</param>
/// <returns></returns>
private string GetIndent(int level)
{
string indent = "";
for (int i = 0; i < level; i++)
{
indent += indentChar;
}
return indent;
}
/// <summary>
/// Get the hORC string for the page.
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para>
/// </summary>
/// <param name="page"></param>
/// <param name="imageName"></param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
private string GetCode(Page page, bool includePaths, string imageName = "unknown")
{
pageCount++;
int level = 2;
string hocr = GetIndent(level) + @"<div class='ocr_page' id='page_" + page.Number.ToString() +
"' title='image \"" + imageName + "\"; bbox 0 0 " +
(int)Math.Round(page.Width * scale) + " " + (int)Math.Round(page.Height * scale) +
"; ppageno " + (page.Number - 1) + "\'>";
if (includePaths)
{
foreach (var path in page.ExperimentalAccess.Paths)
{
hocr += "\n" + GetCode(path, page.Height, true, level + 1);
}
}
var words = page.GetWords(wordExtractor);
if (words.Count() > 0)
{
var blocks = pageSegmenter.GetBlocks(words);
foreach (var block in blocks)
{
hocr += "\n" + GetCodeArea(block, page.Height, level + 1);
}
}
hocr += "\n" + GetIndent(level) + @"</div>";
return hocr;
}
/// <summary>
/// Get the hORC string for the path.
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_linedrawing</para>
/// </summary>
/// <param name="path"></param>
/// <param name="pageHeight"></param>
/// <param name="subPaths"></param>
/// <param name="level">The indent level.</param>
/// <returns></returns>
private string GetCode(PdfPath path, decimal pageHeight, bool subPaths, int level)
{
if (path == null) return string.Empty;
string hocr = string.Empty;
if (subPaths)
{
var bbox = path.GetBoundingRectangle();
if (bbox.HasValue)
{
areaCount++;
hocr += GetIndent(level) + @"<div class='ocr_carea' id='block_" + pageCount + "_"
+ areaCount + "' title='" + GetCode(bbox.Value, pageHeight) + "'>\n";
foreach (var subPath in path.Commands)
{
var subBbox = subPath.GetBoundingRectangle();
if (subBbox.HasValue)
{
pathCount++;
hocr += GetIndent(level + 1) + @"<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
+ pathCount + "' title='" + GetCode(subBbox.Value, pageHeight) + "' />\n";
}
}
hocr += GetIndent(level) + @"</div>";
}
}
else
{
var bbox = path.GetBoundingRectangle();
if (bbox.HasValue)
{
pathCount++;
hocr += GetIndent(level) + @"<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
+ pathCount + "' title='" + GetCode(bbox.Value, pageHeight) + "' />";
}
}
return hocr;
}
/// <summary>
/// Get the hORC string for the area.
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea</para>
/// </summary>
/// <param name="block">The text area.</param>
/// <param name="pageHeight"></param>
/// <param name="level">The indent level.</param>
private string GetCodeArea(TextBlock block, decimal pageHeight, int level)
{
areaCount++;
string bbox = GetCode(block.BoundingBox, pageHeight);
string hocr = GetIndent(level) + @"<div class='ocr_carea' id='block_" + pageCount + "_"
+ areaCount + "' title='" + GetCode(block.BoundingBox, pageHeight) + "'>";
hocr += GetCodeParagraph(block, pageHeight, level + 1); // we concider 1 area = 1 block. should change in the future
hocr += "\n" + GetIndent(level) + @"</div>";
return hocr;
}
/// <summary>
/// Get the hORC string for the paragraph.
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_par</para>
/// </summary>
/// <param name="block">The paragraph.</param>
/// <param name="pageHeight"></param>
/// <param name="level">The indent level.</param>
/// <returns></returns>
private string GetCodeParagraph(TextBlock block, decimal pageHeight, int level)
{
paraCount++;
string hocr = "\n" + GetIndent(level) + @"<p class='ocr_par' id='par_" + pageCount + "_"
+ paraCount + "' title='" + GetCode(block.BoundingBox, pageHeight) + "'>"; // lang='eng'
foreach (var line in block.TextLines)
{
hocr += "\n" + GetCode(line, pageHeight, level + 1);
}
hocr += "\n" + GetIndent(level) + @"</p>";
return hocr;
}
/// <summary>
/// Get the hORC string for the text line.
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_line</para>
/// </summary>
/// <param name="line"></param>
/// <param name="pageHeight"></param>
/// <param name="level">The indent level.</param>
private string GetCode(TextLine line, decimal pageHeight, int level)
{
lineCount++;
double angle = 0;
// http://kba.cloud/hocr-spec/1.2/#propdef-baseline
// below will be 0 as long as the word's bounding box bottom is the BaseLine and not 'Bottom'
double baseLine = (double)line.Words[0].Letters[0].StartBaseLine.Y;
baseLine = (double)line.BoundingBox.Bottom - baseLine;
string hocr = GetIndent(level) + @"<span class='ocr_line' id='line_" + pageCount + "_" + lineCount + "' title='" +
GetCode(line.BoundingBox, pageHeight) + "; baseline " + angle + " 0'>"; //"; x_size 42; x_descenders 5; x_ascenders 12' >";
foreach (var word in line.Words)
{
hocr += "\n" + GetCode(word, pageHeight, level + 1);
}
hocr += "\n" + GetIndent(level) + @"</span>";
return hocr;
}
/// <summary>
/// Get the hORC string for the word.
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocrx_word</para>
/// </summary>
/// <param name="word"></param>
/// <param name="pageHeight"></param>
/// <param name="level">The indent level.</param>
private string GetCode(Word word, decimal pageHeight, int level)
{
wordCount++;
string hocr = GetIndent(level) +
@"<span class='ocrx_word' id='word_" + pageCount + "_" + wordCount +
"' title='" + GetCode(word.BoundingBox, pageHeight) + "; x_wconf " + GetConfidence(word);
hocr += "; x_font " + word.FontName;
if (word.Letters.Count > 0 && word.Letters[0].FontSize != 1)
{
hocr += "; x_fsize " + word.Letters[0].FontSize;
}
hocr += "'";
hocr += ">" + word.Text + "</span> ";
return hocr;
}
private int GetConfidence(Word word)
{
return 100;
}
/// <summary>
/// Get the hORC string for the bounding box.
/// <para>See http://kba.cloud/hocr-spec/1.2/#propdef-bbox</para>
/// </summary>
/// <param name="rectangle"></param>
/// <param name="pageHeight"></param>
private string GetCode(PdfRectangle rectangle, decimal pageHeight)
{
// the values are with reference to the the top-left
// corner of the document image and measured in pixels
var left = (int)Math.Round(rectangle.Left * scale);
var top = (int)Math.Round((pageHeight - rectangle.Top) * scale);
var right = (int)Math.Round(rectangle.Right * scale);
var bottom = (int)Math.Round((pageHeight - rectangle.Bottom) * scale);
return @"bbox " + (left > 0 ? left : 0) + " "
+ (top > 0 ? top : 0) + " "
+ (right > 0 ? right : 0) + " "
+ (bottom > 0 ? bottom : 0);
}
}
}

View File

@@ -0,0 +1,17 @@
using UglyToad.PdfPig.Content;
namespace UglyToad.PdfPig.Export
{
/// <summary>
/// Exports the page's text into the desired format.
/// </summary>
public interface ITextExporter
{
/// <summary>
/// Get the text representation.
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
string Get(Page page);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -33,7 +33,7 @@
/// <summary>
/// Centroid point of the rectangle.
/// </summary>
public PdfPoint Centroid => new PdfPoint(TopLeft.X + (TopLeft.X - TopLeft.X) / 2, BottomLeft.Y + (TopLeft.Y - BottomLeft.Y) / 2);
public PdfPoint Centroid => new PdfPoint(Left + (Right - Left) / 2, Bottom + (Top - Bottom) / 2);
/// <summary>
/// Width of the rectangle.