Add reading order in PageXml export

This commit is contained in:
BobLd
2020-01-12 11:06:04 +00:00
committed by Eliot Jones
parent e7417be75a
commit e8216b29c5

View File

@@ -11,6 +11,7 @@
using System.Xml; using System.Xml;
using System.Xml.Serialization; using System.Xml.Serialization;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
using Util; using Util;
/// <summary> /// <summary>
@@ -21,6 +22,7 @@
{ {
private readonly IPageSegmenter pageSegmenter; private readonly IPageSegmenter pageSegmenter;
private readonly IWordExtractor wordExtractor; private readonly IWordExtractor wordExtractor;
private readonly IReadingOrderDetector readingOrderDetector;
private readonly double scale; private readonly double scale;
private readonly string indentChar; private readonly string indentChar;
@@ -29,6 +31,9 @@
private int wordCount; private int wordCount;
private int glyphCount; private int glyphCount;
private int regionCount; private int regionCount;
private int groupOrderCount;
private List<PageXmlDocument.PageXmlRegionRefIndexed> orderedRegions;
/// <summary> /// <summary>
/// PAGE-XML 2019-07-15 (XML) text exporter. /// PAGE-XML 2019-07-15 (XML) text exporter.
@@ -36,14 +41,16 @@
/// </summary> /// </summary>
/// <param name="wordExtractor"></param> /// <param name="wordExtractor"></param>
/// <param name="pageSegmenter"></param> /// <param name="pageSegmenter"></param>
/// <param name="readingOrderDetector"></param>
/// <param name="scale"></param> /// <param name="scale"></param>
/// <param name="indent">Indent character.</param> /// <param name="indent">Indent character.</param>
public PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, IReadingOrderDetector readingOrderDetector = null, double scale = 1.0, string indent = "\t")
{
    // Output settings.
    this.indentChar = indent;
    this.scale = scale;

    // Optional collaborator: when it is left null the detector is never run on the
    // segmented blocks and no reading-order references are collected.
    this.readingOrderDetector = readingOrderDetector;

    // Collaborators used to turn the page's letters into words and text blocks.
    this.pageSegmenter = pageSegmenter;
    this.wordExtractor = wordExtractor;
}
/// <summary> /// <summary>
@@ -72,6 +79,13 @@
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param> /// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
public string Get(Page page, bool includePaths) public string Get(Page page, bool includePaths)
{ {
lineCount = 0;
wordCount = 0;
glyphCount = 0;
regionCount = 0;
groupOrderCount = 0;
orderedRegions = new List<PageXmlDocument.PageXmlRegionRefIndexed>();
PageXmlDocument pageXmlDocument = new PageXmlDocument() PageXmlDocument pageXmlDocument = new PageXmlDocument()
{ {
Metadata = new PageXmlDocument.PageXmlMetadata() Metadata = new PageXmlDocument.PageXmlMetadata()
@@ -145,7 +159,25 @@
if (words.Count > 0) if (words.Count > 0)
{ {
var blocks = pageSegmenter.GetBlocks(words); var blocks = pageSegmenter.GetBlocks(words);
if (readingOrderDetector != null)
{
blocks = readingOrderDetector.Get(blocks).ToList();
}
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Height))); regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Height)));
if (orderedRegions.Any())
{
pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder()
{
Item = new PageXmlDocument.PageXmlOrderedGroup()
{
Items = orderedRegions.ToArray(),
Id = "g" + groupOrderCount++
}
};
}
} }
var images = page.GetImages().ToList(); var images = page.GetImages().ToList();
@@ -196,13 +228,24 @@
/// <summary>
/// Converts a <see cref="TextBlock"/> into a PAGE-XML text region of type paragraph.
/// <para>Side effects: increments the region counter and, when a reading order detector
/// is configured and the block has a reading order greater than -1, appends an indexed
/// reference to the new region to the ordered-regions list.</para>
/// </summary>
/// <param name="textBlock">The text block to convert.</param>
/// <param name="height">Height value forwarded to the coordinate and text line conversions.</param>
private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double height)
{
    // Fix the id up front so the reading-order reference and the region share it.
    string id = "r" + (++regionCount);

    bool hasReadingOrder = readingOrderDetector != null && textBlock.ReadingOrder > -1;
    if (hasReadingOrder)
    {
        var reference = new PageXmlDocument.PageXmlRegionRefIndexed();
        reference.RegionRef = id;
        reference.Index = textBlock.ReadingOrder;
        orderedRegions.Add(reference);
    }

    // Evaluate in the same order as the region's members are assigned below:
    // coordinates first, then the text lines, then the text equivalent.
    var coords = ToCoords(textBlock.BoundingBox, height);
    var lines = textBlock.TextLines.Select(line => ToPageXmlTextLine(line, height)).ToArray();
    var equivalents = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textBlock.Text } };

    return new PageXmlDocument.PageXmlTextRegion()
    {
        Coords = coords,
        Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph,
        TextLines = lines,
        TextEquivs = equivalents,
        Id = id
    };
}