mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 03:34:52 +08:00
Add reading order in PageXml export
This commit is contained in:
@@ -11,6 +11,7 @@
|
|||||||
using System.Xml;
|
using System.Xml;
|
||||||
using System.Xml.Serialization;
|
using System.Xml.Serialization;
|
||||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
|
||||||
using Util;
|
using Util;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -21,6 +22,7 @@
|
|||||||
{
|
{
|
||||||
private readonly IPageSegmenter pageSegmenter;
|
private readonly IPageSegmenter pageSegmenter;
|
||||||
private readonly IWordExtractor wordExtractor;
|
private readonly IWordExtractor wordExtractor;
|
||||||
|
private readonly IReadingOrderDetector readingOrderDetector;
|
||||||
|
|
||||||
private readonly double scale;
|
private readonly double scale;
|
||||||
private readonly string indentChar;
|
private readonly string indentChar;
|
||||||
@@ -29,6 +31,9 @@
|
|||||||
private int wordCount;
|
private int wordCount;
|
||||||
private int glyphCount;
|
private int glyphCount;
|
||||||
private int regionCount;
|
private int regionCount;
|
||||||
|
private int groupOrderCount;
|
||||||
|
|
||||||
|
private List<PageXmlDocument.PageXmlRegionRefIndexed> orderedRegions;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// PAGE-XML 2019-07-15 (XML) text exporter.
|
/// PAGE-XML 2019-07-15 (XML) text exporter.
|
||||||
@@ -36,14 +41,16 @@
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="wordExtractor"></param>
|
/// <param name="wordExtractor"></param>
|
||||||
/// <param name="pageSegmenter"></param>
|
/// <param name="pageSegmenter"></param>
|
||||||
|
/// <param name="readingOrderDetector"></param>
|
||||||
/// <param name="scale"></param>
|
/// <param name="scale"></param>
|
||||||
/// <param name="indent">Indent character.</param>
|
/// <param name="indent">Indent character.</param>
|
||||||
public PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1.0, string indent = "\t")
|
public PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, IReadingOrderDetector readingOrderDetector = null, double scale = 1.0, string indent = "\t")
|
||||||
{
|
{
|
||||||
this.wordExtractor = wordExtractor;
|
this.wordExtractor = wordExtractor;
|
||||||
this.pageSegmenter = pageSegmenter;
|
this.pageSegmenter = pageSegmenter;
|
||||||
|
this.readingOrderDetector = readingOrderDetector;
|
||||||
this.scale = scale;
|
this.scale = scale;
|
||||||
indentChar = indent;
|
this.indentChar = indent;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -72,6 +79,13 @@
|
|||||||
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
|
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
|
||||||
public string Get(Page page, bool includePaths)
|
public string Get(Page page, bool includePaths)
|
||||||
{
|
{
|
||||||
|
lineCount = 0;
|
||||||
|
wordCount = 0;
|
||||||
|
glyphCount = 0;
|
||||||
|
regionCount = 0;
|
||||||
|
groupOrderCount = 0;
|
||||||
|
orderedRegions = new List<PageXmlDocument.PageXmlRegionRefIndexed>();
|
||||||
|
|
||||||
PageXmlDocument pageXmlDocument = new PageXmlDocument()
|
PageXmlDocument pageXmlDocument = new PageXmlDocument()
|
||||||
{
|
{
|
||||||
Metadata = new PageXmlDocument.PageXmlMetadata()
|
Metadata = new PageXmlDocument.PageXmlMetadata()
|
||||||
@@ -145,7 +159,25 @@
|
|||||||
if (words.Count > 0)
|
if (words.Count > 0)
|
||||||
{
|
{
|
||||||
var blocks = pageSegmenter.GetBlocks(words);
|
var blocks = pageSegmenter.GetBlocks(words);
|
||||||
|
|
||||||
|
if (readingOrderDetector != null)
|
||||||
|
{
|
||||||
|
blocks = readingOrderDetector.Get(blocks).ToList();
|
||||||
|
}
|
||||||
|
|
||||||
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Height)));
|
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Height)));
|
||||||
|
|
||||||
|
if (orderedRegions.Any())
|
||||||
|
{
|
||||||
|
pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder()
|
||||||
|
{
|
||||||
|
Item = new PageXmlDocument.PageXmlOrderedGroup()
|
||||||
|
{
|
||||||
|
Items = orderedRegions.ToArray(),
|
||||||
|
Id = "g" + groupOrderCount++
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var images = page.GetImages().ToList();
|
var images = page.GetImages().ToList();
|
||||||
@@ -196,13 +228,24 @@
|
|||||||
private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double height)
|
private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double height)
|
||||||
{
|
{
|
||||||
regionCount++;
|
regionCount++;
|
||||||
|
string regionId = "r" + regionCount;
|
||||||
|
|
||||||
|
if (readingOrderDetector != null && textBlock.ReadingOrder > -1)
|
||||||
|
{
|
||||||
|
orderedRegions.Add(new PageXmlDocument.PageXmlRegionRefIndexed()
|
||||||
|
{
|
||||||
|
RegionRef = regionId,
|
||||||
|
Index = textBlock.ReadingOrder
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
return new PageXmlDocument.PageXmlTextRegion()
|
return new PageXmlDocument.PageXmlTextRegion()
|
||||||
{
|
{
|
||||||
Coords = ToCoords(textBlock.BoundingBox, height),
|
Coords = ToCoords(textBlock.BoundingBox, height),
|
||||||
Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph,
|
Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph,
|
||||||
TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, height)).ToArray(),
|
TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, height)).ToArray(),
|
||||||
TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textBlock.Text } },
|
TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textBlock.Text } },
|
||||||
Id = "r" + regionCount
|
Id = regionId
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user