diff --git a/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs b/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs
index b377841f..9897d5ba 100644
--- a/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs
+++ b/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs
@@ -29,6 +29,7 @@ namespace UglyToad.PdfPig.Export
int pageCount = 0;
int pageSpaceCount = 0;
int graphicalElementCount = 0;
+ int illustrationCount = 0;
int textBlockCount = 0;
int textLineCount = 0;
int stringCount = 0;
@@ -128,7 +129,6 @@ namespace UglyToad.PdfPig.Export
pageCount = page.Number;
pageSpaceCount++;
- var words = page.GetWords(wordExtractor);
var altoPage = new AltoDocument.AltoPage()
{
Height = (float)Math.Round(page.Height * scale),
@@ -164,10 +164,17 @@ namespace UglyToad.PdfPig.Export
Id = "P" + pageCount
};
+ var words = page.GetWords(wordExtractor);
if (words.Count() > 0)
{
var blocks = pageSegmenter.GetBlocks(words);
- altoPage.PrintSpace.TextBlock = blocks.Select(b => ToAltoTextBlock(b, page.Height)).ToArray();
+ altoPage.PrintSpace.TextBlock = blocks.Select(b => ToAltoTextBlock(b, page.Height)).ToArray();
+ }
+
+ var images = page.GetImages();
+ if (images.Count() > 0)
+ {
+ altoPage.PrintSpace.Illustrations = images.Select(i => ToAltoIllustration(i, page.Height)).ToArray();
}
if (includePaths)
@@ -213,6 +220,24 @@ namespace UglyToad.PdfPig.Export
return null;
}
+ /// <summary>Creates the ALTO illustration for a PDF image: scaled bounds plus a page-prefixed, counter-based id.</summary>
+ private AltoDocument.AltoIllustration ToAltoIllustration(IPdfImage pdfImage, decimal height)
+ {
+     illustrationCount++;
+     var bounds = pdfImage.Bounds;
+     var illustration = new AltoDocument.AltoIllustration();
+     // VPos is derived from the supplied height minus the top edge of the image bounds.
+     illustration.VPos = (float)Math.Round((height - bounds.Top) * scale);
+     illustration.HPos = (float)Math.Round(bounds.Left * scale);
+     illustration.Height = (float)Math.Round(bounds.Height * scale);
+     illustration.Width = (float)Math.Round(bounds.Width * scale);
+     illustration.FileId = "";
+     illustration.Rotation = 0;
+     // illustration.IdNext = "NA"; // for reading order
+     illustration.Id = "P" + pageCount + "_I" + illustrationCount.ToString("#00000");
+     return illustration;
+ }
+
///
///
///
@@ -559,9 +584,11 @@ namespace UglyToad.PdfPig.Export
}
}
- ///
+ /// <summary>
+ /// Element deprecated. 'Processing' should be used instead.
+ /// </summary>
[XmlElementAttribute("OCRProcessing")]
- [Obsolete("Element deprecated. 'Processing' should be used instead.")]
+ //[Obsolete("Element deprecated. 'Processing' should be used instead.")]
public AltoDescriptionOcrProcessing[] OCRProcessing
{
get
@@ -2466,7 +2493,7 @@ namespace UglyToad.PdfPig.Export
/// Attribute deprecated. LANG should be used instead.
///
[XmlAttributeAttribute("language", DataType = "language")]
- [Obsolete("Attribute deprecated. LANG should be used instead.")]
+ //[Obsolete("Attribute deprecated. LANG should be used instead.")]
public string Language
{
get
@@ -4785,6 +4812,7 @@ namespace UglyToad.PdfPig.Export
///
/// [Alto] Ocr Processing
+ /// Element deprecated. 'AltoProcessing' should be used instead.
///
[EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
[GeneratedCodeAttribute("xsd", "4.6.1055.0")]
@@ -4792,7 +4820,7 @@ namespace UglyToad.PdfPig.Export
[DebuggerStepThroughAttribute()]
[DesignerCategoryAttribute("code")]
[XmlTypeAttribute(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
- [Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
+ //[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
public class AltoOcrProcessing
{
@@ -4897,6 +4925,7 @@ namespace UglyToad.PdfPig.Export
///
/// [Alto] Description Ocr Processing
+ /// Element deprecated. 'AltoProcessing' should be used instead.
///
[EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
[GeneratedCodeAttribute("xsd", "4.6.1055.0")]
@@ -4904,7 +4933,7 @@ namespace UglyToad.PdfPig.Export
[DebuggerStepThroughAttribute()]
[DesignerCategoryAttribute("code")]
[XmlTypeAttribute(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
- [Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
+ //[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
public class AltoDescriptionOcrProcessing : AltoOcrProcessing
{
diff --git a/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs b/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs
index 41d9f0eb..d94b27e1 100644
--- a/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs
+++ b/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs
@@ -28,6 +28,7 @@ namespace UglyToad.PdfPig.Export
private int wordCount = 0;
private int pathCount = 0;
private int paraCount = 0;
+ private int imageCount = 0;
///
/// hOCR v1.2 (HTML)
@@ -152,6 +153,11 @@ namespace UglyToad.PdfPig.Export
}
}
+ foreach (var image in page.GetImages())
+ {
+ hocr += "\n" + GetCode(image, page.Height, level + 1);
+ }
+
var words = page.GetWords(wordExtractor);
if (words.Count() > 0)
@@ -212,12 +218,19 @@ namespace UglyToad.PdfPig.Export
hocr += GetIndent(level) + @"";
}
-
}
return hocr;
}
+ /// <summary>Counts one more image and returns its hOCR line: the indent for <paramref name="level"/> followed by the markup literal.</summary>
+ private string GetCode(IPdfImage pdfImage, decimal pageHeight, int level)
+ {
+     imageCount += 1;
+     var imageBounds = pdfImage.Bounds; // NOTE(review): unused below, as is pageHeight - confirm the markup literal is complete.
+     return string.Concat(GetIndent(level), @"");
+ }
///
/// Get the hORC string for the area.
/// http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea
diff --git a/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs b/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs
index a3d72b50..f1853965 100644
--- a/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs
+++ b/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs
@@ -186,15 +186,21 @@ namespace UglyToad.PdfPig.Export
//}
};
- var words = page.GetWords(wordExtractor);
var regions = new List();
+ var words = page.GetWords(wordExtractor);
if (words.Count() > 0)
{
var blocks = pageSegmenter.GetBlocks(words);
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Height)));
}
+ var images = page.GetImages();
+ if (images.Count() > 0)
+ {
+ regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Height)));
+ }
+
if (includePaths)
{
var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Height));
@@ -223,6 +229,17 @@ namespace UglyToad.PdfPig.Export
return null;
}
+ /// <summary>Wraps a PDF image in a PAGE XML image region, using the next region number as its id.</summary>
+ private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, decimal height)
+ {
+     regionCount++;
+     var bounds = pdfImage.Bounds;
+     var imageRegion = new PageXmlDocument.PageXmlImageRegion();
+     imageRegion.Coords = ToCoords(bounds, height);
+     imageRegion.Id = "r" + regionCount;
+     return imageRegion;
+ }
+
///
///
///
diff --git a/src/UglyToad.PdfPig/Geometry/PdfPath.cs b/src/UglyToad.PdfPig/Geometry/PdfPath.cs
index a741d77b..d1a2d30e 100644
--- a/src/UglyToad.PdfPig/Geometry/PdfPath.cs
+++ b/src/UglyToad.PdfPig/Geometry/PdfPath.cs
@@ -211,6 +211,10 @@ namespace UglyToad.PdfPig.Geometry
commands.Add(new Close());
}
+ /// <summary>
+ /// The rectangle completely containing the path.
+ /// </summary>
+ /// <returns>The bounding rectangle of the path, if one is available.</returns>
public PdfRectangle? GetBoundingRectangle()
{
if (commands.Count == 0)