diff --git a/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs b/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs index b377841f..9897d5ba 100644 --- a/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs +++ b/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs @@ -29,6 +29,7 @@ namespace UglyToad.PdfPig.Export int pageCount = 0; int pageSpaceCount = 0; int graphicalElementCount = 0; + int illustrationCount = 0; int textBlockCount = 0; int textLineCount = 0; int stringCount = 0; @@ -128,7 +129,6 @@ namespace UglyToad.PdfPig.Export pageCount = page.Number; pageSpaceCount++; - var words = page.GetWords(wordExtractor); var altoPage = new AltoDocument.AltoPage() { Height = (float)Math.Round(page.Height * scale), @@ -164,10 +164,17 @@ namespace UglyToad.PdfPig.Export Id = "P" + pageCount }; + var words = page.GetWords(wordExtractor); if (words.Count() > 0) { var blocks = pageSegmenter.GetBlocks(words); - altoPage.PrintSpace.TextBlock = blocks.Select(b => ToAltoTextBlock(b, page.Height)).ToArray(); + altoPage.PrintSpace.TextBlock = blocks.Select(b => ToAltoTextBlock(b, page.Height)).ToArray(); + } + + var images = page.GetImages(); + if (images.Count() > 0) + { + altoPage.PrintSpace.Illustrations = images.Select(i => ToAltoIllustration(i, page.Height)).ToArray(); } if (includePaths) @@ -213,6 +220,24 @@ namespace UglyToad.PdfPig.Export return null; } + private AltoDocument.AltoIllustration ToAltoIllustration(IPdfImage pdfImage, decimal height) + { + illustrationCount++; + var rectangle = pdfImage.Bounds; + + return new AltoDocument.AltoIllustration() + { + VPos = (float)Math.Round((height - rectangle.Top) * scale), + HPos = (float)Math.Round(rectangle.Left * scale), + Height = (float)Math.Round(rectangle.Height * scale), + Width = (float)Math.Round(rectangle.Width * scale), + FileId = "", + Rotation = 0, + //IdNext = "NA", // for reading order + Id = "P" + pageCount + "_I" + illustrationCount.ToString("#00000") + }; + } + /// /// /// @@ -559,9 +584,11 @@ namespace UglyToad.PdfPig.Export } } - /// + /// + /// Element deprecated. 'Processing' should be used instead. + /// [XmlElementAttribute("OCRProcessing")] - [Obsolete("Element deprecated. 'Processing' should be used instead.")] + //[Obsolete("Element deprecated. 'Processing' should be used instead.")] public AltoDescriptionOcrProcessing[] OCRProcessing { get @@ -2466,7 +2493,7 @@ namespace UglyToad.PdfPig.Export /// Attribute deprecated. LANG should be used instead. /// [XmlAttributeAttribute("language", DataType = "language")] - [Obsolete("Attribute deprecated. LANG should be used instead.")] + //[Obsolete("Attribute deprecated. LANG should be used instead.")] public string Language { get @@ -4785,6 +4812,7 @@ namespace UglyToad.PdfPig.Export /// /// [Alto] Ocr Processing + /// Element deprecated. 'AltoProcessing' should be used instead. /// [EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)] [GeneratedCodeAttribute("xsd", "4.6.1055.0")] @@ -4792,7 +4820,7 @@ namespace UglyToad.PdfPig.Export [DebuggerStepThroughAttribute()] [DesignerCategoryAttribute("code")] [XmlTypeAttribute(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")] - [Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")] + //[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")] public class AltoOcrProcessing { @@ -4897,6 +4925,7 @@ namespace UglyToad.PdfPig.Export /// /// [Alto] Description Ocr Processing + /// Element deprecated. 'AltoProcessing' should be used instead. /// [EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)] [GeneratedCodeAttribute("xsd", "4.6.1055.0")] @@ -4904,7 +4933,7 @@ namespace UglyToad.PdfPig.Export [DebuggerStepThroughAttribute()] [DesignerCategoryAttribute("code")] [XmlTypeAttribute(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")] - [Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")] + //[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")] public class AltoDescriptionOcrProcessing : AltoOcrProcessing { diff --git a/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs b/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs index 41d9f0eb..d94b27e1 100644 --- a/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs +++ b/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs @@ -28,6 +28,7 @@ namespace UglyToad.PdfPig.Export private int wordCount = 0; private int pathCount = 0; private int paraCount = 0; + private int imageCount = 0; /// /// hOCR v1.2 (HTML) @@ -152,6 +153,11 @@ namespace UglyToad.PdfPig.Export } } + foreach (var image in page.GetImages()) + { + hocr += "\n" + GetCode(image, page.Height, level + 1); + } + var words = page.GetWords(wordExtractor); if (words.Count() > 0) @@ -212,12 +218,19 @@ namespace UglyToad.PdfPig.Export hocr += GetIndent(level) + @""; } - } return hocr; } + private string GetCode(IPdfImage pdfImage, decimal pageHeight, int level) + { + imageCount++; + var bbox = pdfImage.Bounds; + return GetIndent(level) + @""; + } + /// /// Get the hORC string for the area. /// http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea diff --git a/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs b/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs index a3d72b50..f1853965 100644 --- a/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs +++ b/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs @@ -186,15 +186,21 @@ namespace UglyToad.PdfPig.Export //} }; - var words = page.GetWords(wordExtractor); var regions = new List(); + var words = page.GetWords(wordExtractor); if (words.Count() > 0) { var blocks = pageSegmenter.GetBlocks(words); regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Height))); } + var images = page.GetImages(); + if (images.Count() > 0) + { + regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Height))); + } + if (includePaths) { var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Height)); @@ -223,6 +229,17 @@ namespace UglyToad.PdfPig.Export return null; } + private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, decimal height) + { + regionCount++; + var bbox = pdfImage.Bounds; + return new PageXmlDocument.PageXmlImageRegion() + { + Coords = ToCoords(bbox, height), + Id = "r" + regionCount + }; + } + /// /// /// diff --git a/src/UglyToad.PdfPig/Geometry/PdfPath.cs b/src/UglyToad.PdfPig/Geometry/PdfPath.cs index a741d77b..d1a2d30e 100644 --- a/src/UglyToad.PdfPig/Geometry/PdfPath.cs +++ b/src/UglyToad.PdfPig/Geometry/PdfPath.cs @@ -211,6 +211,10 @@ namespace UglyToad.PdfPig.Geometry commands.Add(new Close()); } + /// + /// The rectangle completely containing the path. + /// + /// public PdfRectangle? GetBoundingRectangle() { if (commands.Count == 0)