mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 19:54:52 +08:00
Adding images regions
This commit is contained in:
@@ -29,6 +29,7 @@ namespace UglyToad.PdfPig.Export
|
||||
int pageCount = 0;
|
||||
int pageSpaceCount = 0;
|
||||
int graphicalElementCount = 0;
|
||||
int illustrationCount = 0;
|
||||
int textBlockCount = 0;
|
||||
int textLineCount = 0;
|
||||
int stringCount = 0;
|
||||
@@ -128,7 +129,6 @@ namespace UglyToad.PdfPig.Export
|
||||
pageCount = page.Number;
|
||||
pageSpaceCount++;
|
||||
|
||||
var words = page.GetWords(wordExtractor);
|
||||
var altoPage = new AltoDocument.AltoPage()
|
||||
{
|
||||
Height = (float)Math.Round(page.Height * scale),
|
||||
@@ -164,12 +164,19 @@ namespace UglyToad.PdfPig.Export
|
||||
Id = "P" + pageCount
|
||||
};
|
||||
|
||||
var words = page.GetWords(wordExtractor);
|
||||
if (words.Count() > 0)
|
||||
{
|
||||
var blocks = pageSegmenter.GetBlocks(words);
|
||||
altoPage.PrintSpace.TextBlock = blocks.Select(b => ToAltoTextBlock(b, page.Height)).ToArray();
|
||||
}
|
||||
|
||||
var images = page.GetImages();
|
||||
if (images.Count() > 0)
|
||||
{
|
||||
altoPage.PrintSpace.Illustrations = images.Select(i => ToAltoIllustration(i, page.Height)).ToArray();
|
||||
}
|
||||
|
||||
if (includePaths)
|
||||
{
|
||||
var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToAltoGraphicalElement(p, page.Height));
|
||||
@@ -213,6 +220,24 @@ namespace UglyToad.PdfPig.Export
|
||||
return null;
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the ALTO illustration entry for an image found on the current page.
/// <para>Increments the running illustration counter, which is used to build the element id.</para>
/// </summary>
/// <param name="pdfImage">The image whose bounds are exported.</param>
/// <param name="height">The page height; the vertical position is computed as this value minus the top of the image bounds.</param>
/// <returns>An illustration with scaled, rounded position and size, and an id of the form "P{page}_I{counter}".</returns>
private AltoDocument.AltoIllustration ToAltoIllustration(IPdfImage pdfImage, decimal height)
{
    illustrationCount++;
    var bounds = pdfImage.Bounds;

    var illustration = new AltoDocument.AltoIllustration();

    // Position: vertical offset is measured from the page height down to the top of the bounds.
    illustration.VPos = (float)Math.Round((height - bounds.Top) * scale);
    illustration.HPos = (float)Math.Round(bounds.Left * scale);

    // Size.
    illustration.Height = (float)Math.Round(bounds.Height * scale);
    illustration.Width = (float)Math.Round(bounds.Width * scale);

    illustration.FileId = "";
    illustration.Rotation = 0;
    // IdNext (reading order) is intentionally left unset.

    var sequence = illustrationCount.ToString("#00000");
    illustration.Id = $"P{pageCount}_I{sequence}";

    return illustration;
}
|
||||
|
||||
/// <summary>
|
||||
///
|
||||
/// </summary>
|
||||
@@ -559,9 +584,11 @@ namespace UglyToad.PdfPig.Export
|
||||
}
|
||||
}
|
||||
|
||||
/// <remarks/>
|
||||
/// <summary>
|
||||
/// Element deprecated. 'Processing' should be used instead.
|
||||
/// </summary>
|
||||
[XmlElementAttribute("OCRProcessing")]
|
||||
[Obsolete("Element deprecated. 'Processing' should be used instead.")]
|
||||
//[Obsolete("Element deprecated. 'Processing' should be used instead.")]
|
||||
public AltoDescriptionOcrProcessing[] OCRProcessing
|
||||
{
|
||||
get
|
||||
@@ -2466,7 +2493,7 @@ namespace UglyToad.PdfPig.Export
|
||||
/// Attribute deprecated. LANG should be used instead.
|
||||
/// </summary>
|
||||
[XmlAttributeAttribute("language", DataType = "language")]
|
||||
[Obsolete("Attribute deprecated. LANG should be used instead.")]
|
||||
//[Obsolete("Attribute deprecated. LANG should be used instead.")]
|
||||
public string Language
|
||||
{
|
||||
get
|
||||
@@ -4785,6 +4812,7 @@ namespace UglyToad.PdfPig.Export
|
||||
|
||||
/// <summary>
|
||||
/// [Alto] Ocr Processing
|
||||
/// <para>Element deprecated. 'AltoProcessing' should be used instead.</para>
|
||||
/// </summary>
|
||||
[EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
|
||||
[GeneratedCodeAttribute("xsd", "4.6.1055.0")]
|
||||
@@ -4792,7 +4820,7 @@ namespace UglyToad.PdfPig.Export
|
||||
[DebuggerStepThroughAttribute()]
|
||||
[DesignerCategoryAttribute("code")]
|
||||
[XmlTypeAttribute(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
|
||||
//[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
|
||||
public class AltoOcrProcessing
|
||||
{
|
||||
|
||||
@@ -4897,6 +4925,7 @@ namespace UglyToad.PdfPig.Export
|
||||
|
||||
/// <summary>
|
||||
/// [Alto] Description Ocr Processing
|
||||
/// <para>Element deprecated. 'AltoProcessing' should be used instead.</para>
|
||||
/// </summary>
|
||||
[EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
|
||||
[GeneratedCodeAttribute("xsd", "4.6.1055.0")]
|
||||
@@ -4904,7 +4933,7 @@ namespace UglyToad.PdfPig.Export
|
||||
[DebuggerStepThroughAttribute()]
|
||||
[DesignerCategoryAttribute("code")]
|
||||
[XmlTypeAttribute(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
|
||||
//[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
|
||||
public class AltoDescriptionOcrProcessing : AltoOcrProcessing
|
||||
{
|
||||
|
||||
|
@@ -28,6 +28,7 @@ namespace UglyToad.PdfPig.Export
|
||||
private int wordCount = 0;
|
||||
private int pathCount = 0;
|
||||
private int paraCount = 0;
|
||||
private int imageCount = 0;
|
||||
|
||||
/// <summary>
|
||||
/// hOCR v1.2 (HTML)
|
||||
@@ -152,6 +153,11 @@ namespace UglyToad.PdfPig.Export
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var image in page.GetImages())
|
||||
{
|
||||
hocr += "\n" + GetCode(image, page.Height, level + 1);
|
||||
}
|
||||
|
||||
var words = page.GetWords(wordExtractor);
|
||||
|
||||
if (words.Count() > 0)
|
||||
@@ -212,12 +218,19 @@ namespace UglyToad.PdfPig.Export
|
||||
hocr += GetIndent(level) + @"<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
|
||||
+ pathCount + "' title='" + GetCode(bbox.Value, pageHeight) + "' />";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return hocr;
|
||||
}
|
||||
|
||||
/// <summary>
/// Get the hOCR string for an image.
/// <para>Increments the running image counter, which is used to build the element id.</para>
/// </summary>
/// <param name="pdfImage">The image whose bounds are written to the title attribute.</param>
/// <param name="pageHeight">The page height, forwarded to the bounding box formatter.</param>
/// <param name="level">The indent level.</param>
/// <returns>An indented, self-closed 'ocr_image' span with id "image_{page}_{counter}".</returns>
private string GetCode(IPdfImage pdfImage, decimal pageHeight, int level)
{
    imageCount++;
    var imageBounds = pdfImage.Bounds;

    // Pieces are computed in the same left-to-right order in which they appear in the markup.
    var indent = GetIndent(level);
    var elementId = "image_" + pageCount + "_" + imageCount;
    var title = GetCode(imageBounds, pageHeight);

    return $"{indent}<span class='ocr_image' id='{elementId}' title='{title}' />";
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the hOCR string for the area.
|
||||
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea</para>
|
||||
|
@@ -186,15 +186,21 @@ namespace UglyToad.PdfPig.Export
|
||||
//}
|
||||
};
|
||||
|
||||
var words = page.GetWords(wordExtractor);
|
||||
var regions = new List<PageXmlDocument.PageXmlRegion>();
|
||||
|
||||
var words = page.GetWords(wordExtractor);
|
||||
if (words.Count() > 0)
|
||||
{
|
||||
var blocks = pageSegmenter.GetBlocks(words);
|
||||
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Height)));
|
||||
}
|
||||
|
||||
var images = page.GetImages();
|
||||
if (images.Count() > 0)
|
||||
{
|
||||
regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Height)));
|
||||
}
|
||||
|
||||
if (includePaths)
|
||||
{
|
||||
var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Height));
|
||||
@@ -223,6 +229,17 @@ namespace UglyToad.PdfPig.Export
|
||||
return null;
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the PAGE XML image region for an image found on the current page.
/// <para>Increments the shared region counter, which is used to build the region id.</para>
/// </summary>
/// <param name="pdfImage">The image whose bounds become the region coordinates.</param>
/// <param name="height">The page height, forwarded to the coordinate conversion.</param>
/// <returns>An image region with its coordinates set and an id of the form "r{counter}".</returns>
private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, decimal height)
{
    regionCount++;
    var imageBounds = pdfImage.Bounds;

    var region = new PageXmlDocument.PageXmlImageRegion();
    region.Coords = ToCoords(imageBounds, height);
    region.Id = "r" + regionCount;

    return region;
}
|
||||
|
||||
/// <summary>
|
||||
///
|
||||
/// </summary>
|
||||
|
@@ -211,6 +211,10 @@ namespace UglyToad.PdfPig.Geometry
|
||||
commands.Add(new Close());
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The rectangle completely containing the path.
|
||||
/// </summary>
|
||||
/// <returns></returns>
|
||||
public PdfRectangle? GetBoundingRectangle()
|
||||
{
|
||||
if (commands.Count == 0)
|
||||
|
Reference in New Issue
Block a user