mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 19:54:52 +08:00
Adding images regions
This commit is contained in:
@@ -29,6 +29,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
int pageCount = 0;
|
int pageCount = 0;
|
||||||
int pageSpaceCount = 0;
|
int pageSpaceCount = 0;
|
||||||
int graphicalElementCount = 0;
|
int graphicalElementCount = 0;
|
||||||
|
int illustrationCount = 0;
|
||||||
int textBlockCount = 0;
|
int textBlockCount = 0;
|
||||||
int textLineCount = 0;
|
int textLineCount = 0;
|
||||||
int stringCount = 0;
|
int stringCount = 0;
|
||||||
@@ -128,7 +129,6 @@ namespace UglyToad.PdfPig.Export
|
|||||||
pageCount = page.Number;
|
pageCount = page.Number;
|
||||||
pageSpaceCount++;
|
pageSpaceCount++;
|
||||||
|
|
||||||
var words = page.GetWords(wordExtractor);
|
|
||||||
var altoPage = new AltoDocument.AltoPage()
|
var altoPage = new AltoDocument.AltoPage()
|
||||||
{
|
{
|
||||||
Height = (float)Math.Round(page.Height * scale),
|
Height = (float)Math.Round(page.Height * scale),
|
||||||
@@ -164,10 +164,17 @@ namespace UglyToad.PdfPig.Export
|
|||||||
Id = "P" + pageCount
|
Id = "P" + pageCount
|
||||||
};
|
};
|
||||||
|
|
||||||
|
var words = page.GetWords(wordExtractor);
|
||||||
if (words.Count() > 0)
|
if (words.Count() > 0)
|
||||||
{
|
{
|
||||||
var blocks = pageSegmenter.GetBlocks(words);
|
var blocks = pageSegmenter.GetBlocks(words);
|
||||||
altoPage.PrintSpace.TextBlock = blocks.Select(b => ToAltoTextBlock(b, page.Height)).ToArray();
|
altoPage.PrintSpace.TextBlock = blocks.Select(b => ToAltoTextBlock(b, page.Height)).ToArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
var images = page.GetImages();
|
||||||
|
if (images.Count() > 0)
|
||||||
|
{
|
||||||
|
altoPage.PrintSpace.Illustrations = images.Select(i => ToAltoIllustration(i, page.Height)).ToArray();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (includePaths)
|
if (includePaths)
|
||||||
@@ -213,6 +220,24 @@ namespace UglyToad.PdfPig.Export
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Converts a PDF image into an ALTO illustration element.
/// <para>Increments the running illustration counter, which is used to build the element id.</para>
/// </summary>
/// <param name="pdfImage">The image whose bounds are exported.</param>
/// <param name="height">The height the image's top edge is subtracted from to obtain the vertical position.</param>
/// <returns>The illustration with its scaled, rounded position and size, and an id of the form "P{page}_I{counter}".</returns>
private AltoDocument.AltoIllustration ToAltoIllustration(IPdfImage pdfImage, decimal height)
{
    illustrationCount++;
    var bounds = pdfImage.Bounds;

    var illustration = new AltoDocument.AltoIllustration();

    // Vertical position is measured from 'height' down to the top edge of the image.
    illustration.VPos = (float)Math.Round((height - bounds.Top) * scale);
    illustration.HPos = (float)Math.Round(bounds.Left * scale);
    illustration.Height = (float)Math.Round(bounds.Height * scale);
    illustration.Width = (float)Math.Round(bounds.Width * scale);
    illustration.FileId = "";
    illustration.Rotation = 0;
    // IdNext (reading order) is intentionally left unset.
    illustration.Id = "P" + pageCount + "_I" + illustrationCount.ToString("#00000");

    return illustration;
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
///
|
///
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@@ -559,9 +584,11 @@ namespace UglyToad.PdfPig.Export
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <remarks/>
|
/// <summary>
|
||||||
|
/// Element deprecated. 'Processing' should be used instead.
|
||||||
|
/// </summary>
|
||||||
[XmlElementAttribute("OCRProcessing")]
|
[XmlElementAttribute("OCRProcessing")]
|
||||||
[Obsolete("Element deprecated. 'Processing' should be used instead.")]
|
//[Obsolete("Element deprecated. 'Processing' should be used instead.")]
|
||||||
public AltoDescriptionOcrProcessing[] OCRProcessing
|
public AltoDescriptionOcrProcessing[] OCRProcessing
|
||||||
{
|
{
|
||||||
get
|
get
|
||||||
@@ -2466,7 +2493,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
/// Attribute deprecated. LANG should be used instead.
|
/// Attribute deprecated. LANG should be used instead.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
[XmlAttributeAttribute("language", DataType = "language")]
|
[XmlAttributeAttribute("language", DataType = "language")]
|
||||||
[Obsolete("Attribute deprecated. LANG should be used instead.")]
|
//[Obsolete("Attribute deprecated. LANG should be used instead.")]
|
||||||
public string Language
|
public string Language
|
||||||
{
|
{
|
||||||
get
|
get
|
||||||
@@ -4785,6 +4812,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// [Alto] Ocr Processing
|
/// [Alto] Ocr Processing
|
||||||
|
/// <para>Element deprecated. 'AltoProcessing' should be used instead.</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
[EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
|
[EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
|
||||||
[GeneratedCodeAttribute("xsd", "4.6.1055.0")]
|
[GeneratedCodeAttribute("xsd", "4.6.1055.0")]
|
||||||
@@ -4792,7 +4820,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
[DebuggerStepThroughAttribute()]
|
[DebuggerStepThroughAttribute()]
|
||||||
[DesignerCategoryAttribute("code")]
|
[DesignerCategoryAttribute("code")]
|
||||||
[XmlTypeAttribute(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
[XmlTypeAttribute(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||||
[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
|
//[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
|
||||||
public class AltoOcrProcessing
|
public class AltoOcrProcessing
|
||||||
{
|
{
|
||||||
|
|
||||||
@@ -4897,6 +4925,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// [Alto] Description Ocr Processing
|
/// [Alto] Description Ocr Processing
|
||||||
|
/// <para>Element deprecated. 'AltoProcessing' should be used instead.</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
[EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
|
[EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
|
||||||
[GeneratedCodeAttribute("xsd", "4.6.1055.0")]
|
[GeneratedCodeAttribute("xsd", "4.6.1055.0")]
|
||||||
@@ -4904,7 +4933,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
[DebuggerStepThroughAttribute()]
|
[DebuggerStepThroughAttribute()]
|
||||||
[DesignerCategoryAttribute("code")]
|
[DesignerCategoryAttribute("code")]
|
||||||
[XmlTypeAttribute(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
[XmlTypeAttribute(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||||
[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
|
//[Obsolete("Element deprecated. 'AltoProcessing' should be used instead.")]
|
||||||
public class AltoDescriptionOcrProcessing : AltoOcrProcessing
|
public class AltoDescriptionOcrProcessing : AltoOcrProcessing
|
||||||
{
|
{
|
||||||
|
|
||||||
|
@@ -28,6 +28,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
private int wordCount = 0;
|
private int wordCount = 0;
|
||||||
private int pathCount = 0;
|
private int pathCount = 0;
|
||||||
private int paraCount = 0;
|
private int paraCount = 0;
|
||||||
|
private int imageCount = 0;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// hOCR v1.2 (HTML)
|
/// hOCR v1.2 (HTML)
|
||||||
@@ -152,6 +153,11 @@ namespace UglyToad.PdfPig.Export
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
foreach (var image in page.GetImages())
|
||||||
|
{
|
||||||
|
hocr += "\n" + GetCode(image, page.Height, level + 1);
|
||||||
|
}
|
||||||
|
|
||||||
var words = page.GetWords(wordExtractor);
|
var words = page.GetWords(wordExtractor);
|
||||||
|
|
||||||
if (words.Count() > 0)
|
if (words.Count() > 0)
|
||||||
@@ -212,12 +218,19 @@ namespace UglyToad.PdfPig.Export
|
|||||||
hocr += GetIndent(level) + @"<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
|
hocr += GetIndent(level) + @"<span class='ocr_linedrawing' id='drawing_" + pageCount + "_"
|
||||||
+ pathCount + "' title='" + GetCode(bbox.Value, pageHeight) + "' />";
|
+ pathCount + "' title='" + GetCode(bbox.Value, pageHeight) + "' />";
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return hocr;
|
return hocr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Get the hOCR string for an image, as an 'ocr_image' span.
/// <para>Increments the running image counter, which is used to build the element id.</para>
/// </summary>
/// <param name="pdfImage">The image whose bounds are written to the title attribute.</param>
/// <param name="pageHeight">The page height, passed on to the bounding box formatter.</param>
/// <param name="level">The indentation level of the emitted element.</param>
/// <returns>The indented, self-closing span element.</returns>
private string GetCode(IPdfImage pdfImage, decimal pageHeight, int level)
{
    imageCount++;
    var bounds = pdfImage.Bounds;

    // Indent and id first, then the bounding box text, keeping the original evaluation order.
    var opening = GetIndent(level) + @"<span class='ocr_image' id='image_" + pageCount + "_" + imageCount;
    var title = GetCode(bounds, pageHeight);

    return opening + "' title='" + title + "' />";
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the hOCR string for the area.
|
/// Get the hOCR string for the area.
|
||||||
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea</para>
|
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea</para>
|
||||||
|
@@ -186,15 +186,21 @@ namespace UglyToad.PdfPig.Export
|
|||||||
//}
|
//}
|
||||||
};
|
};
|
||||||
|
|
||||||
var words = page.GetWords(wordExtractor);
|
|
||||||
var regions = new List<PageXmlDocument.PageXmlRegion>();
|
var regions = new List<PageXmlDocument.PageXmlRegion>();
|
||||||
|
|
||||||
|
var words = page.GetWords(wordExtractor);
|
||||||
if (words.Count() > 0)
|
if (words.Count() > 0)
|
||||||
{
|
{
|
||||||
var blocks = pageSegmenter.GetBlocks(words);
|
var blocks = pageSegmenter.GetBlocks(words);
|
||||||
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Height)));
|
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Height)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var images = page.GetImages();
|
||||||
|
if (images.Count() > 0)
|
||||||
|
{
|
||||||
|
regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Height)));
|
||||||
|
}
|
||||||
|
|
||||||
if (includePaths)
|
if (includePaths)
|
||||||
{
|
{
|
||||||
var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Height));
|
var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Height));
|
||||||
@@ -223,6 +229,17 @@ namespace UglyToad.PdfPig.Export
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Converts a PDF image into a PAGE XML image region.
/// <para>Increments the running region counter, which is used to build the region id.</para>
/// </summary>
/// <param name="pdfImage">The image whose bounds become the region coordinates.</param>
/// <param name="height">The height passed on to the coordinate conversion.</param>
/// <returns>The image region with its coordinates and an id of the form "r{counter}".</returns>
private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, decimal height)
{
    regionCount++;
    var bounds = pdfImage.Bounds;

    var region = new PageXmlDocument.PageXmlImageRegion();
    region.Coords = ToCoords(bounds, height);
    region.Id = "r" + regionCount;

    return region;
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
///
|
///
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
@@ -211,6 +211,10 @@ namespace UglyToad.PdfPig.Geometry
|
|||||||
commands.Add(new Close());
|
commands.Add(new Close());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// The rectangle completely containing the path.
|
||||||
|
/// </summary>
|
||||||
|
/// <returns></returns>
|
||||||
public PdfRectangle? GetBoundingRectangle()
|
public PdfRectangle? GetBoundingRectangle()
|
||||||
{
|
{
|
||||||
if (commands.Count == 0)
|
if (commands.Count == 0)
|
||||||
|
Reference in New Issue
Block a user