mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-20 03:17:57 +08:00
Better handling of UTF8 in XmlWriter
This commit is contained in:
@@ -56,7 +56,6 @@ namespace UglyToad.PdfPig.Export
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="document"></param>
|
/// <param name="document"></param>
|
||||||
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
|
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
|
||||||
/// <returns></returns>
|
|
||||||
public string Get(PdfDocument document, bool includePaths = false)
|
public string Get(PdfDocument document, bool includePaths = false)
|
||||||
{
|
{
|
||||||
AltoDocument alto = CreateAltoDocument("unknown");
|
AltoDocument alto = CreateAltoDocument("unknown");
|
||||||
@@ -76,7 +75,6 @@ namespace UglyToad.PdfPig.Export
|
|||||||
/// Get the Alto (XML) string of the page layout. Excludes <see cref="PdfPath"/>s.
|
/// Get the Alto (XML) string of the page layout. Excludes <see cref="PdfPath"/>s.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="page"></param>
|
/// <param name="page"></param>
|
||||||
/// <returns></returns>
|
|
||||||
public string Get(Page page)
|
public string Get(Page page)
|
||||||
{
|
{
|
||||||
return Get(page, false);
|
return Get(page, false);
|
||||||
@@ -87,7 +85,6 @@ namespace UglyToad.PdfPig.Export
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="page"></param>
|
/// <param name="page"></param>
|
||||||
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
|
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
|
||||||
/// <returns></returns>
|
|
||||||
public string Get(Page page, bool includePaths)
|
public string Get(Page page, bool includePaths)
|
||||||
{
|
{
|
||||||
AltoDocument alto = CreateAltoDocument("unknown");
|
AltoDocument alto = CreateAltoDocument("unknown");
|
||||||
@@ -102,7 +99,6 @@ namespace UglyToad.PdfPig.Export
|
|||||||
/// Create an empty <see cref="AltoDocument"/>.
|
/// Create an empty <see cref="AltoDocument"/>.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="fileName"></param>
|
/// <param name="fileName"></param>
|
||||||
/// <returns></returns>
|
|
||||||
private AltoDocument CreateAltoDocument(string fileName)
|
private AltoDocument CreateAltoDocument(string fileName)
|
||||||
{
|
{
|
||||||
return new AltoDocument()
|
return new AltoDocument()
|
||||||
@@ -113,17 +109,9 @@ namespace UglyToad.PdfPig.Export
|
|||||||
},
|
},
|
||||||
Description = GetAltoDescription(fileName),
|
Description = GetAltoDescription(fileName),
|
||||||
SchemaVersion = "4",
|
SchemaVersion = "4",
|
||||||
//Styles = new AltoStyles() { },
|
|
||||||
//Tags = new AltoTags() { }
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="page"></param>
|
|
||||||
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
|
private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
|
||||||
{
|
{
|
||||||
pageCount = page.Number;
|
pageCount = page.Number;
|
||||||
@@ -159,7 +147,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
Illustrations = null, // TBD
|
Illustrations = null, // TBD
|
||||||
ProcessingRefs = null, // TBD
|
ProcessingRefs = null, // TBD
|
||||||
StyleRefs = null, // TBD
|
StyleRefs = null, // TBD
|
||||||
Id = "P" + pageCount + "_PS" + pageSpaceCount.ToString("#00000") //P1_PS00001
|
Id = "P" + pageCount + "_PS" + pageSpaceCount.ToString("#00000")
|
||||||
},
|
},
|
||||||
Id = "P" + pageCount
|
Id = "P" + pageCount
|
||||||
};
|
};
|
||||||
@@ -188,12 +176,6 @@ namespace UglyToad.PdfPig.Export
|
|||||||
return altoPage;
|
return altoPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="pdfPath"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private AltoDocument.AltoGraphicalElement ToAltoGraphicalElement(PdfPath pdfPath, decimal height)
|
private AltoDocument.AltoGraphicalElement ToAltoGraphicalElement(PdfPath pdfPath, decimal height)
|
||||||
{
|
{
|
||||||
graphicalElementCount++;
|
graphicalElementCount++;
|
||||||
@@ -208,12 +190,10 @@ namespace UglyToad.PdfPig.Export
|
|||||||
Height = (float)Math.Round(rectangle.Value.Height * scale),
|
Height = (float)Math.Round(rectangle.Value.Height * scale),
|
||||||
Width = (float)Math.Round(rectangle.Value.Width * scale),
|
Width = (float)Math.Round(rectangle.Value.Width * scale),
|
||||||
Rotation = 0,
|
Rotation = 0,
|
||||||
//Cs = false,
|
|
||||||
StyleRefs = null,
|
StyleRefs = null,
|
||||||
TagRefs = null,
|
TagRefs = null,
|
||||||
title = null,
|
title = null,
|
||||||
type = null,
|
type = null,
|
||||||
//IdNext = "NA", // for reading order
|
|
||||||
Id = "P" + pageCount + "_GE" + graphicalElementCount.ToString("#00000")
|
Id = "P" + pageCount + "_GE" + graphicalElementCount.ToString("#00000")
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -233,17 +213,10 @@ namespace UglyToad.PdfPig.Export
|
|||||||
Width = (float)Math.Round(rectangle.Width * scale),
|
Width = (float)Math.Round(rectangle.Width * scale),
|
||||||
FileId = "",
|
FileId = "",
|
||||||
Rotation = 0,
|
Rotation = 0,
|
||||||
//IdNext = "NA", // for reading order
|
|
||||||
Id = "P" + pageCount + "_I" + illustrationCount.ToString("#00000")
|
Id = "P" + pageCount + "_I" + illustrationCount.ToString("#00000")
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="textBlock"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private AltoDocument.AltoTextBlock ToAltoTextBlock(TextBlock textBlock, decimal height)
|
private AltoDocument.AltoTextBlock ToAltoTextBlock(TextBlock textBlock, decimal height)
|
||||||
{
|
{
|
||||||
textBlockCount++;
|
textBlockCount++;
|
||||||
@@ -254,24 +227,16 @@ namespace UglyToad.PdfPig.Export
|
|||||||
HPos = (float)Math.Round(textBlock.BoundingBox.Left * scale),
|
HPos = (float)Math.Round(textBlock.BoundingBox.Left * scale),
|
||||||
Height = (float)Math.Round(textBlock.BoundingBox.Height * scale),
|
Height = (float)Math.Round(textBlock.BoundingBox.Height * scale),
|
||||||
Width = (float)Math.Round(textBlock.BoundingBox.Width * scale),
|
Width = (float)Math.Round(textBlock.BoundingBox.Width * scale),
|
||||||
Rotation = 0, // check textBlock.TextDirection
|
Rotation = 0,
|
||||||
TextLines = textBlock.TextLines.Select(l => ToAltoTextLine(l, height)).ToArray(),
|
TextLines = textBlock.TextLines.Select(l => ToAltoTextLine(l, height)).ToArray(),
|
||||||
//Cs = false,
|
|
||||||
StyleRefs = null,
|
StyleRefs = null,
|
||||||
TagRefs = null,
|
TagRefs = null,
|
||||||
title = null,
|
title = null,
|
||||||
type = null,
|
type = null,
|
||||||
//IdNext = "NA", // for reading order
|
|
||||||
Id = "P" + pageCount + "_TB" + textBlockCount.ToString("#00000")
|
Id = "P" + pageCount + "_TB" + textBlockCount.ToString("#00000")
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="textLine"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private AltoDocument.AltoTextBlockTextLine ToAltoTextLine(TextLine textLine, decimal height)
|
private AltoDocument.AltoTextBlockTextLine ToAltoTextLine(TextLine textLine, decimal height)
|
||||||
{
|
{
|
||||||
textLineCount++;
|
textLineCount++;
|
||||||
@@ -283,23 +248,15 @@ namespace UglyToad.PdfPig.Export
|
|||||||
HPos = (float)Math.Round(textLine.BoundingBox.Left * scale),
|
HPos = (float)Math.Round(textLine.BoundingBox.Left * scale),
|
||||||
Height = (float)Math.Round(textLine.BoundingBox.Height * scale),
|
Height = (float)Math.Round(textLine.BoundingBox.Height * scale),
|
||||||
Width = (float)Math.Round(textLine.BoundingBox.Width * scale),
|
Width = (float)Math.Round(textLine.BoundingBox.Width * scale),
|
||||||
BaseLine = float.NaN, // TBD
|
BaseLine = float.NaN,
|
||||||
//Hyp = new AltoTextBlockTextLineHyp() { }, // TBD
|
|
||||||
Strings = strings,
|
Strings = strings,
|
||||||
Lang = null,
|
Lang = null,
|
||||||
//Sp = new AltoSP[0], // TBD
|
|
||||||
StyleRefs = null,
|
StyleRefs = null,
|
||||||
TagRefs = null,
|
TagRefs = null,
|
||||||
Id = "P" + pageCount + "_TL" + textLineCount.ToString("#00000")
|
Id = "P" + pageCount + "_TL" + textLineCount.ToString("#00000")
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="word"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private AltoDocument.AltoString ToAltoString(Word word, decimal height)
|
private AltoDocument.AltoString ToAltoString(Word word, decimal height)
|
||||||
{
|
{
|
||||||
stringCount++;
|
stringCount++;
|
||||||
@@ -313,24 +270,15 @@ namespace UglyToad.PdfPig.Export
|
|||||||
Glyph = glyphs,
|
Glyph = glyphs,
|
||||||
Cc = string.Join("", glyphs.Select(g => 9f * (1f - g.Gc))), // from 0->1 to 9->0
|
Cc = string.Join("", glyphs.Select(g => 9f * (1f - g.Gc))), // from 0->1 to 9->0
|
||||||
Content = word.Text,
|
Content = word.Text,
|
||||||
//Cs = false,
|
|
||||||
Lang = null,
|
Lang = null,
|
||||||
//Style = AltoFontStyles.Bold,
|
|
||||||
StyleRefs = null,
|
StyleRefs = null,
|
||||||
SubsContent = null,
|
SubsContent = null,
|
||||||
//SubsType = AltoSubsType.Abbreviation,
|
|
||||||
TagRefs = null,
|
TagRefs = null,
|
||||||
Wc = float.NaN,
|
Wc = float.NaN,
|
||||||
Id = "P" + pageCount + "_ST" + stringCount.ToString("#00000")
|
Id = "P" + pageCount + "_ST" + stringCount.ToString("#00000")
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="letter"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private AltoDocument.AltoGlyph ToAltoGlyph(Letter letter, decimal height)
|
private AltoDocument.AltoGlyph ToAltoGlyph(Letter letter, decimal height)
|
||||||
{
|
{
|
||||||
glyphCount++;
|
glyphCount++;
|
||||||
@@ -346,17 +294,12 @@ namespace UglyToad.PdfPig.Export
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="fileName"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private AltoDocument.AltoDescription GetAltoDescription(string fileName)
|
private AltoDocument.AltoDescription GetAltoDescription(string fileName)
|
||||||
{
|
{
|
||||||
var processing = new AltoDocument.AltoDescriptionProcessing()
|
var processing = new AltoDocument.AltoDescriptionProcessing()
|
||||||
{
|
{
|
||||||
ProcessingAgency = null,
|
ProcessingAgency = null,
|
||||||
ProcessingCategory = AltoDocument.AltoProcessingCategory.Other, // TBD
|
ProcessingCategory = AltoDocument.AltoProcessingCategory.Other,
|
||||||
ProcessingDateTime = DateTime.UtcNow.ToString(),
|
ProcessingDateTime = DateTime.UtcNow.ToString(),
|
||||||
ProcessingSoftware = new AltoDocument.AltoProcessingSoftware()
|
ProcessingSoftware = new AltoDocument.AltoProcessingSoftware()
|
||||||
{
|
{
|
||||||
@@ -384,7 +327,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
|
|
||||||
return new AltoDocument.AltoDescription()
|
return new AltoDocument.AltoDescription()
|
||||||
{
|
{
|
||||||
MeasurementUnit = AltoDocument.AltoMeasurementUnit.Pixel, // need to check that
|
MeasurementUnit = AltoDocument.AltoMeasurementUnit.Pixel,
|
||||||
Processings = new[] { processing },
|
Processings = new[] { processing },
|
||||||
SourceImageInformation = new AltoDocument.AltoSourceImageInformation()
|
SourceImageInformation = new AltoDocument.AltoSourceImageInformation()
|
||||||
{
|
{
|
||||||
@@ -400,18 +343,16 @@ namespace UglyToad.PdfPig.Export
|
|||||||
XmlSerializer serializer = new XmlSerializer(typeof(AltoDocument));
|
XmlSerializer serializer = new XmlSerializer(typeof(AltoDocument));
|
||||||
var settings = new XmlWriterSettings()
|
var settings = new XmlWriterSettings()
|
||||||
{
|
{
|
||||||
//Encoding = new System.Text.UTF8Encoding(true),
|
Encoding = System.Text.Encoding.UTF8,
|
||||||
Indent = true,
|
Indent = true,
|
||||||
IndentChars = indentChar,
|
IndentChars = indentChar,
|
||||||
OmitXmlDeclaration = true // hack to manually handle utf-8
|
|
||||||
};
|
};
|
||||||
|
|
||||||
using (var stringWriter = new System.IO.StringWriter())
|
using (var memoryStream = new System.IO.MemoryStream())
|
||||||
using (var xmlWriter = XmlWriter.Create(stringWriter, settings))
|
using (var xmlWriter = XmlWriter.Create(memoryStream, settings))
|
||||||
{
|
{
|
||||||
stringWriter.WriteLine("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); // hack to manually handle utf-8
|
|
||||||
serializer.Serialize(xmlWriter, altoDocument);
|
serializer.Serialize(xmlWriter, altoDocument);
|
||||||
return stringWriter.ToString();
|
return System.Text.Encoding.UTF8.GetString(memoryStream.ToArray());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -47,7 +47,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the hORC (HTML) string of the page layout.
|
/// Get the hOCR (HTML) string of the page layout.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="document">The document.</param>
|
/// <param name="document">The document.</param>
|
||||||
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
|
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
|
||||||
@@ -70,17 +70,16 @@ namespace UglyToad.PdfPig.Export
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the hORC (HTML) string of the page layout. Excludes <see cref="PdfPath"/>s.
|
/// Get the hOCR (HTML) string of the page layout. Excludes <see cref="PdfPath"/>s.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="page">The page.</param>
|
/// <param name="page">The page.</param>
|
||||||
/// <returns></returns>
|
|
||||||
public string Get(Page page)
|
public string Get(Page page)
|
||||||
{
|
{
|
||||||
return Get(page, false);
|
return Get(page, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the hORC (HTML) string of the page layout.
|
/// Get the hOCR (HTML) string of the page layout.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="page">The page.</param>
|
/// <param name="page">The page.</param>
|
||||||
/// <param name="imageName">The image name, if any.</param>
|
/// <param name="imageName">The image name, if any.</param>
|
||||||
@@ -117,7 +116,6 @@ namespace UglyToad.PdfPig.Export
|
|||||||
/// Get indent string from level.
|
/// Get indent string from level.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="level">The indent level.</param>
|
/// <param name="level">The indent level.</param>
|
||||||
/// <returns></returns>
|
|
||||||
private string GetIndent(int level)
|
private string GetIndent(int level)
|
||||||
{
|
{
|
||||||
string indent = "";
|
string indent = "";
|
||||||
@@ -129,7 +127,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the hORC string for the page.
|
/// Get the hOCR string for the page.
|
||||||
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para>
|
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="page"></param>
|
/// <param name="page"></param>
|
||||||
@@ -174,14 +172,13 @@ namespace UglyToad.PdfPig.Export
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the hORC string for the path.
|
/// Get the hOCR string for the path.
|
||||||
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_linedrawing</para>
|
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_linedrawing</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="path"></param>
|
/// <param name="path"></param>
|
||||||
/// <param name="pageHeight"></param>
|
/// <param name="pageHeight"></param>
|
||||||
/// <param name="subPaths"></param>
|
/// <param name="subPaths"></param>
|
||||||
/// <param name="level">The indent level.</param>
|
/// <param name="level">The indent level.</param>
|
||||||
/// <returns></returns>
|
|
||||||
private string GetCode(PdfPath path, decimal pageHeight, bool subPaths, int level)
|
private string GetCode(PdfPath path, decimal pageHeight, bool subPaths, int level)
|
||||||
{
|
{
|
||||||
if (path == null) return string.Empty;
|
if (path == null) return string.Empty;
|
||||||
@@ -232,7 +229,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the hORC string for the area.
|
/// Get the hOCR string for the area.
|
||||||
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea</para>
|
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="block">The text area.</param>
|
/// <param name="block">The text area.</param>
|
||||||
@@ -252,13 +249,12 @@ namespace UglyToad.PdfPig.Export
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the hORC string for the paragraph.
|
/// Get the hOCR string for the paragraph.
|
||||||
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_par</para>
|
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_par</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="block">The paragraph.</param>
|
/// <param name="block">The paragraph.</param>
|
||||||
/// <param name="pageHeight"></param>
|
/// <param name="pageHeight"></param>
|
||||||
/// <param name="level">The indent level.</param>
|
/// <param name="level">The indent level.</param>
|
||||||
/// <returns></returns>
|
|
||||||
private string GetCodeParagraph(TextBlock block, decimal pageHeight, int level)
|
private string GetCodeParagraph(TextBlock block, decimal pageHeight, int level)
|
||||||
{
|
{
|
||||||
paraCount++;
|
paraCount++;
|
||||||
@@ -275,7 +271,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the hORC string for the text line.
|
/// Get the hOCR string for the text line.
|
||||||
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_line</para>
|
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_line</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="line"></param>
|
/// <param name="line"></param>
|
||||||
@@ -303,7 +299,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the hORC string for the word.
|
/// Get the hOCR string for the word.
|
||||||
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocrx_word</para>
|
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocrx_word</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="word"></param>
|
/// <param name="word"></param>
|
||||||
@@ -334,7 +330,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the hORC string for the bounding box.
|
/// Get the hOCR string for the bounding box.
|
||||||
/// <para>See http://kba.cloud/hocr-spec/1.2/#propdef-bbox</para>
|
/// <para>See http://kba.cloud/hocr-spec/1.2/#propdef-bbox</para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="rectangle"></param>
|
/// <param name="rectangle"></param>
|
||||||
|
@@ -4,6 +4,7 @@ using System.Collections.Generic;
|
|||||||
using System.ComponentModel;
|
using System.ComponentModel;
|
||||||
using System.Diagnostics;
|
using System.Diagnostics;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
|
using System.Xml;
|
||||||
using System.Xml.Serialization;
|
using System.Xml.Serialization;
|
||||||
using UglyToad.PdfPig.Content;
|
using UglyToad.PdfPig.Content;
|
||||||
using UglyToad.PdfPig.DocumentLayoutAnalysis;
|
using UglyToad.PdfPig.DocumentLayoutAnalysis;
|
||||||
@@ -60,7 +61,6 @@ namespace UglyToad.PdfPig.Export
|
|||||||
/// Get the PAGE-XML (XML) string of the pages layout. Excludes <see cref="PdfPath"/>s.
|
/// Get the PAGE-XML (XML) string of the pages layout. Excludes <see cref="PdfPath"/>s.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="page"></param>
|
/// <param name="page"></param>
|
||||||
/// <returns></returns>
|
|
||||||
public string Get(Page page)
|
public string Get(Page page)
|
||||||
{
|
{
|
||||||
return Get(page, false);
|
return Get(page, false);
|
||||||
@@ -90,12 +90,6 @@ namespace UglyToad.PdfPig.Export
|
|||||||
return Serialize(pageXmlDocument);
|
return Serialize(pageXmlDocument);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="point"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private string PointToString(PdfPoint point, decimal height)
|
private string PointToString(PdfPoint point, decimal height)
|
||||||
{
|
{
|
||||||
decimal x = Math.Round(point.X * scale);
|
decimal x = Math.Round(point.X * scale);
|
||||||
@@ -103,39 +97,20 @@ namespace UglyToad.PdfPig.Export
|
|||||||
return (x > 0 ? x : 0).ToString("0") + "," + (y > 0 ? y : 0).ToString("0");
|
return (x > 0 ? x : 0).ToString("0") + "," + (y > 0 ? y : 0).ToString("0");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="points"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private string ToPoints(IEnumerable<PdfPoint> points, decimal height)
|
private string ToPoints(IEnumerable<PdfPoint> points, decimal height)
|
||||||
{
|
{
|
||||||
return string.Join(" ", points.Select(p => PointToString(p, height)));
|
return string.Join(" ", points.Select(p => PointToString(p, height)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="pdfRectangle"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private string ToPoints(PdfRectangle pdfRectangle, decimal height)
|
private string ToPoints(PdfRectangle pdfRectangle, decimal height)
|
||||||
{
|
{
|
||||||
return ToPoints(new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight }, height);
|
return ToPoints(new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight }, height);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="pdfRectangle"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, decimal height)
|
private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, decimal height)
|
||||||
{
|
{
|
||||||
return new PageXmlDocument.PageXmlCoords()
|
return new PageXmlDocument.PageXmlCoords()
|
||||||
{
|
{
|
||||||
//Conf = 1,
|
|
||||||
Points = ToPoints(pdfRectangle, height)
|
Points = ToPoints(pdfRectangle, height)
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -152,38 +127,17 @@ namespace UglyToad.PdfPig.Export
|
|||||||
int blue = 65536 * (int)Math.Round(255f * (float)rgb.b);
|
int blue = 65536 * (int)Math.Round(255f * (float)rgb.b);
|
||||||
int sum = red + green + blue;
|
int sum = red + green + blue;
|
||||||
|
|
||||||
// as per below, red and blue order might be inverted...
|
// as per below, red and blue order might be inverted... var colorWin = System.Drawing.Color.FromArgb(sum);
|
||||||
//var colorWin = System.Drawing.Color.FromArgb(sum);
|
|
||||||
|
|
||||||
return sum.ToString();
|
return sum.ToString();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="page"></param>
|
|
||||||
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
|
|
||||||
private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
|
private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
|
||||||
{
|
{
|
||||||
var pageXmlPage = new PageXmlDocument.PageXmlPage()
|
var pageXmlPage = new PageXmlDocument.PageXmlPage()
|
||||||
{
|
{
|
||||||
//Border = new PageXmlBorder()
|
|
||||||
//{
|
|
||||||
// Coords = new PageXmlCoords()
|
|
||||||
// {
|
|
||||||
// Points = page.
|
|
||||||
// }
|
|
||||||
//},
|
|
||||||
ImageFilename = "unknown",
|
ImageFilename = "unknown",
|
||||||
ImageHeight = (int)Math.Round(page.Height * scale),
|
ImageHeight = (int)Math.Round(page.Height * scale),
|
||||||
ImageWidth = (int)Math.Round(page.Width * scale),
|
ImageWidth = (int)Math.Round(page.Width * scale),
|
||||||
//PrintSpace = new PageXmlPrintSpace()
|
|
||||||
//{
|
|
||||||
// Coords = new PageXmlCoords()
|
|
||||||
// {
|
|
||||||
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
var regions = new List<PageXmlDocument.PageXmlRegion>();
|
var regions = new List<PageXmlDocument.PageXmlRegion>();
|
||||||
@@ -240,12 +194,6 @@ namespace UglyToad.PdfPig.Export
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="textBlock"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, decimal height)
|
private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, decimal height)
|
||||||
{
|
{
|
||||||
regionCount++;
|
regionCount++;
|
||||||
@@ -258,33 +206,19 @@ namespace UglyToad.PdfPig.Export
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="textLine"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, decimal height)
|
private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, decimal height)
|
||||||
{
|
{
|
||||||
lineCount++;
|
lineCount++;
|
||||||
return new PageXmlDocument.PageXmlTextLine()
|
return new PageXmlDocument.PageXmlTextLine()
|
||||||
{
|
{
|
||||||
Coords = ToCoords(textLine.BoundingBox, height),
|
Coords = ToCoords(textLine.BoundingBox, height),
|
||||||
//Baseline = new PageXmlBaseline() { },
|
|
||||||
Production = PageXmlDocument.PageXmlProductionSimpleType.Printed,
|
Production = PageXmlDocument.PageXmlProductionSimpleType.Printed,
|
||||||
//ReadingDirection = PageXmlReadingDirectionSimpleType.LeftToRight,
|
|
||||||
Words = textLine.Words.Select(w => ToPageXmlWord(w, height)).ToArray(),
|
Words = textLine.Words.Select(w => ToPageXmlWord(w, height)).ToArray(),
|
||||||
TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } },
|
TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } },
|
||||||
Id = "l" + lineCount
|
Id = "l" + lineCount
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="word"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, decimal height)
|
private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, decimal height)
|
||||||
{
|
{
|
||||||
wordCount++;
|
wordCount++;
|
||||||
@@ -297,12 +231,6 @@ namespace UglyToad.PdfPig.Export
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
///
|
|
||||||
/// </summary>
|
|
||||||
/// <param name="letter"></param>
|
|
||||||
/// <param name="height"></param>
|
|
||||||
/// <returns></returns>
|
|
||||||
private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, decimal height)
|
private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, decimal height)
|
||||||
{
|
{
|
||||||
glyphCount++;
|
glyphCount++;
|
||||||
@@ -326,7 +254,7 @@ namespace UglyToad.PdfPig.Export
|
|||||||
{
|
{
|
||||||
XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument));
|
XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument));
|
||||||
|
|
||||||
using (var reader = System.Xml.XmlReader.Create(xmlPath))
|
using (var reader = XmlReader.Create(xmlPath))
|
||||||
{
|
{
|
||||||
return (PageXmlDocument)serializer.Deserialize(reader);
|
return (PageXmlDocument)serializer.Deserialize(reader);
|
||||||
}
|
}
|
||||||
@@ -335,20 +263,18 @@ namespace UglyToad.PdfPig.Export
|
|||||||
private string Serialize(PageXmlDocument pageXmlDocument)
|
private string Serialize(PageXmlDocument pageXmlDocument)
|
||||||
{
|
{
|
||||||
XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument));
|
XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument));
|
||||||
var settings = new System.Xml.XmlWriterSettings()
|
var settings = new XmlWriterSettings()
|
||||||
{
|
{
|
||||||
//Encoding = new System.Text.UTF8Encoding(true),
|
Encoding = System.Text.Encoding.UTF8,
|
||||||
Indent = true,
|
Indent = true,
|
||||||
IndentChars = indentChar,
|
IndentChars = indentChar,
|
||||||
OmitXmlDeclaration = true // hack to manually handle utf-8
|
|
||||||
};
|
};
|
||||||
|
|
||||||
using (var stringWriter = new System.IO.StringWriter())
|
using (var memoryStream = new System.IO.MemoryStream())
|
||||||
using (var xmlWriter = System.Xml.XmlWriter.Create(stringWriter, settings))
|
using (var xmlWriter = XmlWriter.Create(memoryStream, settings))
|
||||||
{
|
{
|
||||||
stringWriter.WriteLine("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); // hack to manually handle utf-8
|
|
||||||
serializer.Serialize(xmlWriter, pageXmlDocument);
|
serializer.Serialize(xmlWriter, pageXmlDocument);
|
||||||
return stringWriter.ToString();
|
return System.Text.Encoding.UTF8.GetString(memoryStream.ToArray());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user