Better handling of UTF8 in XmlWriter

This commit is contained in:
BobLd
2019-10-10 14:14:05 +01:00
parent fe1a3c4b8b
commit a15f56a6ac
3 changed files with 27 additions and 164 deletions

View File

@@ -56,7 +56,6 @@ namespace UglyToad.PdfPig.Export
/// </summary> /// </summary>
/// <param name="document"></param> /// <param name="document"></param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param> /// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
/// <returns></returns>
public string Get(PdfDocument document, bool includePaths = false) public string Get(PdfDocument document, bool includePaths = false)
{ {
AltoDocument alto = CreateAltoDocument("unknown"); AltoDocument alto = CreateAltoDocument("unknown");
@@ -76,7 +75,6 @@ namespace UglyToad.PdfPig.Export
/// Get the Alto (XML) string of the page layout. Excludes <see cref="PdfPath"/>s. /// Get the Alto (XML) string of the page layout. Excludes <see cref="PdfPath"/>s.
/// </summary> /// </summary>
/// <param name="page"></param> /// <param name="page"></param>
/// <returns></returns>
public string Get(Page page) public string Get(Page page)
{ {
return Get(page, false); return Get(page, false);
@@ -87,7 +85,6 @@ namespace UglyToad.PdfPig.Export
/// </summary> /// </summary>
/// <param name="page"></param> /// <param name="page"></param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param> /// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
/// <returns></returns>
public string Get(Page page, bool includePaths) public string Get(Page page, bool includePaths)
{ {
AltoDocument alto = CreateAltoDocument("unknown"); AltoDocument alto = CreateAltoDocument("unknown");
@@ -102,7 +99,6 @@ namespace UglyToad.PdfPig.Export
/// Create an empty <see cref="AltoDocument"/>. /// Create an empty <see cref="AltoDocument"/>.
/// </summary> /// </summary>
/// <param name="fileName"></param> /// <param name="fileName"></param>
/// <returns></returns>
private AltoDocument CreateAltoDocument(string fileName) private AltoDocument CreateAltoDocument(string fileName)
{ {
return new AltoDocument() return new AltoDocument()
@@ -113,17 +109,9 @@ namespace UglyToad.PdfPig.Export
}, },
Description = GetAltoDescription(fileName), Description = GetAltoDescription(fileName),
SchemaVersion = "4", SchemaVersion = "4",
//Styles = new AltoStyles() { },
//Tags = new AltoTags() { }
}; };
} }
/// <summary>
///
/// </summary>
/// <param name="page"></param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
/// <returns></returns>
private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths) private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
{ {
pageCount = page.Number; pageCount = page.Number;
@@ -159,7 +147,7 @@ namespace UglyToad.PdfPig.Export
Illustrations = null, // TBD Illustrations = null, // TBD
ProcessingRefs = null, // TBD ProcessingRefs = null, // TBD
StyleRefs = null, // TBD StyleRefs = null, // TBD
Id = "P" + pageCount + "_PS" + pageSpaceCount.ToString("#00000") //P1_PS00001 Id = "P" + pageCount + "_PS" + pageSpaceCount.ToString("#00000")
}, },
Id = "P" + pageCount Id = "P" + pageCount
}; };
@@ -188,12 +176,6 @@ namespace UglyToad.PdfPig.Export
return altoPage; return altoPage;
} }
/// <summary>
///
/// </summary>
/// <param name="pdfPath"></param>
/// <param name="height"></param>
/// <returns></returns>
private AltoDocument.AltoGraphicalElement ToAltoGraphicalElement(PdfPath pdfPath, decimal height) private AltoDocument.AltoGraphicalElement ToAltoGraphicalElement(PdfPath pdfPath, decimal height)
{ {
graphicalElementCount++; graphicalElementCount++;
@@ -208,12 +190,10 @@ namespace UglyToad.PdfPig.Export
Height = (float)Math.Round(rectangle.Value.Height * scale), Height = (float)Math.Round(rectangle.Value.Height * scale),
Width = (float)Math.Round(rectangle.Value.Width * scale), Width = (float)Math.Round(rectangle.Value.Width * scale),
Rotation = 0, Rotation = 0,
//Cs = false,
StyleRefs = null, StyleRefs = null,
TagRefs = null, TagRefs = null,
title = null, title = null,
type = null, type = null,
//IdNext = "NA", // for reading order
Id = "P" + pageCount + "_GE" + graphicalElementCount.ToString("#00000") Id = "P" + pageCount + "_GE" + graphicalElementCount.ToString("#00000")
}; };
} }
@@ -233,17 +213,10 @@ namespace UglyToad.PdfPig.Export
Width = (float)Math.Round(rectangle.Width * scale), Width = (float)Math.Round(rectangle.Width * scale),
FileId = "", FileId = "",
Rotation = 0, Rotation = 0,
//IdNext = "NA", // for reading order
Id = "P" + pageCount + "_I" + illustrationCount.ToString("#00000") Id = "P" + pageCount + "_I" + illustrationCount.ToString("#00000")
}; };
} }
/// <summary>
///
/// </summary>
/// <param name="textBlock"></param>
/// <param name="height"></param>
/// <returns></returns>
private AltoDocument.AltoTextBlock ToAltoTextBlock(TextBlock textBlock, decimal height) private AltoDocument.AltoTextBlock ToAltoTextBlock(TextBlock textBlock, decimal height)
{ {
textBlockCount++; textBlockCount++;
@@ -254,24 +227,16 @@ namespace UglyToad.PdfPig.Export
HPos = (float)Math.Round(textBlock.BoundingBox.Left * scale), HPos = (float)Math.Round(textBlock.BoundingBox.Left * scale),
Height = (float)Math.Round(textBlock.BoundingBox.Height * scale), Height = (float)Math.Round(textBlock.BoundingBox.Height * scale),
Width = (float)Math.Round(textBlock.BoundingBox.Width * scale), Width = (float)Math.Round(textBlock.BoundingBox.Width * scale),
Rotation = 0, // check textBlock.TextDirection Rotation = 0,
TextLines = textBlock.TextLines.Select(l => ToAltoTextLine(l, height)).ToArray(), TextLines = textBlock.TextLines.Select(l => ToAltoTextLine(l, height)).ToArray(),
//Cs = false,
StyleRefs = null, StyleRefs = null,
TagRefs = null, TagRefs = null,
title = null, title = null,
type = null, type = null,
//IdNext = "NA", // for reading order
Id = "P" + pageCount + "_TB" + textBlockCount.ToString("#00000") Id = "P" + pageCount + "_TB" + textBlockCount.ToString("#00000")
}; };
} }
/// <summary>
///
/// </summary>
/// <param name="textLine"></param>
/// <param name="height"></param>
/// <returns></returns>
private AltoDocument.AltoTextBlockTextLine ToAltoTextLine(TextLine textLine, decimal height) private AltoDocument.AltoTextBlockTextLine ToAltoTextLine(TextLine textLine, decimal height)
{ {
textLineCount++; textLineCount++;
@@ -283,23 +248,15 @@ namespace UglyToad.PdfPig.Export
HPos = (float)Math.Round(textLine.BoundingBox.Left * scale), HPos = (float)Math.Round(textLine.BoundingBox.Left * scale),
Height = (float)Math.Round(textLine.BoundingBox.Height * scale), Height = (float)Math.Round(textLine.BoundingBox.Height * scale),
Width = (float)Math.Round(textLine.BoundingBox.Width * scale), Width = (float)Math.Round(textLine.BoundingBox.Width * scale),
BaseLine = float.NaN, // TBD BaseLine = float.NaN,
//Hyp = new AltoTextBlockTextLineHyp() { }, // TBD
Strings = strings, Strings = strings,
Lang = null, Lang = null,
//Sp = new AltoSP[0], // TBD
StyleRefs = null, StyleRefs = null,
TagRefs = null, TagRefs = null,
Id = "P" + pageCount + "_TL" + textLineCount.ToString("#00000") Id = "P" + pageCount + "_TL" + textLineCount.ToString("#00000")
}; };
} }
/// <summary>
///
/// </summary>
/// <param name="word"></param>
/// <param name="height"></param>
/// <returns></returns>
private AltoDocument.AltoString ToAltoString(Word word, decimal height) private AltoDocument.AltoString ToAltoString(Word word, decimal height)
{ {
stringCount++; stringCount++;
@@ -313,24 +270,15 @@ namespace UglyToad.PdfPig.Export
Glyph = glyphs, Glyph = glyphs,
Cc = string.Join("", glyphs.Select(g => 9f * (1f - g.Gc))), // from 0->1 to 9->0 Cc = string.Join("", glyphs.Select(g => 9f * (1f - g.Gc))), // from 0->1 to 9->0
Content = word.Text, Content = word.Text,
//Cs = false,
Lang = null, Lang = null,
//Style = AltoFontStyles.Bold,
StyleRefs = null, StyleRefs = null,
SubsContent = null, SubsContent = null,
//SubsType = AltoSubsType.Abbreviation,
TagRefs = null, TagRefs = null,
Wc = float.NaN, Wc = float.NaN,
Id = "P" + pageCount + "_ST" + stringCount.ToString("#00000") Id = "P" + pageCount + "_ST" + stringCount.ToString("#00000")
}; };
} }
/// <summary>
///
/// </summary>
/// <param name="letter"></param>
/// <param name="height"></param>
/// <returns></returns>
private AltoDocument.AltoGlyph ToAltoGlyph(Letter letter, decimal height) private AltoDocument.AltoGlyph ToAltoGlyph(Letter letter, decimal height)
{ {
glyphCount++; glyphCount++;
@@ -346,17 +294,12 @@ namespace UglyToad.PdfPig.Export
}; };
} }
/// <summary>
///
/// </summary>
/// <param name="fileName"></param>
/// <returns></returns>
private AltoDocument.AltoDescription GetAltoDescription(string fileName) private AltoDocument.AltoDescription GetAltoDescription(string fileName)
{ {
var processing = new AltoDocument.AltoDescriptionProcessing() var processing = new AltoDocument.AltoDescriptionProcessing()
{ {
ProcessingAgency = null, ProcessingAgency = null,
ProcessingCategory = AltoDocument.AltoProcessingCategory.Other, // TBD ProcessingCategory = AltoDocument.AltoProcessingCategory.Other,
ProcessingDateTime = DateTime.UtcNow.ToString(), ProcessingDateTime = DateTime.UtcNow.ToString(),
ProcessingSoftware = new AltoDocument.AltoProcessingSoftware() ProcessingSoftware = new AltoDocument.AltoProcessingSoftware()
{ {
@@ -384,7 +327,7 @@ namespace UglyToad.PdfPig.Export
return new AltoDocument.AltoDescription() return new AltoDocument.AltoDescription()
{ {
MeasurementUnit = AltoDocument.AltoMeasurementUnit.Pixel, // need to check that MeasurementUnit = AltoDocument.AltoMeasurementUnit.Pixel,
Processings = new[] { processing }, Processings = new[] { processing },
SourceImageInformation = new AltoDocument.AltoSourceImageInformation() SourceImageInformation = new AltoDocument.AltoSourceImageInformation()
{ {
@@ -400,18 +343,16 @@ namespace UglyToad.PdfPig.Export
XmlSerializer serializer = new XmlSerializer(typeof(AltoDocument)); XmlSerializer serializer = new XmlSerializer(typeof(AltoDocument));
var settings = new XmlWriterSettings() var settings = new XmlWriterSettings()
{ {
//Encoding = new System.Text.UTF8Encoding(true), Encoding = System.Text.Encoding.UTF8,
Indent = true, Indent = true,
IndentChars = indentChar, IndentChars = indentChar,
OmitXmlDeclaration = true // hack to manually handle utf-8
}; };
using (var stringWriter = new System.IO.StringWriter()) using (var memoryStream = new System.IO.MemoryStream())
using (var xmlWriter = XmlWriter.Create(stringWriter, settings)) using (var xmlWriter = XmlWriter.Create(memoryStream, settings))
{ {
stringWriter.WriteLine("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); // hack to manually handle utf-8
serializer.Serialize(xmlWriter, altoDocument); serializer.Serialize(xmlWriter, altoDocument);
return stringWriter.ToString(); return System.Text.Encoding.UTF8.GetString(memoryStream.ToArray());
} }
} }

View File

@@ -47,7 +47,7 @@ namespace UglyToad.PdfPig.Export
} }
/// <summary> /// <summary>
/// Get the hORC (HTML) string of the page layout. /// Get the hOCR (HTML) string of the page layout.
/// </summary> /// </summary>
/// <param name="document">The document.</param> /// <param name="document">The document.</param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param> /// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
@@ -70,17 +70,16 @@ namespace UglyToad.PdfPig.Export
} }
/// <summary> /// <summary>
/// Get the hORC (HTML) string of the page layout. Excludes <see cref="PdfPath"/>s. /// Get the hOCR (HTML) string of the page layout. Excludes <see cref="PdfPath"/>s.
/// </summary> /// </summary>
/// <param name="page">The page.</param> /// <param name="page">The page.</param>
/// <returns></returns>
public string Get(Page page) public string Get(Page page)
{ {
return Get(page, false); return Get(page, false);
} }
/// <summary> /// <summary>
/// Get the hORC (HTML) string of the page layout. /// Get the hOCR (HTML) string of the page layout.
/// </summary> /// </summary>
/// <param name="page">The page.</param> /// <param name="page">The page.</param>
/// <param name="imageName">The image name, if any.</param> /// <param name="imageName">The image name, if any.</param>
@@ -117,7 +116,6 @@ namespace UglyToad.PdfPig.Export
/// Get indent string from level. /// Get indent string from level.
/// </summary> /// </summary>
/// <param name="level">The indent level.</param> /// <param name="level">The indent level.</param>
/// <returns></returns>
private string GetIndent(int level) private string GetIndent(int level)
{ {
string indent = ""; string indent = "";
@@ -129,7 +127,7 @@ namespace UglyToad.PdfPig.Export
} }
/// <summary> /// <summary>
/// Get the hORC string for the page. /// Get the hOCR string for the page.
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para> /// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para>
/// </summary> /// </summary>
/// <param name="page"></param> /// <param name="page"></param>
@@ -174,14 +172,13 @@ namespace UglyToad.PdfPig.Export
} }
/// <summary> /// <summary>
/// Get the hORC string for the path. /// Get the hOCR string for the path.
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_linedrawing</para> /// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_linedrawing</para>
/// </summary> /// </summary>
/// <param name="path"></param> /// <param name="path"></param>
/// <param name="pageHeight"></param> /// <param name="pageHeight"></param>
/// <param name="subPaths"></param> /// <param name="subPaths"></param>
/// <param name="level">The indent level.</param> /// <param name="level">The indent level.</param>
/// <returns></returns>
private string GetCode(PdfPath path, decimal pageHeight, bool subPaths, int level) private string GetCode(PdfPath path, decimal pageHeight, bool subPaths, int level)
{ {
if (path == null) return string.Empty; if (path == null) return string.Empty;
@@ -232,7 +229,7 @@ namespace UglyToad.PdfPig.Export
} }
/// <summary> /// <summary>
/// Get the hORC string for the area. /// Get the hOCR string for the area.
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea</para> /// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea</para>
/// </summary> /// </summary>
/// <param name="block">The text area.</param> /// <param name="block">The text area.</param>
@@ -252,13 +249,12 @@ namespace UglyToad.PdfPig.Export
} }
/// <summary> /// <summary>
/// Get the hORC string for the paragraph. /// Get the hOCR string for the paragraph.
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_par</para> /// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_par</para>
/// </summary> /// </summary>
/// <param name="block">The paragraph.</param> /// <param name="block">The paragraph.</param>
/// <param name="pageHeight"></param> /// <param name="pageHeight"></param>
/// <param name="level">The indent level.</param> /// <param name="level">The indent level.</param>
/// <returns></returns>
private string GetCodeParagraph(TextBlock block, decimal pageHeight, int level) private string GetCodeParagraph(TextBlock block, decimal pageHeight, int level)
{ {
paraCount++; paraCount++;
@@ -275,7 +271,7 @@ namespace UglyToad.PdfPig.Export
} }
/// <summary> /// <summary>
/// Get the hORC string for the text line. /// Get the hOCR string for the text line.
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_line</para> /// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_line</para>
/// </summary> /// </summary>
/// <param name="line"></param> /// <param name="line"></param>
@@ -303,7 +299,7 @@ namespace UglyToad.PdfPig.Export
} }
/// <summary> /// <summary>
/// Get the hORC string for the word. /// Get the hOCR string for the word.
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocrx_word</para> /// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocrx_word</para>
/// </summary> /// </summary>
/// <param name="word"></param> /// <param name="word"></param>
@@ -334,7 +330,7 @@ namespace UglyToad.PdfPig.Export
} }
/// <summary> /// <summary>
/// Get the hORC string for the bounding box. /// Get the hOCR string for the bounding box.
/// <para>See http://kba.cloud/hocr-spec/1.2/#propdef-bbox</para> /// <para>See http://kba.cloud/hocr-spec/1.2/#propdef-bbox</para>
/// </summary> /// </summary>
/// <param name="rectangle"></param> /// <param name="rectangle"></param>

View File

@@ -4,6 +4,7 @@ using System.Collections.Generic;
using System.ComponentModel; using System.ComponentModel;
using System.Diagnostics; using System.Diagnostics;
using System.Linq; using System.Linq;
using System.Xml;
using System.Xml.Serialization; using System.Xml.Serialization;
using UglyToad.PdfPig.Content; using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis; using UglyToad.PdfPig.DocumentLayoutAnalysis;
@@ -60,7 +61,6 @@ namespace UglyToad.PdfPig.Export
/// Get the PAGE-XML (XML) string of the pages layout. Excludes <see cref="PdfPath"/>s. /// Get the PAGE-XML (XML) string of the pages layout. Excludes <see cref="PdfPath"/>s.
/// </summary> /// </summary>
/// <param name="page"></param> /// <param name="page"></param>
/// <returns></returns>
public string Get(Page page) public string Get(Page page)
{ {
return Get(page, false); return Get(page, false);
@@ -90,12 +90,6 @@ namespace UglyToad.PdfPig.Export
return Serialize(pageXmlDocument); return Serialize(pageXmlDocument);
} }
/// <summary>
///
/// </summary>
/// <param name="point"></param>
/// <param name="height"></param>
/// <returns></returns>
private string PointToString(PdfPoint point, decimal height) private string PointToString(PdfPoint point, decimal height)
{ {
decimal x = Math.Round(point.X * scale); decimal x = Math.Round(point.X * scale);
@@ -103,39 +97,20 @@ namespace UglyToad.PdfPig.Export
return (x > 0 ? x : 0).ToString("0") + "," + (y > 0 ? y : 0).ToString("0"); return (x > 0 ? x : 0).ToString("0") + "," + (y > 0 ? y : 0).ToString("0");
} }
/// <summary>
///
/// </summary>
/// <param name="points"></param>
/// <param name="height"></param>
/// <returns></returns>
private string ToPoints(IEnumerable<PdfPoint> points, decimal height) private string ToPoints(IEnumerable<PdfPoint> points, decimal height)
{ {
return string.Join(" ", points.Select(p => PointToString(p, height))); return string.Join(" ", points.Select(p => PointToString(p, height)));
} }
/// <summary>
///
/// </summary>
/// <param name="pdfRectangle"></param>
/// <param name="height"></param>
/// <returns></returns>
private string ToPoints(PdfRectangle pdfRectangle, decimal height) private string ToPoints(PdfRectangle pdfRectangle, decimal height)
{ {
return ToPoints(new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight }, height); return ToPoints(new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight }, height);
} }
/// <summary>
///
/// </summary>
/// <param name="pdfRectangle"></param>
/// <param name="height"></param>
/// <returns></returns>
private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, decimal height) private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, decimal height)
{ {
return new PageXmlDocument.PageXmlCoords() return new PageXmlDocument.PageXmlCoords()
{ {
//Conf = 1,
Points = ToPoints(pdfRectangle, height) Points = ToPoints(pdfRectangle, height)
}; };
} }
@@ -152,38 +127,17 @@ namespace UglyToad.PdfPig.Export
int blue = 65536 * (int)Math.Round(255f * (float)rgb.b); int blue = 65536 * (int)Math.Round(255f * (float)rgb.b);
int sum = red + green + blue; int sum = red + green + blue;
// as per below, red and blue order might be inverted... // as per below, red and blue order might be inverted... var colorWin = System.Drawing.Color.FromArgb(sum);
//var colorWin = System.Drawing.Color.FromArgb(sum);
return sum.ToString(); return sum.ToString();
} }
/// <summary>
///
/// </summary>
/// <param name="page"></param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths) private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
{ {
var pageXmlPage = new PageXmlDocument.PageXmlPage() var pageXmlPage = new PageXmlDocument.PageXmlPage()
{ {
//Border = new PageXmlBorder()
//{
// Coords = new PageXmlCoords()
// {
// Points = page.
// }
//},
ImageFilename = "unknown", ImageFilename = "unknown",
ImageHeight = (int)Math.Round(page.Height * scale), ImageHeight = (int)Math.Round(page.Height * scale),
ImageWidth = (int)Math.Round(page.Width * scale), ImageWidth = (int)Math.Round(page.Width * scale),
//PrintSpace = new PageXmlPrintSpace()
//{
// Coords = new PageXmlCoords()
// {
// }
//}
}; };
var regions = new List<PageXmlDocument.PageXmlRegion>(); var regions = new List<PageXmlDocument.PageXmlRegion>();
@@ -240,12 +194,6 @@ namespace UglyToad.PdfPig.Export
}; };
} }
/// <summary>
///
/// </summary>
/// <param name="textBlock"></param>
/// <param name="height"></param>
/// <returns></returns>
private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, decimal height) private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, decimal height)
{ {
regionCount++; regionCount++;
@@ -258,33 +206,19 @@ namespace UglyToad.PdfPig.Export
}; };
} }
/// <summary>
///
/// </summary>
/// <param name="textLine"></param>
/// <param name="height"></param>
/// <returns></returns>
private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, decimal height) private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, decimal height)
{ {
lineCount++; lineCount++;
return new PageXmlDocument.PageXmlTextLine() return new PageXmlDocument.PageXmlTextLine()
{ {
Coords = ToCoords(textLine.BoundingBox, height), Coords = ToCoords(textLine.BoundingBox, height),
//Baseline = new PageXmlBaseline() { },
Production = PageXmlDocument.PageXmlProductionSimpleType.Printed, Production = PageXmlDocument.PageXmlProductionSimpleType.Printed,
//ReadingDirection = PageXmlReadingDirectionSimpleType.LeftToRight,
Words = textLine.Words.Select(w => ToPageXmlWord(w, height)).ToArray(), Words = textLine.Words.Select(w => ToPageXmlWord(w, height)).ToArray(),
TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } }, TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } },
Id = "l" + lineCount Id = "l" + lineCount
}; };
} }
/// <summary>
///
/// </summary>
/// <param name="word"></param>
/// <param name="height"></param>
/// <returns></returns>
private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, decimal height) private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, decimal height)
{ {
wordCount++; wordCount++;
@@ -297,12 +231,6 @@ namespace UglyToad.PdfPig.Export
}; };
} }
/// <summary>
///
/// </summary>
/// <param name="letter"></param>
/// <param name="height"></param>
/// <returns></returns>
private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, decimal height) private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, decimal height)
{ {
glyphCount++; glyphCount++;
@@ -326,7 +254,7 @@ namespace UglyToad.PdfPig.Export
{ {
XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument)); XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument));
using (var reader = System.Xml.XmlReader.Create(xmlPath)) using (var reader = XmlReader.Create(xmlPath))
{ {
return (PageXmlDocument)serializer.Deserialize(reader); return (PageXmlDocument)serializer.Deserialize(reader);
} }
@@ -335,20 +263,18 @@ namespace UglyToad.PdfPig.Export
private string Serialize(PageXmlDocument pageXmlDocument) private string Serialize(PageXmlDocument pageXmlDocument)
{ {
XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument)); XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument));
var settings = new System.Xml.XmlWriterSettings() var settings = new XmlWriterSettings()
{ {
//Encoding = new System.Text.UTF8Encoding(true), Encoding = System.Text.Encoding.UTF8,
Indent = true, Indent = true,
IndentChars = indentChar, IndentChars = indentChar,
OmitXmlDeclaration = true // hack to manually handle utf-8
}; };
using (var stringWriter = new System.IO.StringWriter()) using (var memoryStream = new System.IO.MemoryStream())
using (var xmlWriter = System.Xml.XmlWriter.Create(stringWriter, settings)) using (var xmlWriter = XmlWriter.Create(memoryStream, settings))
{ {
stringWriter.WriteLine("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); // hack to manually handle utf-8
serializer.Serialize(xmlWriter, pageXmlDocument); serializer.Serialize(xmlWriter, pageXmlDocument);
return stringWriter.ToString(); return System.Text.Encoding.UTF8.GetString(memoryStream.ToArray());
} }
} }
} }