Better handling of UTF8 in XmlWriter

This commit is contained in:
BobLd
2019-10-10 14:14:05 +01:00
parent fe1a3c4b8b
commit a15f56a6ac
3 changed files with 27 additions and 164 deletions

View File

@@ -56,7 +56,6 @@ namespace UglyToad.PdfPig.Export
/// </summary>
/// <param name="document"></param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
/// <returns></returns>
public string Get(PdfDocument document, bool includePaths = false)
{
AltoDocument alto = CreateAltoDocument("unknown");
@@ -76,7 +75,6 @@ namespace UglyToad.PdfPig.Export
/// Get the Alto (XML) string of the page layout. Excludes <see cref="PdfPath"/>s.
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
public string Get(Page page)
{
return Get(page, false);
@@ -87,7 +85,6 @@ namespace UglyToad.PdfPig.Export
/// </summary>
/// <param name="page"></param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
/// <returns></returns>
public string Get(Page page, bool includePaths)
{
AltoDocument alto = CreateAltoDocument("unknown");
@@ -102,7 +99,6 @@ namespace UglyToad.PdfPig.Export
/// Create an empty <see cref="AltoDocument"/>.
/// </summary>
/// <param name="fileName"></param>
/// <returns></returns>
private AltoDocument CreateAltoDocument(string fileName)
{
return new AltoDocument()
@@ -113,17 +109,9 @@ namespace UglyToad.PdfPig.Export
},
Description = GetAltoDescription(fileName),
SchemaVersion = "4",
//Styles = new AltoStyles() { },
//Tags = new AltoTags() { }
};
}
/// <summary>
///
/// </summary>
/// <param name="page"></param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
/// <returns></returns>
private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
{
pageCount = page.Number;
@@ -159,7 +147,7 @@ namespace UglyToad.PdfPig.Export
Illustrations = null, // TBD
ProcessingRefs = null, // TBD
StyleRefs = null, // TBD
Id = "P" + pageCount + "_PS" + pageSpaceCount.ToString("#00000") //P1_PS00001
Id = "P" + pageCount + "_PS" + pageSpaceCount.ToString("#00000")
},
Id = "P" + pageCount
};
@@ -188,12 +176,6 @@ namespace UglyToad.PdfPig.Export
return altoPage;
}
/// <summary>
///
/// </summary>
/// <param name="pdfPath"></param>
/// <param name="height"></param>
/// <returns></returns>
private AltoDocument.AltoGraphicalElement ToAltoGraphicalElement(PdfPath pdfPath, decimal height)
{
graphicalElementCount++;
@@ -208,12 +190,10 @@ namespace UglyToad.PdfPig.Export
Height = (float)Math.Round(rectangle.Value.Height * scale),
Width = (float)Math.Round(rectangle.Value.Width * scale),
Rotation = 0,
//Cs = false,
StyleRefs = null,
TagRefs = null,
title = null,
type = null,
//IdNext = "NA", // for reading order
Id = "P" + pageCount + "_GE" + graphicalElementCount.ToString("#00000")
};
}
@@ -233,17 +213,10 @@ namespace UglyToad.PdfPig.Export
Width = (float)Math.Round(rectangle.Width * scale),
FileId = "",
Rotation = 0,
//IdNext = "NA", // for reading order
Id = "P" + pageCount + "_I" + illustrationCount.ToString("#00000")
};
}
/// <summary>
///
/// </summary>
/// <param name="textBlock"></param>
/// <param name="height"></param>
/// <returns></returns>
private AltoDocument.AltoTextBlock ToAltoTextBlock(TextBlock textBlock, decimal height)
{
textBlockCount++;
@@ -254,24 +227,16 @@ namespace UglyToad.PdfPig.Export
HPos = (float)Math.Round(textBlock.BoundingBox.Left * scale),
Height = (float)Math.Round(textBlock.BoundingBox.Height * scale),
Width = (float)Math.Round(textBlock.BoundingBox.Width * scale),
Rotation = 0, // check textBlock.TextDirection
Rotation = 0,
TextLines = textBlock.TextLines.Select(l => ToAltoTextLine(l, height)).ToArray(),
//Cs = false,
StyleRefs = null,
TagRefs = null,
title = null,
type = null,
//IdNext = "NA", // for reading order
Id = "P" + pageCount + "_TB" + textBlockCount.ToString("#00000")
};
}
/// <summary>
///
/// </summary>
/// <param name="textLine"></param>
/// <param name="height"></param>
/// <returns></returns>
private AltoDocument.AltoTextBlockTextLine ToAltoTextLine(TextLine textLine, decimal height)
{
textLineCount++;
@@ -283,23 +248,15 @@ namespace UglyToad.PdfPig.Export
HPos = (float)Math.Round(textLine.BoundingBox.Left * scale),
Height = (float)Math.Round(textLine.BoundingBox.Height * scale),
Width = (float)Math.Round(textLine.BoundingBox.Width * scale),
BaseLine = float.NaN, // TBD
//Hyp = new AltoTextBlockTextLineHyp() { }, // TBD
BaseLine = float.NaN,
Strings = strings,
Lang = null,
//Sp = new AltoSP[0], // TBD
StyleRefs = null,
TagRefs = null,
Id = "P" + pageCount + "_TL" + textLineCount.ToString("#00000")
};
}
/// <summary>
///
/// </summary>
/// <param name="word"></param>
/// <param name="height"></param>
/// <returns></returns>
private AltoDocument.AltoString ToAltoString(Word word, decimal height)
{
stringCount++;
@@ -313,24 +270,15 @@ namespace UglyToad.PdfPig.Export
Glyph = glyphs,
Cc = string.Join("", glyphs.Select(g => 9f * (1f - g.Gc))), // from 0->1 to 9->0
Content = word.Text,
//Cs = false,
Lang = null,
//Style = AltoFontStyles.Bold,
StyleRefs = null,
SubsContent = null,
//SubsType = AltoSubsType.Abbreviation,
TagRefs = null,
Wc = float.NaN,
Id = "P" + pageCount + "_ST" + stringCount.ToString("#00000")
};
}
/// <summary>
///
/// </summary>
/// <param name="letter"></param>
/// <param name="height"></param>
/// <returns></returns>
private AltoDocument.AltoGlyph ToAltoGlyph(Letter letter, decimal height)
{
glyphCount++;
@@ -346,17 +294,12 @@ namespace UglyToad.PdfPig.Export
};
}
/// <summary>
///
/// </summary>
/// <param name="fileName"></param>
/// <returns></returns>
private AltoDocument.AltoDescription GetAltoDescription(string fileName)
{
var processing = new AltoDocument.AltoDescriptionProcessing()
{
ProcessingAgency = null,
ProcessingCategory = AltoDocument.AltoProcessingCategory.Other, // TBD
ProcessingCategory = AltoDocument.AltoProcessingCategory.Other,
ProcessingDateTime = DateTime.UtcNow.ToString(),
ProcessingSoftware = new AltoDocument.AltoProcessingSoftware()
{
@@ -384,7 +327,7 @@ namespace UglyToad.PdfPig.Export
return new AltoDocument.AltoDescription()
{
MeasurementUnit = AltoDocument.AltoMeasurementUnit.Pixel, // need to check that
MeasurementUnit = AltoDocument.AltoMeasurementUnit.Pixel,
Processings = new[] { processing },
SourceImageInformation = new AltoDocument.AltoSourceImageInformation()
{
@@ -400,18 +343,16 @@ namespace UglyToad.PdfPig.Export
XmlSerializer serializer = new XmlSerializer(typeof(AltoDocument));
var settings = new XmlWriterSettings()
{
//Encoding = new System.Text.UTF8Encoding(true),
Encoding = System.Text.Encoding.UTF8,
Indent = true,
IndentChars = indentChar,
OmitXmlDeclaration = true // hack to manually handle utf-8
};
using (var stringWriter = new System.IO.StringWriter())
using (var xmlWriter = XmlWriter.Create(stringWriter, settings))
using (var memoryStream = new System.IO.MemoryStream())
using (var xmlWriter = XmlWriter.Create(memoryStream, settings))
{
stringWriter.WriteLine("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); // hack to manually handle utf-8
serializer.Serialize(xmlWriter, altoDocument);
return stringWriter.ToString();
return System.Text.Encoding.UTF8.GetString(memoryStream.ToArray());
}
}

View File

@@ -47,7 +47,7 @@ namespace UglyToad.PdfPig.Export
}
/// <summary>
/// Get the hORC (HTML) string of the page layout.
/// Get the hOCR (HTML) string of the page layout.
/// </summary>
/// <param name="document">The document.</param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
@@ -70,17 +70,16 @@ namespace UglyToad.PdfPig.Export
}
/// <summary>
/// Get the hORC (HTML) string of the page layout. Excludes <see cref="PdfPath"/>s.
/// Get the hOCR (HTML) string of the page layout. Excludes <see cref="PdfPath"/>s.
/// </summary>
/// <param name="page">The page.</param>
/// <returns></returns>
public string Get(Page page)
{
    // This overload never draws paths, so the flag passed on is always false.
    const bool includePaths = false;
    return Get(page, includePaths);
}
/// <summary>
/// Get the hORC (HTML) string of the page layout.
/// Get the hOCR (HTML) string of the page layout.
/// </summary>
/// <param name="page">The page.</param>
/// <param name="imageName">The image name, if any.</param>
@@ -117,7 +116,6 @@ namespace UglyToad.PdfPig.Export
/// Get indent string from level.
/// </summary>
/// <param name="level">The indent level.</param>
/// <returns></returns>
private string GetIndent(int level)
{
string indent = "";
@@ -129,7 +127,7 @@ namespace UglyToad.PdfPig.Export
}
/// <summary>
/// Get the hORC string for the page.
/// Get the hOCR string for the page.
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para>
/// </summary>
/// <param name="page"></param>
@@ -174,14 +172,13 @@ namespace UglyToad.PdfPig.Export
}
/// <summary>
/// Get the hORC string for the path.
/// Get the hOCR string for the path.
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_linedrawing</para>
/// </summary>
/// <param name="path"></param>
/// <param name="pageHeight"></param>
/// <param name="subPaths"></param>
/// <param name="level">The indent level.</param>
/// <returns></returns>
private string GetCode(PdfPath path, decimal pageHeight, bool subPaths, int level)
{
if (path == null) return string.Empty;
@@ -232,7 +229,7 @@ namespace UglyToad.PdfPig.Export
}
/// <summary>
/// Get the hORC string for the area.
/// Get the hOCR string for the area.
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea</para>
/// </summary>
/// <param name="block">The text area.</param>
@@ -252,13 +249,12 @@ namespace UglyToad.PdfPig.Export
}
/// <summary>
/// Get the hORC string for the paragraph.
/// Get the hOCR string for the paragraph.
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_par</para>
/// </summary>
/// <param name="block">The paragraph.</param>
/// <param name="pageHeight"></param>
/// <param name="level">The indent level.</param>
/// <returns></returns>
private string GetCodeParagraph(TextBlock block, decimal pageHeight, int level)
{
paraCount++;
@@ -275,7 +271,7 @@ namespace UglyToad.PdfPig.Export
}
/// <summary>
/// Get the hORC string for the text line.
/// Get the hOCR string for the text line.
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_line</para>
/// </summary>
/// <param name="line"></param>
@@ -303,7 +299,7 @@ namespace UglyToad.PdfPig.Export
}
/// <summary>
/// Get the hORC string for the word.
/// Get the hOCR string for the word.
/// <para>See http://kba.cloud/hocr-spec/1.2/#elementdef-ocrx_word</para>
/// </summary>
/// <param name="word"></param>
@@ -334,7 +330,7 @@ namespace UglyToad.PdfPig.Export
}
/// <summary>
/// Get the hORC string for the bounding box.
/// Get the hOCR string for the bounding box.
/// <para>See http://kba.cloud/hocr-spec/1.2/#propdef-bbox</para>
/// </summary>
/// <param name="rectangle"></param>

View File

@@ -4,6 +4,7 @@ using System.Collections.Generic;
using System.ComponentModel;
using System.Diagnostics;
using System.Linq;
using System.Xml;
using System.Xml.Serialization;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis;
@@ -60,7 +61,6 @@ namespace UglyToad.PdfPig.Export
/// Get the PAGE-XML (XML) string of the page layout. Excludes <see cref="PdfPath"/>s.
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
public string Get(Page page)
{
return Get(page, false);
@@ -90,12 +90,6 @@ namespace UglyToad.PdfPig.Export
return Serialize(pageXmlDocument);
}
/// <summary>
///
/// </summary>
/// <param name="point"></param>
/// <param name="height"></param>
/// <returns></returns>
private string PointToString(PdfPoint point, decimal height)
{
decimal x = Math.Round(point.X * scale);
@@ -103,39 +97,20 @@ namespace UglyToad.PdfPig.Export
return (x > 0 ? x : 0).ToString("0") + "," + (y > 0 ? y : 0).ToString("0");
}
/// <summary>
///
/// </summary>
/// <param name="points"></param>
/// <param name="height"></param>
/// <returns></returns>
private string ToPoints(IEnumerable<PdfPoint> points, decimal height)
{
    // Render each point with PointToString, then separate the results with single spaces.
    IEnumerable<string> renderedPoints = points.Select(point => PointToString(point, height));
    return string.Join(" ", renderedPoints);
}
/// <summary>
///
/// </summary>
/// <param name="pdfRectangle"></param>
/// <param name="height"></param>
/// <returns></returns>
private string ToPoints(PdfRectangle pdfRectangle, decimal height)
{
    // Corner order is part of the output: bottom-left, top-left, top-right, bottom-right.
    var corners = new[]
    {
        pdfRectangle.BottomLeft,
        pdfRectangle.TopLeft,
        pdfRectangle.TopRight,
        pdfRectangle.BottomRight
    };

    return ToPoints(corners, height);
}
/// <summary>
///
/// </summary>
/// <param name="pdfRectangle"></param>
/// <param name="height"></param>
/// <returns></returns>
private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, decimal height)
{
    // Only Points is populated; it carries the rectangle's corners as rendered by ToPoints.
    var coords = new PageXmlDocument.PageXmlCoords();
    coords.Points = ToPoints(pdfRectangle, height);
    return coords;
}
@@ -152,38 +127,17 @@ namespace UglyToad.PdfPig.Export
int blue = 65536 * (int)Math.Round(255f * (float)rgb.b);
int sum = red + green + blue;
// as per below, red and blue order might be inverted...
//var colorWin = System.Drawing.Color.FromArgb(sum);
// as per below, red and blue order might be inverted... var colorWin = System.Drawing.Color.FromArgb(sum);
return sum.ToString();
}
/// <summary>
///
/// </summary>
/// <param name="page"></param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
{
var pageXmlPage = new PageXmlDocument.PageXmlPage()
{
//Border = new PageXmlBorder()
//{
// Coords = new PageXmlCoords()
// {
// Points = page.
// }
//},
ImageFilename = "unknown",
ImageHeight = (int)Math.Round(page.Height * scale),
ImageWidth = (int)Math.Round(page.Width * scale),
//PrintSpace = new PageXmlPrintSpace()
//{
// Coords = new PageXmlCoords()
// {
// }
//}
};
var regions = new List<PageXmlDocument.PageXmlRegion>();
@@ -240,12 +194,6 @@ namespace UglyToad.PdfPig.Export
};
}
/// <summary>
///
/// </summary>
/// <param name="textBlock"></param>
/// <param name="height"></param>
/// <returns></returns>
private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, decimal height)
{
regionCount++;
@@ -258,33 +206,19 @@ namespace UglyToad.PdfPig.Export
};
}
/// <summary>
///
/// </summary>
/// <param name="textLine"></param>
/// <param name="height"></param>
/// <returns></returns>
private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, decimal height)
{
    // Advance the running line counter first: the new value is used in this line's Id.
    lineCount++;

    var pageXmlLine = new PageXmlDocument.PageXmlTextLine();
    pageXmlLine.Coords = ToCoords(textLine.BoundingBox, height);
    pageXmlLine.Production = PageXmlDocument.PageXmlProductionSimpleType.Printed;

    // Convert every word of the line, keeping the source order.
    pageXmlLine.Words = textLine.Words.Select(word => ToPageXmlWord(word, height)).ToArray();

    // The text of the whole line is stored as a single text equivalent.
    var lineText = new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text };
    pageXmlLine.TextEquivs = new[] { lineText };

    pageXmlLine.Id = "l" + lineCount;
    return pageXmlLine;
}
/// <summary>
///
/// </summary>
/// <param name="word"></param>
/// <param name="height"></param>
/// <returns></returns>
private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, decimal height)
{
wordCount++;
@@ -297,12 +231,6 @@ namespace UglyToad.PdfPig.Export
};
}
/// <summary>
///
/// </summary>
/// <param name="letter"></param>
/// <param name="height"></param>
/// <returns></returns>
private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, decimal height)
{
glyphCount++;
@@ -326,7 +254,7 @@ namespace UglyToad.PdfPig.Export
{
XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument));
using (var reader = System.Xml.XmlReader.Create(xmlPath))
using (var reader = XmlReader.Create(xmlPath))
{
return (PageXmlDocument)serializer.Deserialize(reader);
}
@@ -335,20 +263,18 @@ namespace UglyToad.PdfPig.Export
private string Serialize(PageXmlDocument pageXmlDocument)
{
    // Serializes the document to an indented XML string and returns it.
    // The XML declaration is written by the XmlWriter itself, so nothing is
    // prepended by hand and OmitXmlDeclaration is left at its default.
    //
    // The encoding is built with encoderShouldEmitUTF8Identifier = false.
    // With Encoding.UTF8 the writer puts a byte order mark at the start of the
    // stream, and decoding those bytes would leave U+FEFF as the first
    // character of the returned string, ahead of the XML declaration.
    var utf8WithoutBom = new System.Text.UTF8Encoding(false);

    XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument));
    var settings = new System.Xml.XmlWriterSettings()
    {
        Encoding = utf8WithoutBom,
        Indent = true,
        IndentChars = indentChar
    };

    using (var memoryStream = new System.IO.MemoryStream())
    using (var xmlWriter = System.Xml.XmlWriter.Create(memoryStream, settings))
    {
        serializer.Serialize(xmlWriter, pageXmlDocument);

        // Make sure everything the writer produced has reached the stream before reading it back.
        xmlWriter.Flush();

        // Decode with the same BOM-less encoding that was used for writing.
        return utf8WithoutBom.GetString(memoryStream.ToArray());
    }
}
}