diff --git a/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs b/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs index 9897d5ba..99e1727a 100644 --- a/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs +++ b/src/UglyToad.PdfPig/Export/AltoXmlTextExporter.cs @@ -56,7 +56,6 @@ namespace UglyToad.PdfPig.Export /// /// /// Draw s present in the page. - /// public string Get(PdfDocument document, bool includePaths = false) { AltoDocument alto = CreateAltoDocument("unknown"); @@ -76,7 +75,6 @@ namespace UglyToad.PdfPig.Export /// Get the Alto (XML) string of the page layout. Excludes s. /// /// - /// public string Get(Page page) { return Get(page, false); @@ -87,7 +85,6 @@ namespace UglyToad.PdfPig.Export /// /// /// Draw s present in the page. - /// public string Get(Page page, bool includePaths) { AltoDocument alto = CreateAltoDocument("unknown"); @@ -102,7 +99,6 @@ namespace UglyToad.PdfPig.Export /// Create an empty . /// /// - /// private AltoDocument CreateAltoDocument(string fileName) { return new AltoDocument() @@ -113,17 +109,9 @@ namespace UglyToad.PdfPig.Export }, Description = GetAltoDescription(fileName), SchemaVersion = "4", - //Styles = new AltoStyles() { }, - //Tags = new AltoTags() { } }; } - /// - /// - /// - /// - /// Draw s present in the page. - /// private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths) { pageCount = page.Number; @@ -159,7 +147,7 @@ namespace UglyToad.PdfPig.Export Illustrations = null, // TBD ProcessingRefs = null, // TBD StyleRefs = null, // TBD - Id = "P" + pageCount + "_PS" + pageSpaceCount.ToString("#00000") //P1_PS00001 + Id = "P" + pageCount + "_PS" + pageSpaceCount.ToString("#00000") }, Id = "P" + pageCount }; @@ -188,12 +176,6 @@ namespace UglyToad.PdfPig.Export return altoPage; } - /// - /// - /// - /// - /// - /// private AltoDocument.AltoGraphicalElement ToAltoGraphicalElement(PdfPath pdfPath, decimal height) { graphicalElementCount++; @@ -208,12 +190,10 @@ namespace UglyToad.PdfPig.Export Height = (float)Math.Round(rectangle.Value.Height * scale), Width = (float)Math.Round(rectangle.Value.Width * scale), Rotation = 0, - //Cs = false, StyleRefs = null, TagRefs = null, title = null, type = null, - //IdNext = "NA", // for reading order Id = "P" + pageCount + "_GE" + graphicalElementCount.ToString("#00000") }; } @@ -233,17 +213,10 @@ namespace UglyToad.PdfPig.Export Width = (float)Math.Round(rectangle.Width * scale), FileId = "", Rotation = 0, - //IdNext = "NA", // for reading order Id = "P" + pageCount + "_I" + illustrationCount.ToString("#00000") }; } - /// - /// - /// - /// - /// - /// private AltoDocument.AltoTextBlock ToAltoTextBlock(TextBlock textBlock, decimal height) { textBlockCount++; @@ -254,24 +227,16 @@ namespace UglyToad.PdfPig.Export HPos = (float)Math.Round(textBlock.BoundingBox.Left * scale), Height = (float)Math.Round(textBlock.BoundingBox.Height * scale), Width = (float)Math.Round(textBlock.BoundingBox.Width * scale), - Rotation = 0, // check textBlock.TextDirection + Rotation = 0, TextLines = textBlock.TextLines.Select(l => ToAltoTextLine(l, height)).ToArray(), - //Cs = false, StyleRefs = null, TagRefs = null, title = null, type = null, - //IdNext = "NA", // for reading order Id = "P" + pageCount + "_TB" + textBlockCount.ToString("#00000") }; } - /// - /// - /// - /// - /// - /// private AltoDocument.AltoTextBlockTextLine ToAltoTextLine(TextLine textLine, decimal height) { textLineCount++; @@ -283,23 +248,15 @@ namespace UglyToad.PdfPig.Export HPos = (float)Math.Round(textLine.BoundingBox.Left * scale), Height = (float)Math.Round(textLine.BoundingBox.Height * scale), Width = (float)Math.Round(textLine.BoundingBox.Width * scale), - BaseLine = float.NaN, // TBD - //Hyp = new AltoTextBlockTextLineHyp() { }, // TBD + BaseLine = float.NaN, Strings = strings, Lang = null, - //Sp = new AltoSP[0], // TBD StyleRefs = null, TagRefs = null, Id = "P" + pageCount + "_TL" + textLineCount.ToString("#00000") }; } - /// - /// - /// - /// - /// - /// private AltoDocument.AltoString ToAltoString(Word word, decimal height) { stringCount++; @@ -313,24 +270,15 @@ namespace UglyToad.PdfPig.Export Glyph = glyphs, Cc = string.Join("", glyphs.Select(g => 9f * (1f - g.Gc))), // from 0->1 to 9->0 Content = word.Text, - //Cs = false, Lang = null, - //Style = AltoFontStyles.Bold, StyleRefs = null, SubsContent = null, - //SubsType = AltoSubsType.Abbreviation, TagRefs = null, Wc = float.NaN, Id = "P" + pageCount + "_ST" + stringCount.ToString("#00000") }; } - /// - /// - /// - /// - /// - /// private AltoDocument.AltoGlyph ToAltoGlyph(Letter letter, decimal height) { glyphCount++; @@ -346,17 +294,12 @@ namespace UglyToad.PdfPig.Export }; } - /// - /// - /// - /// - /// private AltoDocument.AltoDescription GetAltoDescription(string fileName) { var processing = new AltoDocument.AltoDescriptionProcessing() { ProcessingAgency = null, - ProcessingCategory = AltoDocument.AltoProcessingCategory.Other, // TBD + ProcessingCategory = AltoDocument.AltoProcessingCategory.Other, ProcessingDateTime = DateTime.UtcNow.ToString(), ProcessingSoftware = new AltoDocument.AltoProcessingSoftware() { @@ -384,7 +327,7 @@ namespace UglyToad.PdfPig.Export return new AltoDocument.AltoDescription() { - MeasurementUnit = AltoDocument.AltoMeasurementUnit.Pixel, // need to check that + MeasurementUnit = AltoDocument.AltoMeasurementUnit.Pixel, Processings = new[] { processing }, SourceImageInformation = new AltoDocument.AltoSourceImageInformation() { @@ -400,18 +343,16 @@ namespace UglyToad.PdfPig.Export XmlSerializer serializer = new XmlSerializer(typeof(AltoDocument)); var settings = new XmlWriterSettings() { - //Encoding = new System.Text.UTF8Encoding(true), + Encoding = System.Text.Encoding.UTF8, Indent = true, IndentChars = indentChar, - OmitXmlDeclaration = true // hack to manually handle utf-8 }; - using (var stringWriter = new System.IO.StringWriter()) - using (var xmlWriter = XmlWriter.Create(stringWriter, settings)) + using (var memoryStream = new System.IO.MemoryStream()) + using (var xmlWriter = XmlWriter.Create(memoryStream, settings)) { - stringWriter.WriteLine(""); // hack to manually handle utf-8 serializer.Serialize(xmlWriter, altoDocument); - return stringWriter.ToString(); + return System.Text.Encoding.UTF8.GetString(memoryStream.ToArray()); } } diff --git a/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs b/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs index d94b27e1..a380a3da 100644 --- a/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs +++ b/src/UglyToad.PdfPig/Export/HOcrTextExporter.cs @@ -47,7 +47,7 @@ namespace UglyToad.PdfPig.Export } /// - /// Get the hORC (HTML) string of the page layout. + /// Get the hOCR (HTML) string of the page layout. /// /// The document. /// Draw s present in the page. @@ -70,17 +70,16 @@ namespace UglyToad.PdfPig.Export } /// - /// Get the hORC (HTML) string of the page layout. Excludes s. + /// Get the hOCR (HTML) string of the page layout. Excludes s. /// /// The page. - /// public string Get(Page page) { return Get(page, false); } /// - /// Get the hORC (HTML) string of the page layout. + /// Get the hOCR (HTML) string of the page layout. /// /// The page. /// The image name, if any. @@ -117,7 +116,6 @@ namespace UglyToad.PdfPig.Export /// Get indent string from level. /// /// The indent level. - /// private string GetIndent(int level) { string indent = ""; @@ -129,7 +127,7 @@ namespace UglyToad.PdfPig.Export } /// - /// Get the hORC string for the page. + /// Get the hOCR string for the page. /// http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page /// /// @@ -174,14 +172,13 @@ namespace UglyToad.PdfPig.Export } /// - /// Get the hORC string for the path. + /// Get the hOCR string for the path. /// http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_linedrawing /// /// /// /// /// The indent level. - /// private string GetCode(PdfPath path, decimal pageHeight, bool subPaths, int level) { if (path == null) return string.Empty; @@ -232,7 +229,7 @@ namespace UglyToad.PdfPig.Export } /// - /// Get the hORC string for the area. + /// Get the hOCR string for the area. /// http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_carea /// /// The text area. @@ -252,13 +249,12 @@ namespace UglyToad.PdfPig.Export } /// - /// Get the hORC string for the paragraph. + /// Get the hOCR string for the paragraph. /// See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_par /// /// The paragraph. /// /// The indent level. - /// private string GetCodeParagraph(TextBlock block, decimal pageHeight, int level) { paraCount++; @@ -275,7 +271,7 @@ namespace UglyToad.PdfPig.Export } /// - /// Get the hORC string for the text line. + /// Get the hOCR string for the text line. /// See http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_line /// /// @@ -303,7 +299,7 @@ namespace UglyToad.PdfPig.Export } /// - /// Get the hORC string for the word. + /// Get the hOCR string for the word. /// See http://kba.cloud/hocr-spec/1.2/#elementdef-ocrx_word /// /// @@ -334,7 +330,7 @@ namespace UglyToad.PdfPig.Export } /// - /// Get the hORC string for the bounding box. + /// Get the hOCR string for the bounding box. /// See http://kba.cloud/hocr-spec/1.2/#propdef-bbox /// /// diff --git a/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs b/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs index f1853965..e6594f0e 100644 --- a/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs +++ b/src/UglyToad.PdfPig/Export/PageXmlTextExporter.cs @@ -4,6 +4,7 @@ using System.Collections.Generic; using System.ComponentModel; using System.Diagnostics; using System.Linq; +using System.Xml; using System.Xml.Serialization; using UglyToad.PdfPig.Content; using UglyToad.PdfPig.DocumentLayoutAnalysis; @@ -60,7 +61,6 @@ namespace UglyToad.PdfPig.Export /// Get the PAGE-XML (XML) string of the pages layout. Excludes s. /// /// - /// public string Get(Page page) { return Get(page, false); @@ -90,12 +90,6 @@ namespace UglyToad.PdfPig.Export return Serialize(pageXmlDocument); } - /// - /// - /// - /// - /// - /// private string PointToString(PdfPoint point, decimal height) { decimal x = Math.Round(point.X * scale); @@ -103,39 +97,20 @@ namespace UglyToad.PdfPig.Export return (x > 0 ? x : 0).ToString("0") + "," + (y > 0 ? y : 0).ToString("0"); } - /// - /// - /// - /// - /// - /// private string ToPoints(IEnumerable points, decimal height) { return string.Join(" ", points.Select(p => PointToString(p, height))); } - /// - /// - /// - /// - /// - /// private string ToPoints(PdfRectangle pdfRectangle, decimal height) { return ToPoints(new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight }, height); } - /// - /// - /// - /// - /// - /// private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, decimal height) { return new PageXmlDocument.PageXmlCoords() { - //Conf = 1, Points = ToPoints(pdfRectangle, height) }; } @@ -152,38 +127,17 @@ namespace UglyToad.PdfPig.Export int blue = 65536 * (int)Math.Round(255f * (float)rgb.b); int sum = red + green + blue; - // as per below, red and blue order might be inverted... - //var colorWin = System.Drawing.Color.FromArgb(sum); - + // as per below, red and blue order might be inverted... var colorWin = System.Drawing.Color.FromArgb(sum); return sum.ToString(); } - /// - /// - /// - /// - /// Draw s present in the page. private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths) { var pageXmlPage = new PageXmlDocument.PageXmlPage() { - //Border = new PageXmlBorder() - //{ - // Coords = new PageXmlCoords() - // { - // Points = page. - // } - //}, ImageFilename = "unknown", ImageHeight = (int)Math.Round(page.Height * scale), ImageWidth = (int)Math.Round(page.Width * scale), - //PrintSpace = new PageXmlPrintSpace() - //{ - // Coords = new PageXmlCoords() - // { - - // } - //} }; var regions = new List(); @@ -240,12 +194,6 @@ namespace UglyToad.PdfPig.Export }; } - /// - /// - /// - /// - /// - /// private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, decimal height) { regionCount++; @@ -258,33 +206,19 @@ namespace UglyToad.PdfPig.Export }; } - /// - /// - /// - /// - /// - /// private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, decimal height) { lineCount++; return new PageXmlDocument.PageXmlTextLine() { Coords = ToCoords(textLine.BoundingBox, height), - //Baseline = new PageXmlBaseline() { }, Production = PageXmlDocument.PageXmlProductionSimpleType.Printed, - //ReadingDirection = PageXmlReadingDirectionSimpleType.LeftToRight, Words = textLine.Words.Select(w => ToPageXmlWord(w, height)).ToArray(), TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } }, Id = "l" + lineCount }; } - /// - /// - /// - /// - /// - /// private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, decimal height) { wordCount++; @@ -297,12 +231,6 @@ namespace UglyToad.PdfPig.Export }; } - /// - /// - /// - /// - /// - /// private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, decimal height) { glyphCount++; @@ -326,7 +254,7 @@ namespace UglyToad.PdfPig.Export { XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument)); - using (var reader = System.Xml.XmlReader.Create(xmlPath)) + using (var reader = XmlReader.Create(xmlPath)) { return (PageXmlDocument)serializer.Deserialize(reader); } @@ -335,20 +263,18 @@ namespace UglyToad.PdfPig.Export private string Serialize(PageXmlDocument pageXmlDocument) { XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument)); - var settings = new System.Xml.XmlWriterSettings() + var settings = new XmlWriterSettings() { - //Encoding = new System.Text.UTF8Encoding(true), + Encoding = System.Text.Encoding.UTF8, Indent = true, IndentChars = indentChar, - OmitXmlDeclaration = true // hack to manually handle utf-8 }; - using (var stringWriter = new System.IO.StringWriter()) - using (var xmlWriter = System.Xml.XmlWriter.Create(stringWriter, settings)) + using (var memoryStream = new System.IO.MemoryStream()) + using (var xmlWriter = XmlWriter.Create(memoryStream, settings)) { - stringWriter.WriteLine(""); // hack to manually handle utf-8 serializer.Serialize(xmlWriter, pageXmlDocument); - return stringWriter.ToString(); + return System.Text.Encoding.UTF8.GetString(memoryStream.ToArray()); } } }