fix tests for renaming and validating generated alto xml

This commit is contained in:
Eliot Jones
2019-10-15 13:59:09 +01:00
parent d68bd88824
commit f14c52a05a
2 changed files with 28 additions and 18 deletions

View File

@@ -5,6 +5,7 @@
using System.IO;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;
using DocumentLayoutAnalysis;
using Export.Alto;
@@ -136,17 +137,35 @@
/// <summary>
/// Exports page 4 of the test document as ALTO XML and checks that the result loads as an XML document.
/// </summary>
[Fact]
public void CanExportAltoXmlFormat()
{
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var wordExtractor = new NearestNeighbourWordExtractor();
        var pageSegmenter = new DocstrumBoundingBoxes();
        var altoExporter = new AltoXmlTextExporter(wordExtractor, pageSegmenter);

        var page = pdf.GetPage(4);
        var altoXml = altoExporter.Get(page, true);

        Assert.NotNull(altoXml);

        // Feed the exported text to the XML reader as UTF-8 bytes; XDocument.Load throws if it cannot be parsed.
        var utf8Bytes = Encoding.UTF8.GetBytes(altoXml);
        using (var byteStream = new MemoryStream(utf8Bytes))
        using (var reader = new XmlTextReader(byteStream))
        {
            var parsed = XDocument.Load(reader);
            Assert.NotNull(parsed);
        }
    }
}
/// <summary>
/// Exports page 16 of the test document as ALTO XML and checks that the result loads as an XML document.
/// </summary>
[Fact]
public void CanExportAltoXmlFormatPage16()
{
    // Page 16 contains an unprintable string and a single line of text which causes problems for Docstrum.
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var wordExtractor = new NearestNeighbourWordExtractor();
        var pageSegmenter = new DocstrumBoundingBoxes();
        var altoExporter = new AltoXmlTextExporter(wordExtractor, pageSegmenter);

        var page = pdf.GetPage(16);
        var altoXml = altoExporter.Get(page, true);

        Assert.NotNull(altoXml);

        // NOTE(review): an earlier TODO here reported the generated XML as invalid due to a BOM.
        // Reading it back from a UTF-8 byte stream is presumably what lets the reader cope - confirm.
        var utf8Bytes = Encoding.UTF8.GetBytes(altoXml);
        using (var byteStream = new MemoryStream(utf8Bytes))
        using (var reader = new XmlTextReader(byteStream))
        {
            var parsed = XDocument.Load(reader);
            Assert.NotNull(parsed);
        }
    }
}
@@ -181,16 +200,6 @@
return result;
}
/// <summary>
/// Writes <paramref name="s"/> into a new in-memory stream using <paramref name="encoding"/>
/// and returns that stream rewound to its start.
/// </summary>
/// <param name="s">The text to write.</param>
/// <param name="encoding">The encoding handed to the <see cref="StreamWriter"/>.</param>
/// <returns>A <see cref="MemoryStream"/> positioned at offset 0.</returns>
private static Stream GenerateStreamFromString(string s, Encoding encoding)
{
    var buffer = new MemoryStream();

    // leaveOpen: true keeps the MemoryStream usable once the writer is disposed;
    // disposing the writer flushes the buffered text into the stream.
    using (var textWriter = new StreamWriter(buffer, encoding, 1024, true))
    {
        textWriter.Write(s);
    }

    buffer.Seek(0, SeekOrigin.Begin);
    return buffer;
}
private const string WordsPage4 = @"Disclaimer
The designations employed end the presentation of the material in this information
product do not imply the expression of any opinion whatsoever on the part of the