Format and tidy up the autogenerated ALTO export code. Tidy up Docstrum.

This commit is contained in:
Eliot Jones
2019-10-14 18:30:18 +01:00
parent e2c9db8d50
commit d68bd88824
53 changed files with 3114 additions and 5378 deletions

View File

@@ -4,6 +4,10 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Xml.Linq;
using DocumentLayoutAnalysis;
using Export.Alto;
using Xunit;
public class PigProductionHandbookTests
@@ -32,7 +36,7 @@
var page = document.GetPage(1);
// Pinkish.
var (r, g , b) = page.Letters[0].Color.ToRGBValues();
var (r, g, b) = page.Letters[0].Color.ToRGBValues();
Assert.Equal(1, r);
Assert.Equal(0.914m, g);
@@ -98,7 +102,7 @@
[Fact]
public void Page4HasCorrectWords()
{
var expected = WordsPage4.Split(new[] {"\r", "\r\n", "\n", " "}, StringSplitOptions.RemoveEmptyEntries);
var expected = WordsPage4.Split(new[] { "\r", "\r\n", "\n", " " }, StringSplitOptions.RemoveEmptyEntries);
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var page = document.GetPage(4);
@@ -129,6 +133,23 @@
}
}
/// <summary>
/// Smoke test: running the ALTO XML exporter over page 16 of the test document
/// produces a non-null result.
/// </summary>
[Fact]
public void CanExportAltoXmlFormat()
{
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var wordExtractor = new NearestNeighbourWordExtractor();
        var boundingBoxes = new DocstrumBoundingBoxes();
        var altoExporter = new AltoXmlTextExporter(wordExtractor, boundingBoxes);

        var targetPage = pdf.GetPage(16);
        var altoXml = altoExporter.Get(targetPage, true);

        Assert.NotNull(altoXml);

        // TODO: the generated XML is invalid because of a BOM, so the round-trip
        // parse below stays disabled until that is resolved.
        //using (var xmlStream = GenerateStreamFromString(altoXml, Encoding.UTF8))
        //{
        //    var parsed = XDocument.Load(xmlStream);
        //    Assert.NotNull(parsed);
        //}
    }
}
[Fact]
public void LettersHaveCorrectPosition()
{
@@ -160,6 +181,16 @@
return result;
}
/// <summary>
/// Writes <paramref name="s"/> into a new in-memory stream using
/// <paramref name="encoding"/> and returns the stream rewound to its start.
/// </summary>
/// <param name="s">The text to write into the stream.</param>
/// <param name="encoding">The encoding the writer uses to turn the text into bytes.</param>
/// <returns>A <see cref="MemoryStream"/> positioned at 0; the caller owns and must dispose it.</returns>
private static Stream GenerateStreamFromString(string s, Encoding encoding)
{
    var stream = new MemoryStream();

    // leaveOpen: true lets the writer be disposed (which flushes it) without
    // closing the underlying stream that is handed back to the caller.
    // 1024 is passed explicitly only because this overload requires a buffer size.
    using (var writer = new StreamWriter(stream, encoding, 1024, leaveOpen: true))
    {
        writer.Write(s);
        writer.Flush();
    }

    // Rewind only after the writer is finished so nothing moves the position again.
    stream.Position = 0;
    return stream;
}
private const string WordsPage4 = @"Disclaimer
The designations employed end the presentation of the material in this information
product do not imply the expression of any opinion whatsoever on the part of the