mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 19:54:52 +08:00
Fix tests after the rename and validate the generated ALTO XML
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Xml;
|
||||
using System.Xml.Linq;
|
||||
using DocumentLayoutAnalysis;
|
||||
using Export.Alto;
|
||||
@@ -136,17 +137,35 @@
|
||||
/// <summary>
/// Exports page 4 of the test document with <c>AltoXmlTextExporter</c> and checks that
/// the returned string is non-null and loads into an <c>XDocument</c>.
/// </summary>
[Fact]
public void CanExportAltoXmlFormat()
{
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        // Build the exporter from its two collaborators (same construction order as a single nested call).
        var wordExtractor = new NearestNeighbourWordExtractor();
        var pageSegmenter = new DocstrumBoundingBoxes();
        var altoExporter = new AltoXmlTextExporter(wordExtractor, pageSegmenter);

        var page = pdf.GetPage(4);
        var altoXml = altoExporter.Get(page, true);
        Assert.NotNull(altoXml);

        // Round-trip the exported text through an XML reader; XDocument.Load throws if it is not well-formed.
        var utf8Bytes = Encoding.UTF8.GetBytes(altoXml);
        using (var buffer = new MemoryStream(utf8Bytes))
        using (var reader = new XmlTextReader(buffer))
        {
            var parsed = XDocument.Load(reader);
            Assert.NotNull(parsed);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Exports page 16 of the test document with <c>AltoXmlTextExporter</c> and checks that
/// the returned string is non-null and loads into an <c>XDocument</c>.
/// </summary>
[Fact]
public void CanExportAltoXmlFormatPage16()
{
    // Page 16 contains an unprintable string and a single line of text which causes problems for Docstrum.
    using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var exporter = new AltoXmlTextExporter(new NearestNeighbourWordExtractor(), new DocstrumBoundingBoxes());
        var xml = exporter.Get(document.GetPage(16), true);
        Assert.NotNull(xml);

        // Validate the export by parsing it; XDocument.Load throws if the XML is not well-formed.
        using (var xmlStream = new MemoryStream(Encoding.UTF8.GetBytes(xml)))
        using (var xmlReader = new XmlTextReader(xmlStream))
        {
            var xDocument = XDocument.Load(xmlReader);
            Assert.NotNull(xDocument);
        }
    }
}
|
||||
|
||||
@@ -181,16 +200,6 @@
|
||||
return result;
|
||||
}
|
||||
|
||||
/// <summary>
/// Writes <paramref name="s"/> into a new <see cref="MemoryStream"/> using
/// <paramref name="encoding"/> and rewinds the stream to its start.
/// </summary>
/// <param name="s">The text to write.</param>
/// <param name="encoding">The encoding passed to the <see cref="StreamWriter"/>.</param>
/// <returns>A readable stream positioned at 0; the caller is responsible for disposing it.</returns>
private static Stream GenerateStreamFromString(string s, Encoding encoding)
{
    var stream = new MemoryStream();

    // Dispose the writer deterministically (which also flushes it), but keep the
    // underlying MemoryStream open because it is handed back to the caller.
    using (var writer = new StreamWriter(stream, encoding, 1024, leaveOpen: true))
    {
        writer.Write(s);
    }

    stream.Position = 0;
    return stream;
}
|
||||
|
||||
private const string WordsPage4 = @"Disclaimer
|
||||
The designations employed end the presentation of the material in this information
|
||||
product do not imply the expression of any opinion whatsoever on the part of the
|
||||
|
@@ -77,7 +77,7 @@
|
||||
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
|
||||
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBB",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBoundingBoxes",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.DefaultPageSegmenter",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.IPageSegmenter",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
|
||||
@@ -88,7 +88,8 @@
|
||||
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
|
||||
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
|
||||
"UglyToad.PdfPig.Export.ITextExporter",
|
||||
"UglyToad.PdfPig.Export.AltoXmlTextExporter",
|
||||
"UglyToad.PdfPig.Export.Alto.AltoDocument",
|
||||
"UglyToad.PdfPig.Export.Alto.AltoXmlTextExporter",
|
||||
"UglyToad.PdfPig.Export.HOcrTextExporter",
|
||||
"UglyToad.PdfPig.Export.PageXmlTextExporter",
|
||||
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
|
||||
|
Reference in New Issue
Block a user