PageXmlTextExporter: ensure no coords on the page boundaries #399

This commit is contained in:
davmarksman 2021-12-19 15:21:15 +00:00
parent 5524473e0e
commit b3ace45e96
2 changed files with 138 additions and 28 deletions

View File

@ -104,28 +104,44 @@
return Serialize(pageXmlDocument);
}
/// <summary>
/// Converts a PDF point to the "x,y" text used inside a PAGE XML <c>points</c> attribute.
/// The y axis is flipped (<paramref name="pageHeight"/> minus the point's Y), both coordinates
/// are scaled and rounded to whole numbers, and the result is then pulled at least one unit
/// inside the page so that no coordinate sits on a page boundary.
/// </summary>
/// <param name="point">The point to convert, in unscaled page units.</param>
/// <param name="pageWidth">The unscaled width of the page where the pdf point is located on.</param>
/// <param name="pageHeight">The unscaled height of the page where the pdf point is located on.</param>
/// <param name="scaleToApply">Factor applied to both coordinates and to the page bounds. Defaults to 1.0 (no scaling).</param>
/// <returns>The clamped coordinates formatted as "x,y" without decimals.</returns>
public static string PointToString(PdfPoint point, double pageWidth, double pageHeight, double scaleToApply = 1.0)
{
    double x = Math.Round(point.X * scaleToApply);
    double y = Math.Round((pageHeight - point.Y) * scaleToApply);

    // x and y are expressed in scaled units, so the far edges have to be scaled as well;
    // comparing against the unscaled page size would clamp to the wrong bound whenever
    // scaleToApply differs from 1.
    double maxX = (pageWidth * scaleToApply) - 1;
    double maxY = (pageHeight * scaleToApply) - 1;

    // move away from borders (lower bound first, then upper bound)
    x = x > 1 ? x : 1;
    y = y > 1 ? y : 1;
    x = x < maxX ? x : maxX;
    y = y < maxY ? y : maxY;

    // Machine-readable output: format independently of the current thread culture.
    return x.ToString("0", System.Globalization.CultureInfo.InvariantCulture)
        + ","
        + y.ToString("0", System.Globalization.CultureInfo.InvariantCulture);
}
// Formats every point with PointToString and joins the results with a single space.
// Diff view without +/- markers: the lines that pass only `height` are the pre-commit
// version; the lines that pass `pageWidth, pageHeight` are their post-commit replacements.
private string ToPoints(IEnumerable<PdfPoint> points, double height)
private string ToPoints(IEnumerable<PdfPoint> points, double pageWidth, double pageHeight)
{
return string.Join(" ", points.Select(p => PointToString(p, height)));
// PointToString is static after this change, so this.scale is handed over explicitly.
return string.Join(" ", points.Select(p => PointToString(p, pageWidth, pageHeight, this.scale)));
}
// Formats the four corners of a rectangle, in the order bottom-left, top-left, top-right,
// bottom-right, by passing them to the ToPoints overload that takes a point sequence.
// Diff view without +/- markers: the lines that pass only `height` are the pre-commit
// version; the lines that pass `pageWidth, pageHeight` are their post-commit replacements.
private string ToPoints(PdfRectangle pdfRectangle, double height)
private string ToPoints(PdfRectangle pdfRectangle, double pageWidth, double pageHeight)
{
return ToPoints(new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight }, height);
return ToPoints(new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight }, pageWidth, pageHeight);
}
// Wraps the rectangle's formatted corner points in a new PageXmlCoords (its Points property).
// Diff view without +/- markers: the lines that pass only `height` are the pre-commit
// version; the lines that pass `pageWidth, pageHeight` are their post-commit replacements.
private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, double height)
private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, double pageWidth, double pageHeight)
{
return new PageXmlDocument.PageXmlCoords()
{
Points = ToPoints(pdfRectangle, height)
Points = ToPoints(pdfRectangle, pageWidth, pageHeight)
};
}
@ -166,7 +182,7 @@
blocks = readingOrderDetector.Get(blocks).ToList();
}
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Height)));
regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height)));
if (orderedRegions.Any())
{
@ -184,12 +200,12 @@
var images = page.GetImages().ToList();
if (images.Count > 0)
{
regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Height)));
regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Width, page.Height)));
}
if (includePaths)
{
var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Height));
var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Width, page.Height));
if (graphicalElements.Where(g => g != null).Count() > 0)
{
regions.AddRange(graphicalElements.Where(g => g != null));
@ -200,7 +216,7 @@
return pageXmlPage;
}
// Builds a line-drawing region from a path's bounding rectangle. When the path has a
// bounding rectangle, regionCount is incremented and the region gets the id "r" + regionCount;
// otherwise null is returned.
// Diff view without +/- markers: the lines that pass only `height` are the pre-commit
// version; the lines that pass `pageWidth, pageHeight` are their post-commit replacements.
private PageXmlDocument.PageXmlLineDrawingRegion ToPageXmlLineDrawingRegion(PdfPath pdfPath, double height)
private PageXmlDocument.PageXmlLineDrawingRegion ToPageXmlLineDrawingRegion(PdfPath pdfPath, double pageWidth, double pageHeight)
{
var bbox = pdfPath.GetBoundingRectangle();
if (bbox.HasValue)
// NOTE(review): the hunk boundary below hides part of the body (presumably the opening brace of the if) - confirm against the full file.
@ -208,25 +224,25 @@
regionCount++;
return new PageXmlDocument.PageXmlLineDrawingRegion()
{
Coords = ToCoords(bbox.Value, height),
Coords = ToCoords(bbox.Value, pageWidth, pageHeight),
Id = "r" + regionCount
};
}
return null;
}
// Builds an image region whose coordinates come from the image's Bounds. regionCount is
// incremented first and the region gets the id "r" + regionCount.
// Diff view without +/- markers: the lines that pass only `height` are the pre-commit
// version; the lines that pass `pageWidth, pageHeight` are their post-commit replacements.
private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, double height)
private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, double pageWidth, double pageHeight)
{
regionCount++;
var bbox = pdfImage.Bounds;
return new PageXmlDocument.PageXmlImageRegion()
{
Coords = ToCoords(bbox, height),
Coords = ToCoords(bbox, pageWidth, pageHeight),
Id = "r" + regionCount
};
}
// Builds a paragraph text region for a text block: coordinates from the block's bounding box,
// one text line entry per TextLine, the block's text as the Unicode equivalent, and the id
// "r" + regionCount (regionCount is incremented first).
// Diff view without +/- markers: the lines that pass only `height` are the pre-commit
// version; the lines that pass `pageWidth, pageHeight` are their post-commit replacements.
private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double height)
private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double pageWidth, double pageHeight)
{
regionCount++;
string regionId = "r" + regionCount;
// NOTE(review): the hunk boundary below hides several lines of this method; what they do is not visible here.
@ -242,45 +258,45 @@
return new PageXmlDocument.PageXmlTextRegion()
{
Coords = ToCoords(textBlock.BoundingBox, height),
Coords = ToCoords(textBlock.BoundingBox, pageWidth, pageHeight),
Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph,
TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, height)).ToArray(),
TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, pageWidth, pageHeight)).ToArray(),
TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textBlock.Text } },
Id = regionId
};
}
// Builds a printed text line: coordinates from the line's bounding box, one word entry per
// Word, the line's text as the Unicode equivalent, and the id "l" + lineCount (lineCount is
// incremented first).
// Diff view without +/- markers: the lines that pass only `height` are the pre-commit
// version; the lines that pass `pageWidth, pageHeight` are their post-commit replacements.
private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, double height)
private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, double pageWidth, double pageHeight)
{
lineCount++;
return new PageXmlDocument.PageXmlTextLine()
{
Coords = ToCoords(textLine.BoundingBox, height),
Coords = ToCoords(textLine.BoundingBox, pageWidth, pageHeight),
Production = PageXmlDocument.PageXmlProductionSimpleType.Printed,
Words = textLine.Words.Select(w => ToPageXmlWord(w, height)).ToArray(),
Words = textLine.Words.Select(w => ToPageXmlWord(w, pageWidth, pageHeight)).ToArray(),
TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } },
Id = "l" + lineCount
};
}
// Builds a word entry: coordinates from the word's bounding box, one glyph entry per Letter,
// the word's text as the Unicode equivalent, and the id "w" + wordCount (wordCount is
// incremented first).
// Diff view without +/- markers: the two lines that pass only `height` are the pre-commit
// version; the two lines that pass `pageWidth, pageHeight` are their post-commit replacements.
private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, double height)
private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, double pageWidth, double pageHeight)
{
wordCount++;
return new PageXmlDocument.PageXmlWord()
{
Coords = ToCoords(word.BoundingBox, height),
Glyphs = word.Letters.Select(l => ToPageXmlGlyph(l, height)).ToArray(),
Coords = ToCoords(word.BoundingBox, pageWidth, pageHeight),
Glyphs = word.Letters.Select(l => ToPageXmlGlyph(l, pageWidth, pageHeight)).ToArray(),
TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = word.Text } },
Id = "w" + wordCount
};
}
private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, double height)
private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, double pageWidth, double pageHeight)
{
glyphCount++;
return new PageXmlDocument.PageXmlGlyph()
{
Coords = ToCoords(letter.GlyphRectangle, height),
Coords = ToCoords(letter.GlyphRectangle, pageWidth, pageHeight),
Ligature = false,
Production = PageXmlDocument.PageXmlProductionSimpleType.Printed,
TextStyle = new PageXmlDocument.PageXmlTextStyle()

View File

@ -0,0 +1,94 @@
namespace UglyToad.PdfPig.Tests.Integration
{
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Text;
    using System.Text.RegularExpressions;
    using DocumentLayoutAnalysis.Export;
    using DocumentLayoutAnalysis.PageSegmenter;
    using DocumentLayoutAnalysis.ReadingOrderDetector;
    using PdfPig.Core;
    using PdfPig.Util;
    using Xunit;

    /// <summary>
    /// Tests for <see cref="PageXmlTextExporter"/>: the XML produced for the first page of a
    /// sample document, and the point formatting done by <c>PointToString</c>.
    /// </summary>
    public class PageXmlTextExporterTests
    {
        /// <summary>
        /// Path of the sample document used by the export tests.
        /// </summary>
        private static string GetFilename()
        {
            return IntegrationHelpers.GetDocumentPath("2006_Swedish_Touring_Car_Championship.pdf");
        }

        /// <summary>
        /// Exports page 1 of the sample document to PAGE XML.
        /// </summary>
        /// <param name="pageXmlTextExporter">
        /// Exporter to use; when null, one is created from the default word extractor,
        /// recursive XY cut segmenter and unsupervised reading order detector.
        /// </param>
        /// <returns>The XML text returned by the exporter.</returns>
        private static string GetXml(PageXmlTextExporter pageXmlTextExporter = null)
        {
            pageXmlTextExporter = pageXmlTextExporter ?? new PageXmlTextExporter(
                DefaultWordExtractor.Instance,
                RecursiveXYCut.Instance,
                UnsupervisedReadingOrderDetector.Instance);

            using (var document = PdfDocument.Open(GetFilename()))
            {
                var page = document.GetPage(1);
                return pageXmlTextExporter.Get(page);
            }
        }

        [Fact]
        public void WhenReadingOrder_ContainsReadingOrderXmlElements()
        {
            var pageXmlTextExporter = new PageXmlTextExporter(
                DefaultWordExtractor.Instance,
                RecursiveXYCut.Instance,
                UnsupervisedReadingOrderDetector.Instance);

            var xml = GetXml(pageXmlTextExporter);

            Assert.Contains("<ReadingOrder>", xml);
            Assert.Contains("</OrderedGroup>", xml);
        }

        [Fact]
        public void PageHeightAndWidthArePresent()
        {
            var xml = GetXml();

            Assert.Contains(@"<Page imageFilename=""unknown"" imageWidth=""595"" imageHeight=""842"">", xml);
        }

        [Fact]
        public void ContainsExpectedNumberOfTextRegions()
        {
            var xml = GetXml();

            var count = Regex.Matches(xml, "</TextRegion>").Count;

            Assert.Equal(22, count);
        }

        [Fact]
        public void ContainsExpectedText()
        {
            var xml = GetXml();

            Assert.Contains(@"2006 Swedish Touring Car Championship", xml);
            // the coords for that text
            Assert.Contains(@"<Coords points=""35,77 35,62 397,62 397,77"" />", xml);
        }

        [Fact]
        public void NoPointsAreOnThePageBoundary()
        {
            const int pageWidth = 100;
            const int pageHeight = 200;

            // PointToString flips the y axis: a PdfPoint with Y = 0 comes out with the largest
            // exported y value, and a PdfPoint with Y = pageHeight comes out with the smallest.
            var bottomLeftPagePoint = new PdfPoint(0, 0);
            var topLeftPagePoint = new PdfPoint(0, pageHeight);
            var topRightPagePoint = new PdfPoint(pageWidth, pageHeight);
            var bottomRightPagePoint = new PdfPoint(pageWidth, 0);
            var normalPoint = new PdfPoint(60, 60);

            // Every corner is pulled one unit inside the page.
            Assert.Equal("1,199", PageXmlTextExporter.PointToString(bottomLeftPagePoint, pageWidth, pageHeight));
            Assert.Equal("1,1", PageXmlTextExporter.PointToString(topLeftPagePoint, pageWidth, pageHeight));
            Assert.Equal("99,1", PageXmlTextExporter.PointToString(topRightPagePoint, pageWidth, pageHeight));
            Assert.Equal("99,199", PageXmlTextExporter.PointToString(bottomRightPagePoint, pageWidth, pageHeight));

            // A point well inside the page is only flipped, not moved.
            Assert.Equal("60,140", PageXmlTextExporter.PointToString(normalPoint, pageWidth, pageHeight));
        }

        [Fact]
        public void ScaleIsAppliedToInteriorPoints()
        {
            var interiorPoint = new PdfPoint(10, 150);

            // x = 10 * 2, y = (200 - 150) * 2
            Assert.Equal("20,100", PageXmlTextExporter.PointToString(interiorPoint, 100, 200, 2.0));
        }
    }
}