From b3ace45e965b56b8f2c73013fa9e9573b51f3206 Mon Sep 17 00:00:00 2001
From: davmarksman
Date: Sun, 19 Dec 2021 15:21:15 +0000
Subject: [PATCH] PageXmlTextExporter: ensure no coords on the page boundaries #399

---
 .../Export/PageXmlTextExporter.cs             | 72 ++++++++------
 .../Integration/PageXmlTextExporterTests.cs   | 96 +++++++++++++++++++
 2 files changed, 140 insertions(+), 28 deletions(-)
 create mode 100644 src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs

diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs
index 3b71c2e5..956d6f18 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs
@@ -104,28 +104,44 @@
             return Serialize(pageXmlDocument);
         }
 
-        private string PointToString(PdfPoint point, double height)
+        /// <summary>
+        /// Converts a pdf point to an "x,y" string: scales it, flips the y-axis and keeps it off the page boundaries.
+        /// </summary>
+        /// <param name="point">The pdf point to convert.</param>
+        /// <param name="pageWidth">The (unscaled) width of the page where the pdf point is located on</param>
+        /// <param name="pageHeight">The (unscaled) height of the page where the pdf point is located on</param>
+        /// <param name="scaleToApply">The factor applied to the coordinates and to the far page bounds.</param>
+        /// <returns>The rounded "x,y" pair, each at least 1 and at most the scaled page size minus 1.</returns>
+        public static string PointToString(PdfPoint point, double pageWidth, double pageHeight, double scaleToApply = 1.0)
         {
-            double x = Math.Round(point.X * scale);
-            double y = Math.Round((height - point.Y) * scale);
-            return (x > 0 ? x : 0).ToString("0") + "," + (y > 0 ? y : 0).ToString("0");
+            double x = Math.Round(point.X * scaleToApply);
+            double y = Math.Round((pageHeight - point.Y) * scaleToApply);
+
+            // move away from borders; the far bounds are scaled as well so they share units with x and y
+            x = x > 1 ? x : 1;
+            y = y > 1 ? y : 1;
+
+            x = x < (pageWidth * scaleToApply - 1) ? x : pageWidth * scaleToApply - 1;
+            y = y < (pageHeight * scaleToApply - 1) ? y : pageHeight * scaleToApply - 1;
+
+            return x.ToString("0", System.Globalization.CultureInfo.InvariantCulture) + "," + y.ToString("0", System.Globalization.CultureInfo.InvariantCulture);
         }
 
-        private string ToPoints(IEnumerable<PdfPoint> points, double height)
+        private string ToPoints(IEnumerable<PdfPoint> points, double pageWidth, double pageHeight)
         {
-            return string.Join(" ", points.Select(p => PointToString(p, height)));
+            return string.Join(" ", points.Select(p => PointToString(p, pageWidth, pageHeight, this.scale)));
         }
 
-        private string ToPoints(PdfRectangle pdfRectangle, double height)
+        private string ToPoints(PdfRectangle pdfRectangle, double pageWidth, double pageHeight)
         {
-            return ToPoints(new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight }, height);
+            return ToPoints(new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight }, pageWidth, pageHeight);
         }
 
-        private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, double height)
+        private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, double pageWidth, double pageHeight)
         {
             return new PageXmlDocument.PageXmlCoords()
             {
-                Points = ToPoints(pdfRectangle, height)
+                Points = ToPoints(pdfRectangle, pageWidth, pageHeight)
             };
         }
 
@@ -166,7 +182,7 @@
                 blocks = readingOrderDetector.Get(blocks).ToList();
             }
 
-            regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Height)));
+            regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height)));
 
             if (orderedRegions.Any())
             {
@@ -184,12 +200,12 @@
             var images = page.GetImages().ToList();
             if (images.Count > 0)
             {
-                regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Height)));
+                regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Width, page.Height)));
             }
 
             if (includePaths)
             {
-                var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Height));
+                var graphicalElements = page.ExperimentalAccess.Paths.Select(p => ToPageXmlLineDrawingRegion(p, page.Width, page.Height)).ToList(); // materialise once: each conversion increments regionCount
                 if (graphicalElements.Where(g => g != null).Count() > 0)
                 {
                     regions.AddRange(graphicalElements.Where(g => g != null));
@@ -200,7 +216,7 @@
             return pageXmlPage;
         }
 
-        private PageXmlDocument.PageXmlLineDrawingRegion ToPageXmlLineDrawingRegion(PdfPath pdfPath, double height)
+        private PageXmlDocument.PageXmlLineDrawingRegion ToPageXmlLineDrawingRegion(PdfPath pdfPath, double pageWidth, double pageHeight)
         {
             var bbox = pdfPath.GetBoundingRectangle();
             if (bbox.HasValue)
@@ -208,25 +224,25 @@
                 regionCount++;
                 return new PageXmlDocument.PageXmlLineDrawingRegion()
                 {
-                    Coords = ToCoords(bbox.Value, height),
+                    Coords = ToCoords(bbox.Value, pageWidth, pageHeight),
                     Id = "r" + regionCount
                 };
             }
             return null;
         }
 
-        private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, double height)
+        private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, double pageWidth, double pageHeight)
         {
             regionCount++;
             var bbox = pdfImage.Bounds;
             return new PageXmlDocument.PageXmlImageRegion()
             {
-                Coords = ToCoords(bbox, height),
+                Coords = ToCoords(bbox, pageWidth, pageHeight),
                 Id = "r" + regionCount
             };
         }
 
-        private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double height)
+        private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double pageWidth, double pageHeight)
         {
             regionCount++;
             string regionId = "r" + regionCount;
@@ -242,45 +258,45 @@
 
             return new PageXmlDocument.PageXmlTextRegion()
             {
-                Coords = ToCoords(textBlock.BoundingBox, height),
+                Coords = ToCoords(textBlock.BoundingBox, pageWidth, pageHeight),
                 Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph,
-                TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, height)).ToArray(),
+                TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, pageWidth, pageHeight)).ToArray(),
                 TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textBlock.Text } },
                 Id = regionId
             };
         }
 
-        private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, double height)
+        private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, double pageWidth, double pageHeight)
         {
             lineCount++;
             return new PageXmlDocument.PageXmlTextLine()
             {
-                Coords = ToCoords(textLine.BoundingBox, height),
+                Coords = ToCoords(textLine.BoundingBox, pageWidth, pageHeight),
                 Production = PageXmlDocument.PageXmlProductionSimpleType.Printed,
-                Words = textLine.Words.Select(w => ToPageXmlWord(w, height)).ToArray(),
+                Words = textLine.Words.Select(w => ToPageXmlWord(w, pageWidth, pageHeight)).ToArray(),
                 TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } },
                 Id = "l" + lineCount
             };
         }
 
-        private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, double height)
+        private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, double pageWidth, double pageHeight)
         {
             wordCount++;
             return new PageXmlDocument.PageXmlWord()
             {
-                Coords = ToCoords(word.BoundingBox, height),
-                Glyphs = word.Letters.Select(l => ToPageXmlGlyph(l, height)).ToArray(),
+                Coords = ToCoords(word.BoundingBox, pageWidth, pageHeight),
+                Glyphs = word.Letters.Select(l => ToPageXmlGlyph(l, pageWidth, pageHeight)).ToArray(),
                 TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = word.Text } },
                 Id = "w" + wordCount
             };
         }
 
-        private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, double height)
+        private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, double pageWidth, double pageHeight)
         {
             glyphCount++;
             return new PageXmlDocument.PageXmlGlyph()
             {
-                Coords = ToCoords(letter.GlyphRectangle, height),
+                Coords = ToCoords(letter.GlyphRectangle, pageWidth, pageHeight),
                 Ligature = false,
                 Production = PageXmlDocument.PageXmlProductionSimpleType.Printed,
                 TextStyle = new PageXmlDocument.PageXmlTextStyle()
diff --git a/src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs b/src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs
new file mode 100644
index 00000000..519a932c
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs
@@ -0,0 +1,96 @@
+namespace UglyToad.PdfPig.Tests.Integration
+{
+    using System;
+    using System.Collections.Generic;
+    using System.IO;
+    using System.Text;
+    using System.Text.RegularExpressions;
+    using DocumentLayoutAnalysis.Export;
+    using DocumentLayoutAnalysis.PageSegmenter;
+    using DocumentLayoutAnalysis.ReadingOrderDetector;
+    using PdfPig.Core;
+    using PdfPig.Util;
+    using Xunit;
+
+    public class PageXmlTextExporterTests
+    {
+        private static string GetFilename()
+        {
+            return IntegrationHelpers.GetDocumentPath("2006_Swedish_Touring_Car_Championship.pdf");
+        }
+
+        private static string GetXml(PageXmlTextExporter pageXmlTextExporter = null)
+        {
+            pageXmlTextExporter = pageXmlTextExporter ?? new PageXmlTextExporter(
+                DefaultWordExtractor.Instance,
+                RecursiveXYCut.Instance,
+                UnsupervisedReadingOrderDetector.Instance);
+
+            string xml;
+            using (var document = PdfDocument.Open(GetFilename()))
+            {
+                var page = document.GetPage(1);
+                xml = pageXmlTextExporter.Get(page);
+            }
+
+            return xml;
+        }
+
+        [Fact]
+        public void WhenReadingOrder_ContainsReadingOrderXmlElements()
+        {
+            var pageXmlTextExporter = new PageXmlTextExporter(
+                DefaultWordExtractor.Instance,
+                RecursiveXYCut.Instance,
+                UnsupervisedReadingOrderDetector.Instance);
+            var xml = GetXml(pageXmlTextExporter);
+
+            Assert.Contains("", xml);
+            Assert.Contains("", xml);
+        }
+
+        [Fact]
+        public void PageHeightAndWidthArePresent()
+        {
+            var xml = GetXml();
+            Assert.Contains(@"", xml);
+        }
+
+        [Fact]
+        public void ContainsExpectedNumberOfTextRegions()
+        {
+            var xml = GetXml();
+            var count = Regex.Matches(xml, "").Count; // NOTE(review): this empty pattern (and the empty literals in the other asserts) looks stripped in this copy of the patch - confirm and restore before applying
+
+            Assert.Equal(22, count);
+        }
+
+        [Fact]
+        public void ContainsExpectedText()
+        {
+            var xml = GetXml();
+            Assert.Contains(@"2006 Swedish Touring Car Championship", xml);
+            // the coords for that text
+            Assert.Contains(@"", xml);
+        }
+
+        [Fact]
+        public void NoPointsAreOnThePageBoundary()
+        {
+            var pageWidth = 100;
+            var pageHeight = 200;
+
+            var bottomLeftPagePoint = new PdfPoint(0, 0);
+            var topLeftPagePoint = new PdfPoint(0, pageHeight);
+            var topRightPagePoint = new PdfPoint(pageWidth, pageHeight);
+            var normalPoint = new PdfPoint(60, 60);
+
+            Assert.Equal("1,199", PageXmlTextExporter.PointToString(bottomLeftPagePoint, pageWidth, pageHeight));
+            Assert.Equal("1,1", PageXmlTextExporter.PointToString(topLeftPagePoint, pageWidth, pageHeight));
+            Assert.Equal("99,1", PageXmlTextExporter.PointToString(topRightPagePoint, pageWidth, pageHeight));
+            Assert.Equal("60,140", PageXmlTextExporter.PointToString(normalPoint, pageWidth, pageHeight));
+            // the far bounds scale together with the coordinates
+            Assert.Equal("199,1", PageXmlTextExporter.PointToString(topRightPagePoint, pageWidth, pageHeight, 2.0));
+        }
+    }
+}