From 0dad611cb1dbd23bf4aa5f3c466b38609f7acb80 Mon Sep 17 00:00:00 2001
From: BobLd
Date: Wed, 22 Jan 2020 20:42:27 +0000
Subject: [PATCH] Implement minimum bounding box algorithm

---
 src/UglyToad.PdfPig.Core/PdfRectangle.cs      |   6 +-
 .../Distances.cs                              |   7 +-
 .../TextLine.cs                               |   8 +-
 src/UglyToad.PdfPig/Content/Word.cs           | 122 +++--------
 .../Geometry/GeometryExtensions.cs            | 201 +++++++++++++++++-
 5 files changed, 232 insertions(+), 112 deletions(-)

diff --git a/src/UglyToad.PdfPig.Core/PdfRectangle.cs b/src/UglyToad.PdfPig.Core/PdfRectangle.cs
index 169acf58..5d3f1cd6 100644
--- a/src/UglyToad.PdfPig.Core/PdfRectangle.cs
+++ b/src/UglyToad.PdfPig.Core/PdfRectangle.cs
@@ -1,7 +1,7 @@
-using System;
-
-namespace UglyToad.PdfPig.Core
+namespace UglyToad.PdfPig.Core
 {
+    using System;
+
     ///
     /// A rectangle in a PDF file.
     ///
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs
index 2ee54379..7f06f7e5 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Distances.cs
@@ -52,10 +52,9 @@
         ///
         /// The first point.
         /// The second point.
-        ///
         public static double Angle(PdfPoint point1, PdfPoint point2)
         {
-            return Math.Atan2(point2.Y - point1.Y, point2.X - point1.X) * 180.0 / Math.PI;
+            return Math.Atan2(point2.Y - point1.Y, point2.X - point1.X) * (180.0 / Math.PI);
         }
 
         ///
@@ -63,7 +62,6 @@
         ///
         /// The first point.
         /// The second point.
-        ///
         public static double Vertical(PdfPoint point1, PdfPoint point2)
         {
             return Math.Abs(point2.Y - point1.Y);
@@ -74,7 +72,6 @@
         ///
         /// The first point.
         /// The second point.
-        ///
         public static double Horizontal(PdfPoint point1, PdfPoint point2)
         {
             return Math.Abs(point2.X - point1.X);
@@ -85,7 +82,6 @@
         ///
         /// The first string.
         /// The second string.
-        ///
         public static int MinimumEditDistance(string string1, string string2)
         {
             ushort[,] d = new ushort[string1.Length + 1, string2.Length + 1];
@@ -134,7 +130,6 @@
         ///
         /// The distance measure to use.
         /// The distance between reference point, and its nearest neighbour.
-        ///
         internal static int FindIndexNearest(this T element, IReadOnlyList candidates,
             Func candidatesPoint, Func pivotPoint,
             Func distanceMeasure, out double distance)
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs
index 96e5c88e..198ef724 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextLine.cs
@@ -71,10 +71,18 @@
 
         private PdfRectangle NormaliseRectangle(PdfRectangle rectangle)
         {
-            return new PdfRectangle(Math.Min(rectangle.Left, rectangle.Right),
-                Math.Min(rectangle.Bottom, rectangle.Top),
-                Math.Max(rectangle.Left, rectangle.Right),
-                Math.Max(rectangle.Bottom, rectangle.Top));
+            // Axis-aligned bounds of the four corners, whatever the orientation of the rectangle.
+            var topLeft = rectangle.TopLeft;
+            var topRight = rectangle.TopRight;
+            var bottomLeft = rectangle.BottomLeft;
+            var bottomRight = rectangle.BottomRight;
+
+            var minX = Math.Min(Math.Min(Math.Min(topLeft.X, topRight.X), bottomLeft.X), bottomRight.X);
+            var minY = Math.Min(Math.Min(Math.Min(topLeft.Y, topRight.Y), bottomLeft.Y), bottomRight.Y);
+            var maxX = Math.Max(Math.Max(Math.Max(topLeft.X, topRight.X), bottomLeft.X), bottomRight.X);
+            var maxY = Math.Max(Math.Max(Math.Max(topLeft.Y, topRight.Y), bottomLeft.Y), bottomRight.Y);
+
+            return new PdfRectangle(minX, minY, maxX, maxY);
         }
 
         ///
diff --git a/src/UglyToad.PdfPig/Content/Word.cs b/src/UglyToad.PdfPig/Content/Word.cs
index a8832b62..e6d26255 100644
--- a/src/UglyToad.PdfPig/Content/Word.cs
+++ b/src/UglyToad.PdfPig/Content/Word.cs
@@ -3,7 +3,9 @@
     using Core;
     using System;
     using System.Collections.Generic;
+    using System.Linq;
     using System.Text;
+    using UglyToad.PdfPig.Geometry;
 
     ///
     /// A word.
@@ -266,99 +268,19 @@ { var builder = new StringBuilder(); - var minX = double.MaxValue; - var maxX = double.MinValue; - var minY = double.MaxValue; - var maxY = double.MinValue; + var points = letters.SelectMany(r => new[] + { + r.StartBaseLine, + r.EndBaseLine, + r.GlyphRectangle.TopLeft, + r.GlyphRectangle.TopRight + }).Distinct(); + var convexHull = GeometryExtensions.GrahamScan(points).ToList(); + var minimalBoundingRectangle = GeometryExtensions.ParametricPerpendicularProjection(convexHull); for (var i = 0; i < letters.Count; i++) { - var letter = letters[i]; - builder.Append(letter.Value); - - // maxX - if (letter.GlyphRectangle.BottomLeft.X > maxX) - { - maxX = letter.GlyphRectangle.BottomLeft.X; - } - - if (letter.GlyphRectangle.BottomRight.X > maxX) - { - maxX = letter.GlyphRectangle.BottomRight.X; - } - - if (letter.GlyphRectangle.TopLeft.X > maxX) - { - maxX = letter.GlyphRectangle.TopLeft.X; - } - - if (letter.GlyphRectangle.TopRight.X > maxX) - { - maxX = letter.GlyphRectangle.TopRight.X; - } - - // minX - if (letter.GlyphRectangle.BottomLeft.X < minX) - { - minX = letter.GlyphRectangle.BottomLeft.X; - } - - if (letter.GlyphRectangle.BottomRight.X < minX) - { - minX = letter.GlyphRectangle.BottomRight.X; - } - - if (letter.GlyphRectangle.TopLeft.X < minX) - { - minX = letter.GlyphRectangle.TopLeft.X; - } - - if (letter.GlyphRectangle.TopRight.X < minX) - { - minX = letter.GlyphRectangle.TopRight.X; - } - - // maxY - if (letter.GlyphRectangle.BottomLeft.Y > maxY) - { - maxY = letter.GlyphRectangle.BottomLeft.Y; - } - - if (letter.GlyphRectangle.BottomRight.Y > maxY) - { - maxY = letter.GlyphRectangle.BottomRight.Y; - } - - if (letter.GlyphRectangle.TopLeft.Y > maxY) - { - maxY = letter.GlyphRectangle.TopLeft.Y; - } - - if (letter.GlyphRectangle.TopRight.Y > maxY) - { - maxY = letter.GlyphRectangle.TopRight.Y; - } - - // minY - if (letter.GlyphRectangle.BottomLeft.Y < minY) - { - minY = letter.GlyphRectangle.BottomLeft.Y; - } - - if 
(letter.GlyphRectangle.BottomRight.Y < minY) - { - minY = letter.GlyphRectangle.BottomRight.Y; - } - - if (letter.GlyphRectangle.TopLeft.Y < minY) - { - minY = letter.GlyphRectangle.TopLeft.Y; - } - - if (letter.GlyphRectangle.TopRight.Y < minY) - { - minY = letter.GlyphRectangle.TopRight.Y; - } + builder.Append(letters[i].Value); } var firstLetter = letters[0]; @@ -370,26 +292,30 @@ if (rotation >= -0.785398 && rotation < 0.785398) { // top border on top - return new Tuple(builder.ToString(), new PdfRectangle(minX, minY, maxX, maxY)); + //return new Tuple(builder.ToString(), new PdfRectangle(minX, minY, maxX, maxY)); + return new Tuple(builder.ToString(), minimalBoundingRectangle); } else if (rotation >= 0.785398 && rotation < 2.356194) { // top border on the left - return new Tuple(builder.ToString(), new PdfRectangle( - new PdfPoint(minX, minY), new PdfPoint(minX, maxY), - new PdfPoint(maxX, minY), new PdfPoint(maxX, maxY))); + //return new Tuple(builder.ToString(), new PdfRectangle( + // new PdfPoint(minX, minY), new PdfPoint(minX, maxY), + // new PdfPoint(maxX, minY), new PdfPoint(maxX, maxY))); + return new Tuple(builder.ToString(), minimalBoundingRectangle); } else if (rotation >= 2.356194 && rotation < 3.926991) { // top border on the bottom - return new Tuple(builder.ToString(), new PdfRectangle(minX, maxY, maxX, minY)); + //return new Tuple(builder.ToString(), new PdfRectangle(minX, maxY, maxX, minY)); + return new Tuple(builder.ToString(), minimalBoundingRectangle); } else { // top border on the right - return new Tuple(builder.ToString(), new PdfRectangle( - new PdfPoint(maxX, maxY), new PdfPoint(maxX, minY), - new PdfPoint(minX, maxY), new PdfPoint(minX, minY))); + //return new Tuple(builder.ToString(), new PdfRectangle( + // new PdfPoint(maxX, maxY), new PdfPoint(maxX, minY), + // new PdfPoint(minX, maxY), new PdfPoint(minX, minY))); + return new Tuple(builder.ToString(), minimalBoundingRectangle); } } #endregion diff --git 
a/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs b/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs
index f1b5966d..c5e9bb7d 100644
--- a/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs
+++ b/src/UglyToad.PdfPig/Geometry/GeometryExtensions.cs
@@ -1,16 +1,229 @@
 namespace UglyToad.PdfPig.Geometry
 {
+    using Core;
     using System;
     using System.Collections.Generic;
     using System.Linq;
     using System.Text;
-    using Core;
 
     ///
     /// Extension class to Geometry.
     ///
     public static class GeometryExtensions
     {
+        #region PdfPoint
+        /// <summary>
+        /// Get the dot product of both points.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        public static double DotProduct(this PdfPoint point1, PdfPoint point2)
+        {
+            return point1.X * point2.X + point1.Y * point2.Y;
+        }
+
+        /// <summary>
+        /// Get a point with the summed coordinates of both points.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        public static PdfPoint Add(this PdfPoint point1, PdfPoint point2)
+        {
+            return new PdfPoint(point1.X + point2.X, point1.Y + point2.Y);
+        }
+
+        /// <summary>
+        /// Get a point with the subtracted coordinates of both points.
+        /// </summary>
+        /// <param name="point1">The first point.</param>
+        /// <param name="point2">The second point.</param>
+        public static PdfPoint Subtract(this PdfPoint point1, PdfPoint point2)
+        {
+            return new PdfPoint(point1.X - point2.X, point1.Y - point2.Y);
+        }
+
+        /// <summary>
+        /// Algorithm to find a minimal bounding rectangle (MBR) such that the MBR corresponds to a rectangle
+        /// with smallest possible area completely enclosing the polygon.
+        /// <para>From 'A Fast Algorithm for Generating a Minimal Bounding Rectangle' by Lennert D. Den Boer.</para>
+        /// </summary>
+        /// <param name="polygon">The vertices of a simple, convex polygon, e.g. the result of GrahamScan.</param>
+        internal static PdfRectangle ParametricPerpendicularProjection(IReadOnlyList<PdfPoint> polygon)
+        {
+            if (polygon == null)
+            {
+                throw new ArgumentNullException("polygon");
+            }
+
+            // The algorithm needs the vertices in strict cyclic order without coincident vertices.
+            // Start from the leftmost (then lowest) vertex and sort the others around it.
+            var distinct = polygon.Distinct().OrderBy(p => p.X).ThenBy(p => p.Y).ToList();
+            if (distinct.Count == 0)
+            {
+                throw new ArgumentException("The polygon must contain at least one point.", "polygon");
+            }
+
+            var origin = distinct[0];
+            if (distinct.Count == 1)
+            {
+                // A single point: the bounding rectangle collapses onto it.
+                return new PdfRectangle(origin, origin, origin, origin);
+            }
+
+            // The origin is kept out of the angular sort: it compares as equal to every other
+            // point, which would make the comparer inconsistent.
+            var vertices = new List<PdfPoint>(distinct.Count) { origin };
+            vertices.AddRange(distinct.Skip(1).OrderBy(p => p, new PdfPointComparer(origin)));
+
+            int count = vertices.Count;
+            double minArea = double.MaxValue;
+            PdfPoint[] mbr = null;
+
+            for (int k = 0; k < count; k++)
+            {
+                // Project every vertex on the edge going from vertex k to the next vertex.
+                PdfPoint pk = vertices[k];
+                PdfPoint v = vertices[(k + 1) % count].Subtract(pk);
+                double r = 1.0 / v.DotProduct(v);
+
+                // The extrema belong to the current edge only and are reset for each edge.
+                double tMin = double.PositiveInfinity;
+                double tMax = double.NegativeInfinity;
+                double sMax = 0;
+                int farthest = -1;
+                PdfPoint q = pk;
+                PdfPoint r0 = pk;
+                PdfPoint r1 = pk;
+
+                for (int j = 0; j < count; j++)
+                {
+                    PdfPoint pj = vertices[j];
+                    double t = pj.Subtract(pk).DotProduct(v) * r;
+                    PdfPoint pt = new PdfPoint(t * v.X + pk.X, t * v.Y + pk.Y);
+                    PdfPoint toEdge = pt.Subtract(pj);
+                    double s = toEdge.DotProduct(toEdge);
+
+                    if (t < tMin)
+                    {
+                        tMin = t;
+                        r0 = pt;
+                    }
+
+                    if (t > tMax)
+                    {
+                        tMax = t;
+                        r1 = pt;
+                    }
+
+                    if (s > sMax)
+                    {
+                        sMax = s;
+                        q = pt;
+                        farthest = j;
+                    }
+                }
+
+                // When every vertex lies on the edge (collinear input) the rectangle has no height.
+                PdfPoint r2 = r1;
+                PdfPoint r3 = r0;
+                if (farthest >= 0)
+                {
+                    PdfPoint height = vertices[farthest].Subtract(q);
+                    r2 = r1.Add(height);
+                    r3 = r0.Add(height);
+                }
+
+                // Squared area (squared base length times squared height): enough to compare rectangles.
+                PdfPoint baseEdge = r1.Subtract(r0);
+                double area = baseEdge.DotProduct(baseEdge) * sMax;
+
+                if (mbr == null || area < minArea)
+                {
+                    minArea = area;
+                    mbr = new[] { r0, r1, r2, r3 };
+                }
+            }
+
+            return new PdfRectangle(mbr[2], mbr[3], mbr[1], mbr[0]);
+        }
+
+        /// <summary>
+        /// Orders points by the sign of the cross product of their vectors from a reference point.
+        /// </summary>
+        private sealed class PdfPointComparer : IComparer<PdfPoint>
+        {
+            private readonly PdfPoint p0;
+
+            public PdfPointComparer(PdfPoint referencePoint)
+            {
+                p0 = referencePoint;
+            }
+
+            public int Compare(PdfPoint a, PdfPoint b)
+            {
+                // Rounded so that nearly collinear points compare as equal.
+                var det = Math.Round((a.X - p0.X) * (b.Y - p0.Y) - (b.X - p0.X) * (a.Y - p0.Y), 6);
+                return Math.Sign(det);
+            }
+        }
+
+        /// <summary>
+        /// Algorithm to find the convex hull of the set of points with time complexity O(n log n).
+        /// </summary>
+        /// <param name="points">The points to enclose.</param>
+        internal static IEnumerable<PdfPoint> GrahamScan(IEnumerable<PdfPoint> points)
+        {
+            if (points == null)
+            {
+                throw new ArgumentNullException("points");
+            }
+
+            // Enumerate the (possibly deferred) input only once.
+            var candidates = points.Distinct().OrderBy(p => p.Y).ThenBy(p => p.X).ToList();
+            if (candidates.Count < 3)
+            {
+                return candidates;
+            }
+
+            Func<PdfPoint, PdfPoint, PdfPoint, double> ccw = (p1, p2, p3) =>
+                Math.Round((p2.X - p1.X) * (p3.Y - p1.Y) - (p2.Y - p1.Y) * (p3.X - p1.X), 6);
+
+            // The pivot is the lowest (then leftmost) point, so every polar angle lies in [0, pi).
+            var pivot = candidates[0];
+
+            // Sort by polar angle around the pivot. If more than one point has the same angle,
+            // remove all but the one that is farthest from the pivot.
+            var sortedPoints = candidates.Skip(1)
+                .GroupBy(p => Math.Atan2(p.Y - pivot.Y, p.X - pivot.X))
+                .OrderBy(g => g.Key)
+                .Select(g => g.OrderByDescending(p =>
+                {
+                    double dx = p.X - pivot.X;
+                    double dy = p.Y - pivot.Y;
+                    return dx * dx + dy * dy;
+                }).First())
+                .ToList();
+
+            // sortedPoints cannot be empty here, but holds a single point when the input is collinear.
+            var hull = new List<PdfPoint> { pivot, sortedPoints[0] };
+
+            for (int i = 1; i < sortedPoints.Count; i++)
+            {
+                var point = sortedPoints[i];
+                while (hull.Count > 1 && ccw(hull[hull.Count - 2], hull[hull.Count - 1], point) < 0)
+                {
+                    hull.RemoveAt(hull.Count - 1);
+                }
+
+                hull.Add(point);
+            }
+
+            // Most recently added vertex first.
+            hull.Reverse();
+            return hull;
+        }
+        #endregion
+
         #region PdfRectangle
         ///
         /// Whether the rectangle contains the point.