From 00b9d416df5b379d1a24ec0bbad6107f8ec16bce Mon Sep 17 00:00:00 2001 From: Adam Busbin Date: Thu, 23 Apr 2020 14:58:48 -0700 Subject: [PATCH 1/4] added check for bad fonts see https://github.com/apache/pdfbox/blob/61ceca8376f08f23f62178bd4fd97e919a690e43/fontbox/src/main/java/org/apache/fontbox/ttf/HorizontalMetricsTable.java line 67 for matching code. --- .../TrueType/Parser/HorizontalMetricsTableParser.cs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/UglyToad.PdfPig.Fonts/TrueType/Parser/HorizontalMetricsTableParser.cs b/src/UglyToad.PdfPig.Fonts/TrueType/Parser/HorizontalMetricsTableParser.cs index 98ca5926..cb1f8ba7 100644 --- a/src/UglyToad.PdfPig.Fonts/TrueType/Parser/HorizontalMetricsTableParser.cs +++ b/src/UglyToad.PdfPig.Fonts/TrueType/Parser/HorizontalMetricsTableParser.cs @@ -26,10 +26,18 @@ bytesRead += 4; } - + + int numberNonHorizontal = glyphCount - metricCount; + + // handle bad fonts with too many hmetrics + if (numberNonHorizontal < 0) + { + numberNonHorizontal = glyphCount; + } + // The number of entries in the left side bearing field per entry is number of glyphs - number of metrics // For bearings over the metric count, the width is the same as the last width in advanced widths. - var additionalLeftSideBearings = new short[glyphCount - metricCount]; + var additionalLeftSideBearings = new short[numberNonHorizontal]; for (var i = 0; i < additionalLeftSideBearings.Length; i++) { From d4210cd5d16a1dfccea36866e409a39f21223cd6 Mon Sep 17 00:00:00 2001 From: BobLd Date: Thu, 23 Apr 2020 11:32:05 +0100 Subject: [PATCH 2/4] Make clustering algos public and use shorter names --- .../{ClusteringAlgorithms.cs => Clustering.cs} | 12 ++++++------ .../PageSegmenter/DocstrumBoundingBoxes.cs | 4 ++-- .../WordExtractor/NearestNeighbourWordExtractor.cs | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) rename src/UglyToad.PdfPig.DocumentLayoutAnalysis/{ClusteringAlgorithms.cs => Clustering.cs} (96%) diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs similarity index 96% rename from src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs rename to src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs index 36b62120..74235f97 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ClusteringAlgorithms.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs @@ -9,7 +9,7 @@ /// /// Clustering Algorithms. /// - internal class ClusteringAlgorithms + public static class Clustering { /// /// Algorithm to group elements using nearest neighbours. @@ -26,7 +26,7 @@ /// Sets the maximum number of concurrent tasks enabled. /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. - internal static IEnumerable> ClusterNearestNeighbours(IReadOnlyList elements, + internal static IEnumerable> NearestNeighbours(IReadOnlyList elements, Func distMeasure, Func maxDistanceFunction, Func pivotPoint, Func candidatesPoint, @@ -94,7 +94,7 @@ /// Sets the maximum number of concurrent tasks enabled. /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. - internal static IEnumerable> ClusterNearestNeighbours(IReadOnlyList elements, int k, + internal static IEnumerable> NearestNeighbours(IReadOnlyList elements, int k, Func distMeasure, Func maxDistanceFunction, Func pivotPoint, Func candidatesPoint, @@ -161,7 +161,7 @@ /// Sets the maximum number of concurrent tasks enabled. /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. - internal static IEnumerable> ClusterNearestNeighbours(T[] elements, + internal static IEnumerable> NearestNeighbours(IReadOnlyList elements, Func distMeasure, Func maxDistanceFunction, Func pivotLine, Func candidatesLine, @@ -185,12 +185,12 @@ * (i,j,k) will form a group and (m,n) will form another group. *************************************************************************************/ - int[] indexes = Enumerable.Repeat(-1, elements.Length).ToArray(); + int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray(); ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism }; // 1. Find nearest neighbours indexes - Parallel.For(0, elements.Length, parallelOptions, e => + Parallel.For(0, elements.Count, parallelOptions, e => { var pivot = elements[e]; diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs index 9a624a45..2488014b 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs @@ -197,7 +197,7 @@ private static IEnumerable GetLines(List words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism) { TextDirection textDirection = words[0].TextDirection; - var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, 2, Distances.Euclidean, + var groupedIndexes = Clustering.NearestNeighbours(words, 2, Distances.Euclidean, (pivot, candidate) => maxDist, pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft, pivot => true, @@ -246,7 +246,7 @@ new PdfPoint(left + d / 2, l2.Point1.Y)); } - var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(lines, + var groupedIndexes = Clustering.NearestNeighbours(lines, euclidianOverlappingMiddleDistance, (pivot, candidate) => maxDist, pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight), diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs index 05989bfe..9e2a4e93 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/WordExtractor/NearestNeighbourWordExtractor.cs @@ -106,7 +106,7 @@ throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction."); } - var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(pageLetters, + var groupedIndexes = Clustering.NearestNeighbours(pageLetters, distMeasure, maxDistanceFunction, l => l.EndBaseLine, l => l.StartBaseLine, l => !string.IsNullOrWhiteSpace(l.Value), From c2de52423e181ab807888fa06e46d947b6f940b0 Mon Sep 17 00:00:00 2001 From: BobLd Date: Thu, 23 Apr 2020 11:34:01 +0100 Subject: [PATCH 3/4] Make NearestNeighbours public --- src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs index 74235f97..7048b078 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Clustering.cs @@ -26,7 +26,7 @@ /// Sets the maximum number of concurrent tasks enabled. /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. - internal static IEnumerable> NearestNeighbours(IReadOnlyList elements, + public static IEnumerable> NearestNeighbours(IReadOnlyList elements, Func distMeasure, Func maxDistanceFunction, Func pivotPoint, Func candidatesPoint, @@ -94,7 +94,7 @@ /// Sets the maximum number of concurrent tasks enabled. /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. - internal static IEnumerable> NearestNeighbours(IReadOnlyList elements, int k, + public static IEnumerable> NearestNeighbours(IReadOnlyList elements, int k, Func distMeasure, Func maxDistanceFunction, Func pivotPoint, Func candidatesPoint, @@ -161,7 +161,7 @@ /// Sets the maximum number of concurrent tasks enabled. /// A positive property value limits the number of concurrent operations to the set value. /// If it is -1, there is no limit on the number of concurrently running operations. - internal static IEnumerable> NearestNeighbours(IReadOnlyList elements, + public static IEnumerable> NearestNeighbours(IReadOnlyList elements, Func distMeasure, Func maxDistanceFunction, Func pivotLine, Func candidatesLine, From 635c4b4c5e0be3a433ce3c9b53505a24cd9d3398 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Sat, 25 Apr 2020 09:11:16 +0100 Subject: [PATCH 4/4] formatting tidy-up --- src/UglyToad.PdfPig.sln.DotSettings | 1 + src/UglyToad.PdfPig/Content/Page.cs | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/UglyToad.PdfPig.sln.DotSettings b/src/UglyToad.PdfPig.sln.DotSettings index 8306290d..c7263dcd 100644 --- a/src/UglyToad.PdfPig.sln.DotSettings +++ b/src/UglyToad.PdfPig.sln.DotSettings @@ -1,4 +1,5 @@  + True BE CIE CMYK diff --git a/src/UglyToad.PdfPig/Content/Page.cs b/src/UglyToad.PdfPig/Content/Page.cs index 0aaf17c5..433f4ebc 100644 --- a/src/UglyToad.PdfPig/Content/Page.cs +++ b/src/UglyToad.PdfPig/Content/Page.cs @@ -4,13 +4,12 @@ using System.Collections.Generic; using System.Text; using Annotations; - using Core; using Graphics.Operations; using Tokens; using Util; using Util.JetBrains.Annotations; using Tokenization.Scanner; - using UglyToad.PdfPig.Graphics; + using Graphics; /// /// Contains the content and provides access to methods of a single page in the .