diff --git a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs index 74b6b8da..14cd5f5a 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs @@ -7,6 +7,29 @@ public class GithubIssuesTests { + // Regression test for issue #1148: checks the bottom-left and bottom-right corners of the bounding box of a word on page 2 of the test document. + [Fact] + public void Issue1148() + { + var path = IntegrationHelpers.GetSpecificTestDocumentPath("P2P-33713919.pdf"); + + using (var document = PdfDocument.Open(path, new ParsingOptions() { UseLenientParsing = true })) + { + var page = document.GetPage(2); + + var letters = page.Letters; + + var words = NearestNeighbourWordExtractor.Instance.GetWords(letters).ToArray(); + + var firstTableLine = words[42]; + + Assert.EndsWith("C<--,:", firstTableLine.Text); // Sanity check that words[42] is the line we expect. The extracted text is not actually correct, so this expected value might change. + + Assert.Equal(new PdfPoint(x: 31.890118, y: 693.035685), firstTableLine.BoundingBox.BottomLeft); + Assert.Equal(new PdfPoint(x: 563.3851179999991, y: 693.035685), firstTableLine.BoundingBox.BottomRight); + } + } + [Fact] public void Issue1122() { diff --git a/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/P2P-33713919.pdf b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/P2P-33713919.pdf new file mode 100644 index 00000000..4209085d Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/P2P-33713919.pdf differ diff --git a/src/UglyToad.PdfPig/PdfFonts/CidFonts/Type2CidFont.cs b/src/UglyToad.PdfPig/PdfFonts/CidFonts/Type2CidFont.cs index 596e8c73..65487bfb 100644 --- a/src/UglyToad.PdfPig/PdfFonts/CidFonts/Type2CidFont.cs +++ b/src/UglyToad.PdfPig/PdfFonts/CidFonts/Type2CidFont.cs @@ -61,9 +61,12 @@ this.defaultWidth = defaultWidth; this.cidToGid = cidToGid; - // TODO: This should maybe take units per em into account? var scale = 1 / (double)(fontProgram?.GetFontMatrixMultiplier() ?? 
1000); FontMatrix = TransformationMatrix.FromValues(scale, 0, 0, scale, 0, 0); + + // NB: For the font matrix, PdfBox always returns 1/1000 with the comment '1000 upem, this is not strictly true' + // see https://github.com/apache/pdfbox/blob/a5379f5588ee4c98222ee61366ad3d82e0f2264e/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/PDCIDFontType2.java#L191 + // Always using 1/1000 breaks the 'ReadWordsFromOldGutnishPage1' test } public double GetWidthFromFont(int characterIdentifier) diff --git a/src/UglyToad.PdfPig/PdfFonts/Composite/Type0Font.cs b/src/UglyToad.PdfPig/PdfFonts/Composite/Type0Font.cs index a141a9db..78d2e6af 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Composite/Type0Font.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Composite/Type0Font.cs @@ -126,13 +126,23 @@ // Get the bounding box in glyph space var boundingBox = CidFont.GetBoundingBox(characterIdentifier); - boundingBox = CidFont.GetFontMatrix(characterIdentifier).Transform(boundingBox); + var fontMatrix = CidFont.GetFontMatrix(characterIdentifier); + boundingBox = fontMatrix.Transform(boundingBox); var width = CidFont.GetWidthFromFont(characterIdentifier); - var advanceWidth = GetFontMatrix().TransformX(width); - // BobLD: Not sure why we don't need CidFont.GetFontMatrix(characterCode) - // Might be related to https://github.com/veraPDF/veraPDF-library/issues/1010 + double scale = fontMatrix.A; + if (Math.Abs(scale - 0.001) < 0.0001) + { + // BobLD: The value of scale is close enough to 0.001 to be able to use 0.001. 
+ // Still not sure what the correct logic is, but this hack fixes issue #1148 (while not breaking "Old Gutnish Internet Explorer.pdf") + // + // Based on https://martin.hoppenheit.info/blog/2018/pdfa-validation-and-inconsistent-glyph-width-information/ + // which quotes: section 6.2.11.5 of ISO 19005-2:2011 (PDF/A-2) clarifies this issue: “For ISO 19005, consistent is defined to be a difference of no more than 1/1000 unit.” + scale = 0.001; + } + + var advanceWidth = scale * width; var result = new CharacterBoundingBox(boundingBox, advanceWidth);