From 1021729727ac2c61740781a335bb7a2977208264 Mon Sep 17 00:00:00 2001 From: EliotJones Date: Tue, 15 Jul 2025 20:49:43 -0500 Subject: [PATCH] fall back to times-roman as standard 14 font when lenient. if parsing in lenient mode and encountering a malformed base name (in this case 'helveticai'), we fall back to times-roman as the adobe font metrics file for a standard 14 font. this aligns with the behavior of pdfbox. we also throw a more informative error in non-lenient mode. this fixes document 0000086.pdf from the corpus --- .../Parser/PdfDocumentFactory.cs | 6 +++- .../Parser/Handlers/Type1FontHandler.cs | 31 +++++++++++++++---- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs index 1ce4a84d..8a09f81a 100644 --- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs @@ -168,7 +168,11 @@ pdfScanner, parsingOptions); - var type1Handler = new Type1FontHandler(pdfScanner, filterProvider, encodingReader); + var type1Handler = new Type1FontHandler( + pdfScanner, + filterProvider, + encodingReader, + parsingOptions.UseLenientParsing); var trueTypeHandler = new TrueTypeFontHandler(parsingOptions.Logger, pdfScanner, diff --git a/src/UglyToad.PdfPig/PdfFonts/Parser/Handlers/Type1FontHandler.cs b/src/UglyToad.PdfPig/PdfFonts/Parser/Handlers/Type1FontHandler.cs index 7bac246b..5cf1b7aa 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Parser/Handlers/Type1FontHandler.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Parser/Handlers/Type1FontHandler.cs @@ -20,13 +20,18 @@ private readonly IPdfTokenScanner pdfScanner; private readonly ILookupFilterProvider filterProvider; private readonly IEncodingReader encodingReader; + private readonly bool isLenientParsing; - public Type1FontHandler(IPdfTokenScanner pdfScanner, ILookupFilterProvider filterProvider, - IEncodingReader encodingReader) + public Type1FontHandler( + IPdfTokenScanner 
pdfScanner, + ILookupFilterProvider filterProvider, + IEncodingReader encodingReader, + bool isLenientParsing) { this.pdfScanner = pdfScanner; this.filterProvider = filterProvider; this.encodingReader = encodingReader; + this.isLenientParsing = isLenientParsing; } public IFont Generate(DictionaryToken dictionary) @@ -69,12 +74,26 @@ widths = []; } - if (!dictionary.TryGet(NameToken.FontDescriptor, out var _)) + if (!dictionary.TryGet(NameToken.FontDescriptor, out _)) { - if (dictionary.TryGet(NameToken.BaseFont, out var baseFontToken) && - DirectObjectFinder.TryGet(baseFontToken, pdfScanner, out NameToken? baseFontName)) + if (dictionary.TryGet(NameToken.BaseFont, pdfScanner, out NameToken? baseFontToken)) { - var metrics = Standard14.GetAdobeFontMetrics(baseFontName.Data); + var metrics = Standard14.GetAdobeFontMetrics(baseFontToken.Data); + + if (metrics == null) + { + if (isLenientParsing) + { + // We can support a fallback here to return content. + // https://github.com/apache/pdfbox/blob/f81c7c5a06126db68aa985a0e755cdbffed7d270/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/FontMapperImpl.java#L304 + metrics = Standard14.GetAdobeFontMetrics(Standard14Font.TimesRoman); + } + else + { + throw new PdfDocumentFormatException( + $"Type 1 Standard 14 font with name {baseFontToken} requested, this is an invalid name."); + } + } var overrideEncoding = encodingReader.Read(dictionary);