fall back to times-roman as standard 14 font when lenient

When parsing in lenient mode, if we encounter a malformed base font name
(in this case 'helveticai') we fall back to Times-Roman as the Adobe font
metrics file for the standard 14 font. This aligns with the behavior of PDFBox.
In non-lenient mode we now throw a more informative error instead.

this fixes document 0000086.pdf from the corpus
This commit is contained in:
EliotJones 2025-07-15 20:49:43 -05:00 committed by BobLd
parent 9503f9c137
commit 1021729727
2 changed files with 30 additions and 7 deletions

View File

@ -168,7 +168,11 @@
pdfScanner,
parsingOptions);
var type1Handler = new Type1FontHandler(pdfScanner, filterProvider, encodingReader);
var type1Handler = new Type1FontHandler(
pdfScanner,
filterProvider,
encodingReader,
parsingOptions.UseLenientParsing);
var trueTypeHandler = new TrueTypeFontHandler(parsingOptions.Logger,
pdfScanner,

View File

@ -20,13 +20,18 @@
private readonly IPdfTokenScanner pdfScanner;
private readonly ILookupFilterProvider filterProvider;
private readonly IEncodingReader encodingReader;
private readonly bool isLenientParsing;
public Type1FontHandler(IPdfTokenScanner pdfScanner, ILookupFilterProvider filterProvider,
IEncodingReader encodingReader)
/// <summary>
/// Creates a Type 1 font handler, storing the supplied collaborators for later use
/// when a font is generated from its dictionary.
/// </summary>
/// <param name="pdfScanner">Token scanner used to resolve entries of the font dictionary (for example the BaseFont name).</param>
/// <param name="filterProvider">Filter provider retained by the handler; its use is not visible in this excerpt.</param>
/// <param name="encodingReader">Reader used to obtain the encoding from the font dictionary.</param>
/// <param name="isLenientParsing">
/// When <c>true</c>, a font without a descriptor whose base font name has no Standard 14 metrics
/// falls back to the Times-Roman metrics; when <c>false</c>, that case throws a
/// <c>PdfDocumentFormatException</c>.
/// </param>
public Type1FontHandler(
IPdfTokenScanner pdfScanner,
ILookupFilterProvider filterProvider,
IEncodingReader encodingReader,
bool isLenientParsing)
{
this.pdfScanner = pdfScanner;
this.filterProvider = filterProvider;
this.encodingReader = encodingReader;
// Captured once at construction; consulted when Standard 14 metrics lookup fails.
this.isLenientParsing = isLenientParsing;
}
public IFont Generate(DictionaryToken dictionary)
@ -69,12 +74,26 @@
widths = [];
}
if (!dictionary.TryGet(NameToken.FontDescriptor, out var _))
if (!dictionary.TryGet(NameToken.FontDescriptor, out _))
{
if (dictionary.TryGet(NameToken.BaseFont, out var baseFontToken) &&
DirectObjectFinder.TryGet(baseFontToken, pdfScanner, out NameToken? baseFontName))
if (dictionary.TryGet(NameToken.BaseFont, pdfScanner, out NameToken? baseFontToken))
{
var metrics = Standard14.GetAdobeFontMetrics(baseFontName.Data);
var metrics = Standard14.GetAdobeFontMetrics(baseFontToken.Data);
if (metrics == null)
{
if (isLenientParsing)
{
// We can support a fallback here to return content.
// https://github.com/apache/pdfbox/blob/f81c7c5a06126db68aa985a0e755cdbffed7d270/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/font/FontMapperImpl.java#L304
metrics = Standard14.GetAdobeFontMetrics(Standard14Font.TimesRoman);
}
else
{
throw new PdfDocumentFormatException(
$"Type 1 Standard 14 font with name {baseFontToken} requested, this is an invalid name.");
}
}
var overrideEncoding = encodingReader.Read(dictionary);