Handle additional broken PDF files in the Common Crawl set

- a file contained two indices pointing to '.notdef' for the character name, so
we just take the first match rather than requiring a single one
- a file contained '/' (empty name) as the subtype declaration, so we fall back
to trying Type 1 and then TrueType parsing in this situation
This commit is contained in:
EliotJones 2025-07-26 12:32:41 -05:00
parent 50f878b2ba
commit 21f1cd5354
2 changed files with 20 additions and 1 deletions

View File

@ -32,7 +32,7 @@
/// <summary>
/// Looks up the name paired with the given string id in <c>GlyphIdToStringIdAndName</c>.
/// </summary>
/// <param name="stringId">The string id compared against each entry's <c>stringId</c>.</param>
/// <returns>The <c>name</c> member of the matching entry's value.</returns>
public virtual string GetNameByStringId(int stringId)
{
// SingleOrDefault throws InvalidOperationException when more than one entry satisfies the predicate.
return GlyphIdToStringIdAndName.SingleOrDefault(x => x.Value.stringId == stringId).Value.name;
// NOTE(review): unreachable as written, because the statement above always returns or throws.
// This listing looks like a diff rendered without +/- markers, so presumably this FirstOrDefault
// line replaces the SingleOrDefault line in the real file — confirm against the repository.
// FirstOrDefault takes the first matching entry and does not throw when several entries match.
return GlyphIdToStringIdAndName.FirstOrDefault(x => x.Value.stringId == stringId).Value.name;
}
public virtual int GetStringIdByGlyphId(int glyphId)

View File

@ -44,6 +44,25 @@
return handler.Generate(dictionary);
}
// Try simple font recovery:
NameToken[] orderedFallbacks = [NameToken.Type1, NameToken.TrueType];
foreach (var fallback in orderedFallbacks)
{
if (!handlers.TryGetValue(fallback, out handler))
{
continue;
}
try
{
return handler.Generate(dictionary);
}
catch (Exception ex)
{
log?.Error($"Tried to parse font as fallback type: {fallback}", ex);
}
}
throw new NotImplementedException($"Parsing not implemented for fonts of type: {subtype}, please submit a pull request or an issue.");
}
}