Address #672 to ignore errors while reading the descriptor file in CidFontFactory

This commit is contained in:
BobLd
2023-08-05 13:00:31 +01:00
parent 8a82500427
commit 9aaf20ceb4
6 changed files with 96 additions and 30 deletions

View File

@@ -0,0 +1,74 @@
using System.Linq;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
using Xunit;
namespace UglyToad.PdfPig.Tests.Integration
{
public class OpenTypeFontTests
{
    /// <summary>
    /// Opens the named integration document, extracts the words of page 1 with the
    /// nearest-neighbour word extractor, segments them with Docstrum and returns the
    /// text of every resulting text line, in block order.
    /// </summary>
    /// <param name="documentName">Name handed to <c>IntegrationHelpers.GetDocumentPath</c>.</param>
    /// <returns>The text of each line found on the first page.</returns>
    private static string[] ReadFirstPageLineTexts(string documentName)
    {
        var path = IntegrationHelpers.GetDocumentPath(documentName);
        using (var pdf = PdfDocument.Open(path))
        {
            var firstPage = pdf.GetPage(1);
            var extractedWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);
            var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(extractedWords);

            // Materialise the strings while the document is still open.
            return blocks.SelectMany(block => block.TextLines)
                         .Select(line => line.Text)
                         .ToArray();
        }
    }

    [Fact]
    public void Issue672()
    {
        // NB: the issue is not fully fixed. The changes only allow the document to be
        // parsed and its text to be read without an error; the embedded font data is
        // still not parsed properly. The font bytes appear to be wrongly handed to the
        // TrueTypeFontParser, whereas CompactFontFormatParser can read them (albeit
        // with some errors).
        var texts = ReadFirstPageLineTexts("Why.does.this.not.work");

        Assert.Equal(3, texts.Length);
        Assert.Equal("THIS TEST SEEMS TO BREAK THE PARSER....", texts[0]);
        Assert.Equal("This is just some test text.", texts[1]);
        Assert.Equal("SO DOES THIS", texts[2]);
    }

    [Fact]
    public void Issue672ok()
    {
        var texts = ReadFirstPageLineTexts("Test.Doc");

        Assert.Equal(4, texts.Length);
        Assert.Equal("This is just a bunch of boring text...", texts[0]);
        Assert.Equal("THIS IS SOME SEMPLICITA PRO FONT", texts[1]);
        Assert.Equal("Hopefully font that are not embedded on the server.", texts[2]);
        Assert.Equal("And a bit of Verdana for good measure.", texts[3]);
    }

    [Fact]
    public void So74165171()
    {
        // https://stackoverflow.com/questions/74165171/embedded-opentype-cff-font-in-a-pdf-shows-strange-behaviour-in-some-viewers
        // Kept as a test case because here the OpenType font is parsed correctly by
        // the TrueTypeFontParser. The extracted text seems to have further issues
        // (Acrobat Reader shows them too); those are out of scope for the moment.
        var path = IntegrationHelpers.GetDocumentPath("test-2_so_74165171");
        using (var pdf = PdfDocument.Open(path))
        {
            var firstPage = pdf.GetPage(1);
            var extractedWords = NearestNeighbourWordExtractor.Instance
                .GetWords(firstPage.Letters)
                .ToArray();

            Assert.Equal(2, extractedWords.Length);
        }
    }
}
}

View File

@@ -158,7 +158,11 @@
pdfScanner.UpdateEncryptionHandler(encryptionHandler);
var cidFontFactory = new CidFontFactory(pdfScanner, filterProvider);
var cidFontFactory = new CidFontFactory(
parsingOptions.Logger,
pdfScanner,
filterProvider);
var encodingReader = new EncodingReader(pdfScanner);
var type0Handler = new Type0FontHandler(

View File

@@ -13,21 +13,24 @@
using PdfPig.Parser.Parts;
using Tokenization.Scanner;
using Tokens;
using UglyToad.PdfPig.Logging;
using Util;
internal class CidFontFactory
{
private readonly ILookupFilterProvider filterProvider;
private readonly IPdfTokenScanner pdfScanner;
private readonly ILog logger;
public CidFontFactory(IPdfTokenScanner pdfScanner, ILookupFilterProvider filterProvider)
public CidFontFactory(ILog log, IPdfTokenScanner pdfScanner, ILookupFilterProvider filterProvider)
{
this.logger = log;
this.pdfScanner = pdfScanner;
this.filterProvider = filterProvider;
}
public ICidFont Generate(DictionaryToken dictionary)
{
{
var type = dictionary.GetNameOrDefault(NameToken.Type);
if (!NameToken.Font.Equals(type))
{
@@ -50,7 +53,15 @@
descriptor = FontDescriptorFactory.Generate(descriptorDictionary, pdfScanner);
}
var fontProgram = ReadDescriptorFile(descriptor);
ICidFontProgram fontProgram = null;
try
{
fontProgram = ReadDescriptorFile(descriptor);
}
catch (Exception ex)
{
logger.Error($"Invalid descriptor in CID font named '{descriptor?.FontName}': {ex.Message}.");
}
var baseFont = dictionary.GetNameOrDefault(NameToken.BaseFont);
@@ -74,25 +85,7 @@
private bool TryGetFontDescriptor(DictionaryToken dictionary, out DictionaryToken descriptorDictionary)
{
descriptorDictionary = null;
if (!dictionary.TryGet(NameToken.FontDescriptor, out var baseValue))
{
return false;
}
try
{
var descriptor = DirectObjectFinder.Get<DictionaryToken>(baseValue, pdfScanner);
descriptorDictionary = descriptor;
}
catch
{
return false;
}
return true;
return dictionary.TryGet(NameToken.FontDescriptor, pdfScanner, out descriptorDictionary);
}
private ICidFontProgram ReadDescriptorFile(FontDescriptor descriptor)
@@ -267,14 +260,9 @@
throw new InvalidFontFormatException($"No CID System Info was found in the CID Font dictionary: {dictionary}");
}
if (cidEntry is DictionaryToken cidDictionary)
if (!(cidEntry is DictionaryToken cidDictionary))
{
}
else
{
cidDictionary =
DirectObjectFinder.Get<DictionaryToken>(cidEntry, pdfScanner);
cidDictionary = DirectObjectFinder.Get<DictionaryToken>(cidEntry, pdfScanner);
}
var registry = SafeKeyAccess(cidDictionary, NameToken.Registry);