mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-07 16:14:03 +08:00
Address #672 to ignore errors while reading the descriptor file in CidFontFactory
This commit is contained in:
BIN
src/UglyToad.PdfPig.Tests/Integration/Documents/Test.Doc.pdf
Normal file
BIN
src/UglyToad.PdfPig.Tests/Integration/Documents/Test.Doc.pdf
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
74
src/UglyToad.PdfPig.Tests/Integration/OpenTypeFontTests.cs
Normal file
74
src/UglyToad.PdfPig.Tests/Integration/OpenTypeFontTests.cs
Normal file
@@ -0,0 +1,74 @@
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
||||
using Xunit;
|
||||
|
||||
namespace UglyToad.PdfPig.Tests.Integration
|
||||
{
|
||||
public class OpenTypeFontTests
|
||||
{
|
||||
/// <summary>
/// Regression test for issue #672: opens the "Why.does.this.not.work" document,
/// extracts the words of page 1, groups them into text lines and checks that
/// exactly three lines with the expected text are produced.
/// </summary>
[Fact]
|
||||
public void Issue672()
|
||||
{
|
||||
// NB: The issue is not fully fixed: the changes only allow the document
|
||||
// to be parsed and its text to be extracted without error;
|
||||
// the embedded font data is still not properly parsed.
|
||||
// It seems the font bytes are incorrectly parsed using the TrueTypeFontParser
|
||||
// and are actually parsable with CompactFontFormatParser, though with some errors.
|
||||
|
||||
using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("Why.does.this.not.work")))
|
||||
{
|
||||
var page = document.GetPage(1);
|
||||
|
||||
// Build words from the page letters.
var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);
|
||||
|
||||
// Flatten the text lines of every block returned by the segmenter.
var lines = DocstrumBoundingBoxes.Instance.GetBlocks(words).SelectMany(b => b.TextLines).ToArray();
|
||||
|
||||
Assert.Equal(3, lines.Length);
|
||||
|
||||
Assert.Equal("THIS TEST SEEMS TO BREAK THE PARSER....", lines[0].Text);
|
||||
Assert.Equal("This is just some test text.", lines[1].Text);
|
||||
Assert.Equal("SO DOES THIS", lines[2].Text);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Companion test to <c>Issue672</c> using the "Test.Doc" document: extracts the
/// words of page 1, groups them into text lines and checks that exactly four
/// lines with the expected text are produced.
/// </summary>
[Fact]
|
||||
public void Issue672ok()
|
||||
{
|
||||
using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("Test.Doc")))
|
||||
{
|
||||
var page = document.GetPage(1);
|
||||
|
||||
// Build words from the page letters.
var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);
|
||||
|
||||
// Flatten the text lines of every block returned by the segmenter.
var lines = DocstrumBoundingBoxes.Instance.GetBlocks(words).SelectMany(b => b.TextLines).ToArray();
|
||||
|
||||
Assert.Equal(4, lines.Length);
|
||||
|
||||
// The expected strings are compared verbatim against each line's Text.
Assert.Equal("This is just a bunch of boring text...", lines[0].Text);
|
||||
Assert.Equal("THIS IS SOME SEMPLICITA PRO FONT", lines[1].Text);
|
||||
Assert.Equal("Hopefully font that are not embedded on the server.", lines[2].Text);
|
||||
Assert.Equal("And a bit of Verdana for good measure.", lines[3].Text);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Opens the "test-2_so_74165171" document, extracts the words of page 1 and
/// checks that exactly two words are produced. Only the word count is asserted,
/// not the word text.
/// </summary>
[Fact]
|
||||
public void So74165171()
|
||||
{
|
||||
// https://stackoverflow.com/questions/74165171/embedded-opentype-cff-font-in-a-pdf-shows-strange-behaviour-in-some-viewers
|
||||
|
||||
// Adding this test case as the OpenType font is correctly parsed using TrueTypeFontParser.
|
||||
// It seems there are further issues with the extracted text (also the case in Acrobat Reader).
|
||||
// Out of scope for the moment.
|
||||
|
||||
using (var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("test-2_so_74165171")))
|
||||
{
|
||||
var page = document.GetPage(1);
|
||||
|
||||
// Materialise the words so they can be counted.
var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters).ToArray();
|
||||
|
||||
Assert.Equal(2, words.Length);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -158,7 +158,11 @@
|
||||
|
||||
pdfScanner.UpdateEncryptionHandler(encryptionHandler);
|
||||
|
||||
var cidFontFactory = new CidFontFactory(pdfScanner, filterProvider);
|
||||
var cidFontFactory = new CidFontFactory(
|
||||
parsingOptions.Logger,
|
||||
pdfScanner,
|
||||
filterProvider);
|
||||
|
||||
var encodingReader = new EncodingReader(pdfScanner);
|
||||
|
||||
var type0Handler = new Type0FontHandler(
|
||||
|
@@ -13,21 +13,24 @@
|
||||
using PdfPig.Parser.Parts;
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
using UglyToad.PdfPig.Logging;
|
||||
using Util;
|
||||
|
||||
internal class CidFontFactory
|
||||
{
|
||||
private readonly ILookupFilterProvider filterProvider;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
private readonly ILog logger;
|
||||
|
||||
public CidFontFactory(IPdfTokenScanner pdfScanner, ILookupFilterProvider filterProvider)
|
||||
public CidFontFactory(ILog log, IPdfTokenScanner pdfScanner, ILookupFilterProvider filterProvider)
|
||||
{
|
||||
this.logger = log;
|
||||
this.pdfScanner = pdfScanner;
|
||||
this.filterProvider = filterProvider;
|
||||
}
|
||||
|
||||
public ICidFont Generate(DictionaryToken dictionary)
|
||||
{
|
||||
{
|
||||
var type = dictionary.GetNameOrDefault(NameToken.Type);
|
||||
if (!NameToken.Font.Equals(type))
|
||||
{
|
||||
@@ -50,7 +53,15 @@
|
||||
descriptor = FontDescriptorFactory.Generate(descriptorDictionary, pdfScanner);
|
||||
}
|
||||
|
||||
var fontProgram = ReadDescriptorFile(descriptor);
|
||||
ICidFontProgram fontProgram = null;
|
||||
try
|
||||
{
|
||||
fontProgram = ReadDescriptorFile(descriptor);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
logger.Error($"Invalid descriptor in CID font named '{descriptor?.FontName}': {ex.Message}.");
|
||||
}
|
||||
|
||||
var baseFont = dictionary.GetNameOrDefault(NameToken.BaseFont);
|
||||
|
||||
@@ -74,25 +85,7 @@
|
||||
|
||||
private bool TryGetFontDescriptor(DictionaryToken dictionary, out DictionaryToken descriptorDictionary)
|
||||
{
|
||||
descriptorDictionary = null;
|
||||
|
||||
if (!dictionary.TryGet(NameToken.FontDescriptor, out var baseValue))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
var descriptor = DirectObjectFinder.Get<DictionaryToken>(baseValue, pdfScanner);
|
||||
|
||||
descriptorDictionary = descriptor;
|
||||
}
|
||||
catch
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
return dictionary.TryGet(NameToken.FontDescriptor, pdfScanner, out descriptorDictionary);
|
||||
}
|
||||
|
||||
private ICidFontProgram ReadDescriptorFile(FontDescriptor descriptor)
|
||||
@@ -267,14 +260,9 @@
|
||||
throw new InvalidFontFormatException($"No CID System Info was found in the CID Font dictionary: {dictionary}");
|
||||
}
|
||||
|
||||
if (cidEntry is DictionaryToken cidDictionary)
|
||||
if (!(cidEntry is DictionaryToken cidDictionary))
|
||||
{
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
cidDictionary =
|
||||
DirectObjectFinder.Get<DictionaryToken>(cidEntry, pdfScanner);
|
||||
cidDictionary = DirectObjectFinder.Get<DictionaryToken>(cidEntry, pdfScanner);
|
||||
}
|
||||
|
||||
var registry = SafeKeyAccess(cidDictionary, NameToken.Registry);
|
||||
|
Reference in New Issue
Block a user