Skip missing XObjects if SkipMissingFonts is true #298

If skip missing fonts is set, we want to read as much of the file
as possible, so we will also skip any missing XObjects such as
images, forms or PostScript code.
This commit is contained in:
Eliot Jones 2023-05-27 10:46:29 +01:00
parent 20d3cc9066
commit fba1cbc13c
7 changed files with 31 additions and 8 deletions

View File

@ -17,7 +17,7 @@
IFont GetFont(NameToken name);
StreamToken GetXObject(NameToken name);
bool TryGetXObject(NameToken name, out StreamToken stream);
DictionaryToken GetExtendedGraphicsStateDictionary(NameToken name);

View File

@ -8,7 +8,7 @@
using PdfFonts;
using Tokenization.Scanner;
using Tokens;
using UglyToad.PdfPig.Filters;
using Filters;
using Util;
internal class ResourceStore : IResourceStore
@ -322,10 +322,15 @@
throw new InvalidOperationException($"Could not find color space for token '{name}'.");
}
public StreamToken GetXObject(NameToken name)
public bool TryGetXObject(NameToken name, out StreamToken stream)
{
var reference = currentResourceState[name];
return DirectObjectFinder.Get<StreamToken>(new IndirectReferenceToken(reference), scanner);
stream = null;
if (!currentResourceState.TryGetValue(name, out var indirectReference))
{
return false;
}
return DirectObjectFinder.TryGet(new IndirectReferenceToken(indirectReference), scanner, out stream);
}
public DictionaryToken GetExtendedGraphicsStateDictionary(NameToken name)

View File

@ -82,7 +82,9 @@
{XObjectType.PostScript, new List<XObjectContentRecord>()}
};
public ContentStreamProcessor(IResourceStore resourceStore,
public ContentStreamProcessor(
int pageNumber,
IResourceStore resourceStore,
UserSpaceUnit userSpaceUnit,
MediaBox mediaBox,
CropBox cropBox,
@ -92,6 +94,7 @@
ILookupFilterProvider filterProvider,
InternalParsingOptions parsingOptions)
{
this.pageNumber = pageNumber;
this.resourceStore = resourceStore;
this.userSpaceUnit = userSpaceUnit;
this.rotation = rotation;
@ -418,7 +421,15 @@
public void ApplyXObject(NameToken xObjectName)
{
var xObjectStream = resourceStore.GetXObject(xObjectName);
if (!resourceStore.TryGetXObject(xObjectName, out var xObjectStream))
{
if (parsingOptions.SkipMissingFonts)
{
return;
}
throw new PdfDocumentFormatException($"No XObject with name {xObjectName} found on page {pageNumber}.");
}
// For now we will determine the type and store the object with the graphics state information preceding it.
// Then consumers of the page can request the object(s) to be retrieved by type.

View File

@ -16,6 +16,8 @@
public bool SkipMissingFonts { get; }
public bool SkipMissingXObjects { get; }
public ILog Logger { get; }
public InternalParsingOptions(
@ -23,12 +25,14 @@
bool useLenientParsing,
bool clipPaths,
bool skipMissingFonts,
bool skipMissingXObjects,
ILog logger)
{
Passwords = passwords;
UseLenientParsing = useLenientParsing;
ClipPaths = clipPaths;
SkipMissingFonts = skipMissingFonts;
SkipMissingXObjects = skipMissingXObjects;
Logger = logger;
}
}

View File

@ -163,6 +163,7 @@
parsingOptions.Logger);
var context = new ContentStreamProcessor(
pageNumber,
resourceStore,
userSpaceUnit,
mediaBox,

View File

@ -96,6 +96,7 @@
isLenientParsing,
clipPaths,
options?.SkipMissingFonts ?? false,
options?.SkipMissingFonts ?? false,
options?.Logger ?? new NoOpLog());
var document = OpenDocument(inputBytes, tokenScanner, finalOptions);

View File

@ -51,7 +51,8 @@
/// <summary>
/// Skip extracting content where the font could not be found, will result in some letters being skipped/missed
/// but will prevent the library throwing where the source PDF has some corrupted text.
/// but will prevent the library throwing where the source PDF has some corrupted text. Also skips XObjects like
/// forms and images when missing.
/// </summary>
public bool SkipMissingFonts { get; set; } = false;
}