Fix for #662: when encountering invalid content, try to continue parsing

if option "useLenientParsing" is in effect.
This commit is contained in:
Mark van 't Zet 2023-09-12 17:25:58 +02:00 committed by BobLd
parent d59d2c61a0
commit e3f281435a
4 changed files with 40 additions and 4 deletions

View File

@ -0,0 +1,33 @@
namespace UglyToad.PdfPig.Tests.Integration
{
    using System;
    using Xunit;

    /// <summary>
    /// Integration tests that open the "invalid-operator.pdf" test document and check how
    /// reading page 1 behaves with <c>UseLenientParsing</c> switched off and on.
    /// </summary>
    public class InvalidOperatorTests
    {
        /// <summary>
        /// With lenient parsing disabled, getting page 1 is expected to throw
        /// an <see cref="ArgumentException"/>.
        /// </summary>
        [Fact]
        public void InvalidOperatorThrowsExceptionIfNotUsingLenientParsing()
        {
            var path = IntegrationHelpers.GetSpecificTestDocumentPath("invalid-operator.pdf");

            using (var document = PdfDocument.Open(path, new ParsingOptions { UseLenientParsing = false }))
            {
                Assert.Throws<ArgumentException>(() => document.GetPage(1));
            }
        }

        /// <summary>
        /// With lenient parsing enabled, getting page 1 must not throw and the page text
        /// must contain both expected text lines.
        /// </summary>
        [Fact]
        public void InvalidOperatorDoesNotThrowExceptionIfUsingLenientParsing()
        {
            var path = IntegrationHelpers.GetSpecificTestDocumentPath("invalid-operator.pdf");

            using (var document = PdfDocument.Open(path, new ParsingOptions { UseLenientParsing = true }))
            {
                var text = document.GetPage(1).Text;

                // Assert.Contains reports the missing substring and the actual text on failure,
                // unlike Assert.True(text.Contains(...)) which only reports "False".
                // Ordinal comparison matches the semantics of string.Contains(string).
                Assert.Contains("Text line 1", text, StringComparison.Ordinal);
                Assert.Contains("Text line 2", text, StringComparison.Ordinal);
            }
        }
    }
}

View File

@ -15,10 +15,12 @@
internal class PageContentParser : IPageContentParser
{
private readonly IGraphicsStateOperationFactory operationFactory;
private readonly bool useLenientParsing;
public PageContentParser(IGraphicsStateOperationFactory operationFactory)
public PageContentParser(IGraphicsStateOperationFactory operationFactory, bool useLenientParsing = false)
{
this.operationFactory = operationFactory;
this.useLenientParsing = useLenientParsing;
}
public IReadOnlyList<IGraphicsStateOperation> Parse(
@ -116,9 +118,10 @@
catch (Exception ex)
{
// End images can cause weird state if the "EI" appears inside the inline data stream.
if (TryGetLastEndImage(graphicsStateOperations, out _, out _))
log.Error($"Failed reading operation at offset {inputBytes.CurrentOffset} for page {pageNumber}, data: '{op.Data}'", ex);
if (TryGetLastEndImage(graphicsStateOperations, out _, out _)
|| useLenientParsing)
{
log.Error($"Failed reading an operation at offset {inputBytes.CurrentOffset} for page {pageNumber}.", ex);
operation = null;
}
else

View File

@ -195,7 +195,7 @@
parsingOptions.UseLenientParsing);
var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider,
new PageContentParser(new ReflectionGraphicsStateOperationFactory()), parsingOptions.Logger);
new PageContentParser(new ReflectionGraphicsStateOperationFactory(), parsingOptions.UseLenientParsing), parsingOptions.Logger);
var catalog = CatalogFactory.Create(
rootReference,