Tolerate content-stream operations that fail to parse when lenient parsing is enabled.

PageContentParser now takes a useLenientParsing flag (default false). A failed operation is
always logged with its offset, page number and operator data; the operation is set to null
when the last operation is an end-image or when useLenientParsing is true.

diff --git a/src/UglyToad.PdfPig.Tests/Integration/InvalidOperator.cs b/src/UglyToad.PdfPig.Tests/Integration/InvalidOperator.cs
new file mode 100644
index 00000000..da07b605
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Integration/InvalidOperator.cs
@@ -0,0 +1,45 @@
+namespace UglyToad.PdfPig.Tests.Integration
+{
+    using System;
+    using Xunit;
+
+    /// <summary>
+    /// Integration tests for parsing <c>invalid-operator.pdf</c> with and without lenient parsing.
+    /// </summary>
+    public class InvalidOperatorTests
+    {
+        /// <summary>
+        /// With <c>UseLenientParsing = false</c>, reading page 1 is expected to throw.
+        /// </summary>
+        [Fact]
+        public void InvalidOperatorThrowsExceptionIfNotUsingLenientParsing()
+        {
+            var path = IntegrationHelpers.GetSpecificTestDocumentPath("invalid-operator.pdf");
+
+            using (var document = PdfDocument.Open(path, new ParsingOptions { UseLenientParsing = false }))
+            {
+                // NOTE(review): exact exception type not confirmed; ThrowsAny<Exception> accepts
+                // any thrown type - tighten to the concrete type once it is known.
+                Assert.ThrowsAny<Exception>(() => document.GetPage(1));
+            }
+        }
+
+        /// <summary>
+        /// With <c>UseLenientParsing = true</c>, page 1 loads and both text lines are extracted.
+        /// </summary>
+        [Fact]
+        public void InvalidOperatorDoesNotThrowExceptionIfUsingLenientParsing()
+        {
+            var path = IntegrationHelpers.GetSpecificTestDocumentPath("invalid-operator.pdf");
+
+            using (var document = PdfDocument.Open(path, new ParsingOptions { UseLenientParsing = true }))
+            {
+                var page = document.GetPage(1);
+                var text = page.Text;
+
+                Assert.Contains("Text line 1", text);
+                Assert.Contains("Text line 2", text);
+            }
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/invalid-operator.pdf b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/invalid-operator.pdf
new file mode 100644
index 00000000..63563836
Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/invalid-operator.pdf differ
diff --git a/src/UglyToad.PdfPig/Parser/PageContentParser.cs b/src/UglyToad.PdfPig/Parser/PageContentParser.cs
index bd253e7b..e8822e4c 100644
--- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs
+++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs
@@ -15,10 +15,12 @@
     internal class PageContentParser : IPageContentParser
     {
         private readonly IGraphicsStateOperationFactory operationFactory;
+        private readonly bool useLenientParsing;
 
-        public PageContentParser(IGraphicsStateOperationFactory operationFactory)
+        public PageContentParser(IGraphicsStateOperationFactory operationFactory, bool useLenientParsing = false)
         {
             this.operationFactory = operationFactory;
+            this.useLenientParsing = useLenientParsing;
         }
 
         public IReadOnlyList<IGraphicsStateOperation> Parse(
@@ -116,9 +118,10 @@
                     catch (Exception ex)
                     {
                         // End images can cause weird state if the "EI" appears inside the inline data stream.
-                        if (TryGetLastEndImage(graphicsStateOperations, out _, out _))
+                        log.Error($"Failed reading operation at offset {inputBytes.CurrentOffset} for page {pageNumber}, data: '{op.Data}'", ex);
+                        if (TryGetLastEndImage(graphicsStateOperations, out _, out _)
+                            || useLenientParsing)
                         {
-                            log.Error($"Failed reading an operation at offset {inputBytes.CurrentOffset} for page {pageNumber}.", ex);
                             operation = null;
                         }
                         else
diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
index 9f6ded48..121a0e0e 100644
--- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
@@ -195,7 +195,7 @@
                 parsingOptions.UseLenientParsing);
 
             var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider,
-                new PageContentParser(new ReflectionGraphicsStateOperationFactory()), parsingOptions.Logger);
+                new PageContentParser(new ReflectionGraphicsStateOperationFactory(), parsingOptions.UseLenientParsing), parsingOptions.Logger);
 
             var catalog = CatalogFactory.Create(
                 rootReference,