diff --git a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs index c4f4af00..4c8c8cb3 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs @@ -3,6 +3,7 @@ using System; using System.IO; using System.Text.RegularExpressions; + using Logging; using PdfPig.Core; using PdfPig.Graphics; using PdfPig.Graphics.Core; @@ -19,6 +20,7 @@ public class PageContentParserTests { private readonly PageContentParser parser = new PageContentParser(new ReflectionGraphicsStateOperationFactory()); + private readonly ILog log = new NoOpLog(); [Fact] public void CorrectlyExtractsOperations() @@ -27,7 +29,7 @@ var content = File.ReadAllText(path); var input = StringBytesTestConverter.Convert(content, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); Assert.NotEmpty(result); } @@ -39,7 +41,7 @@ var content = File.ReadAllText(path); var input = StringBytesTestConverter.Convert(content, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); var replacementRegex = new Regex(@"\s(\.\d+)\b"); @@ -72,7 +74,7 @@ ET"; var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); using (var stream = new MemoryStream()) { @@ -102,7 +104,7 @@ ET"; ET"; var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); Assert.Equal(7, result.Count); @@ -138,7 +140,7 @@ ET"; var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); Assert.Equal(4, result.Count); @@ -163,7 +165,7 @@ cm BT 0.0001 Tc 19 0 0 19 0 0 Tm /Tc1 1 Tf ( \(sleep 1; printf ""QUIT\\r\\n""\ var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); Assert.Equal(9, result.Count); diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs index 911a4000..7a73d518 100644 --- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs +++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs @@ -220,6 +220,46 @@ customTokenizers.RemoveAll(x => ReferenceEquals(x.tokenizer, tokenizer)); } + /// + /// Handles the situation where "EI" was encountered in the inline image data but was + /// not the end of the image. + /// + /// The offset of the "E" of the "EI" marker which was incorrectly read. + /// The set of bytes from the incorrect "EI" to the correct "EI" including the incorrect "EI". + public IReadOnlyList RecoverFromIncorrectEndImage(long lastEndImageOffset) + { + var data = new List(); + + inputBytes.Seek(lastEndImageOffset); + + if (!inputBytes.MoveNext() || inputBytes.CurrentByte != 'E') + { + var message = $"Failed to recover the image data stream for an inline image at offset {lastEndImageOffset}. " + + $"Expected to read byte 'E' instead got {inputBytes.CurrentByte}."; + + throw new PdfDocumentFormatException(message); + } + + data.Add(inputBytes.CurrentByte); + + if (!inputBytes.MoveNext() || inputBytes.CurrentByte != 'I') + { + var message = $"Failed to recover the image data stream for an inline image at offset {lastEndImageOffset}. " + + $"Expected to read second byte 'I' following 'E' instead got {inputBytes.CurrentByte}."; + + throw new PdfDocumentFormatException(message); + } + + data.Add(inputBytes.CurrentByte); + + data.AddRange(ReadUntilEndImage(lastEndImageOffset)); + + // Skip beyond the 'I' in the "EI" token we just read so the scanner is in a valid position. + inputBytes.MoveNext(); + + return data; + } + private IReadOnlyList ReadInlineImageData() { // The ID operator should be followed by a single white-space character, and the next character is interpreted @@ -231,9 +271,14 @@ var startsAt = inputBytes.CurrentOffset - 2; + return ReadUntilEndImage(startsAt); + } + + private List ReadUntilEndImage(long startsAt) + { const byte lastPlainText = 127; const byte space = 32; - + var imageData = new List(); byte prevByte = 0; diff --git a/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs b/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs index 90bdfa82..6e5b96b9 100644 --- a/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs +++ b/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs @@ -47,6 +47,11 @@ /// Rotation in degrees clockwise. public PageRotationDegrees(int rotation) { + if (rotation < 0) + { + rotation = 360 + rotation; + } + while (rotation >= 360) { rotation -= 360; diff --git a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs index a08cff88..2f029409 100644 --- a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs +++ b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs @@ -383,7 +383,7 @@ var contentStream = formStream.Decode(filterProvider); - var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream)); + var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream), log); // 3. We don't respect clipping currently. diff --git a/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs b/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs index 7f01434a..49b9acb0 100644 --- a/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs +++ b/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs @@ -385,6 +385,11 @@ namespace UglyToad.PdfPig.Graphics foreach (var parameter in parameters) { + if (offset >= operands.Count) + { + throw new InvalidOperationException($"Fewer operands {operands.Count} found than required ({offset + 1}) for operator: {op.Data}."); + } + if (parameter.ParameterType == typeof(decimal)) { if (operands[offset] is NumericToken numeric) diff --git a/src/UglyToad.PdfPig/Parser/IPageContentParser.cs b/src/UglyToad.PdfPig/Parser/IPageContentParser.cs index 3f011f7f..c7817b2f 100644 --- a/src/UglyToad.PdfPig/Parser/IPageContentParser.cs +++ b/src/UglyToad.PdfPig/Parser/IPageContentParser.cs @@ -3,9 +3,11 @@ using System.Collections.Generic; using Core; using Graphics.Operations; + using Logging; internal interface IPageContentParser { - IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes); + IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes, + ILog log); } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Parser/PageContentParser.cs b/src/UglyToad.PdfPig/Parser/PageContentParser.cs index c18cc2ff..d20ba750 100644 --- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs +++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs @@ -7,6 +7,7 @@ using Graphics; using Graphics.Operations; using Graphics.Operations.InlineImages; + using Logging; using Tokenization.Scanner; using Tokens; @@ -19,7 +20,8 @@ this.operationFactory = operationFactory; } - public IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes) + public IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes, + ILog log) { var scanner = new CoreTokenScanner(inputBytes); @@ -75,6 +77,8 @@ // Work out how much data we missed between the false EI operator and the actual one. var actualEndImageOffset = scanner.CurrentPosition - 3; + log.Warn($"End inline image (EI) encountered after previous EI, attempting recovery at {actualEndImageOffset}."); + var gap = (int)(actualEndImageOffset - lastEndImageOffset); var from = inputBytes.CurrentOffset; @@ -101,12 +105,52 @@ } else { - var operation = operationFactory.Create(op, precedingTokens); + IGraphicsStateOperation operation; + try + { + operation = operationFactory.Create(op, precedingTokens); + } + catch (Exception ex) + { + var lastWasEndImage = graphicsStateOperations.Count > 0 + && graphicsStateOperations[graphicsStateOperations.Count - 1] is EndInlineImage; + + // End images can cause weird state if the "EI" appears inside the inline data stream. + if (lastWasEndImage) + { + log.Error($"Failed reading an operation at offset {inputBytes.CurrentOffset} for page {pageNumber}.", ex); + operation = null; + } + else + { + throw; + } + } if (operation != null) { graphicsStateOperations.Add(operation); } + else if (graphicsStateOperations.Count > 0) + { + var lastToken = graphicsStateOperations[graphicsStateOperations.Count - 1]; + + if (lastToken is EndInlineImage prevEndInlineImage && lastEndImageOffset.HasValue) + { + log.Warn($"Operator {op.Data} was not understood following end of inline image data at {lastEndImageOffset}, " + + "attempting recovery."); + + var nextByteSet = scanner.RecoverFromIncorrectEndImage(lastEndImageOffset.Value); + graphicsStateOperations.RemoveAt(graphicsStateOperations.Count - 1); + var newEndInlineImage = new EndInlineImage(prevEndInlineImage.ImageData.Concat(nextByteSet).ToList()); + graphicsStateOperations.Add(newEndInlineImage); + lastEndImageOffset = scanner.CurrentPosition - 2; + } + else + { + log.Warn($"Operator which was not understood encountered. Values was {op.Data}. Ignoring."); + } + } } precedingTokens.Clear(); diff --git a/src/UglyToad.PdfPig/Parser/PageFactory.cs b/src/UglyToad.PdfPig/Parser/PageFactory.cs index 4d086ccb..11c050ed 100644 --- a/src/UglyToad.PdfPig/Parser/PageFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PageFactory.cs @@ -141,7 +141,8 @@ PageRotationDegrees rotation, bool isLenientParsing) { - var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes)); + var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes), + log); var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, isLenientParsing, pdfScanner, pageContentParser,