From ba09a13d08756e1e58da4599fcf04f5ea14e5eff Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Sat, 25 Jan 2020 15:53:08 +0000 Subject: [PATCH] More end image recovery logic. Since inline image data may contain the end image "EI" token inside the data stream, there is no reliable way to determine whether we have read all the data. For this reason, if we end up in an invalid state while parsing operations after we have read the end image token, we try to recover by reading from the previous end image token to the next end image token, if any. We supply log information to let the consumer know this is what we are doing. It is still not bullet-proof, but it should be good enough. Also support negative page rotation values by adding them to a 360 degree rotation, so -90 degrees clockwise becomes 270 degrees clockwise. --- .../Parser/PageContentParserTests.cs | 14 +++--- .../Scanner/CoreTokenScanner.cs | 47 +++++++++++++++++- .../Content/PageRotationDegrees.cs | 5 ++ .../Graphics/ContentStreamProcessor.cs | 2 +- ...ReflectionGraphicsStateOperationFactory.cs | 5 ++ .../Parser/IPageContentParser.cs | 4 +- .../Parser/PageContentParser.cs | 48 ++++++++++++++++++- src/UglyToad.PdfPig/Parser/PageFactory.cs | 3 +- 8 files changed, 116 insertions(+), 12 deletions(-) diff --git a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs index c4f4af00..4c8c8cb3 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs @@ -3,6 +3,7 @@ using System; using System.IO; using System.Text.RegularExpressions; + using Logging; using PdfPig.Core; using PdfPig.Graphics; using PdfPig.Graphics.Core; @@ -19,6 +20,7 @@ public class PageContentParserTests { private readonly PageContentParser parser = new PageContentParser(new ReflectionGraphicsStateOperationFactory()); + private readonly ILog log = new NoOpLog(); [Fact] public void CorrectlyExtractsOperations() @@ -27,7 +29,7 @@ var
content = File.ReadAllText(path); var input = StringBytesTestConverter.Convert(content, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); Assert.NotEmpty(result); } @@ -39,7 +41,7 @@ var content = File.ReadAllText(path); var input = StringBytesTestConverter.Convert(content, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); var replacementRegex = new Regex(@"\s(\.\d+)\b"); @@ -72,7 +74,7 @@ ET"; var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); using (var stream = new MemoryStream()) { @@ -102,7 +104,7 @@ ET"; ET"; var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); Assert.Equal(7, result.Count); @@ -138,7 +140,7 @@ ET"; var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); Assert.Equal(4, result.Count); @@ -163,7 +165,7 @@ cm BT 0.0001 Tc 19 0 0 19 0 0 Tm /Tc1 1 Tf ( \(sleep 1; printf ""QUIT\\r\\n""\ var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(1, input.Bytes); + var result = parser.Parse(1, input.Bytes, log); Assert.Equal(9, result.Count); diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs index 911a4000..7a73d518 100644 --- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs +++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs @@ -220,6 +220,46 @@ customTokenizers.RemoveAll(x => ReferenceEquals(x.tokenizer, tokenizer)); } + /// + /// Handles the situation where "EI" was encountered in the inline image data but was + /// not the end of the image. 
+ /// + /// The offset of the "E" of the "EI" marker which was incorrectly read. + /// The set of bytes from the incorrect "EI" to the correct "EI" including the incorrect "EI". + public IReadOnlyList RecoverFromIncorrectEndImage(long lastEndImageOffset) + { + var data = new List(); + + inputBytes.Seek(lastEndImageOffset); + + if (!inputBytes.MoveNext() || inputBytes.CurrentByte != 'E') + { + var message = $"Failed to recover the image data stream for an inline image at offset {lastEndImageOffset}. " + + $"Expected to read byte 'E' instead got {inputBytes.CurrentByte}."; + + throw new PdfDocumentFormatException(message); + } + + data.Add(inputBytes.CurrentByte); + + if (!inputBytes.MoveNext() || inputBytes.CurrentByte != 'I') + { + var message = $"Failed to recover the image data stream for an inline image at offset {lastEndImageOffset}. " + + $"Expected to read second byte 'I' following 'E' instead got {inputBytes.CurrentByte}."; + + throw new PdfDocumentFormatException(message); + } + + data.Add(inputBytes.CurrentByte); + + data.AddRange(ReadUntilEndImage(lastEndImageOffset)); + + // Skip beyond the 'I' in the "EI" token we just read so the scanner is in a valid position. + inputBytes.MoveNext(); + + return data; + } + private IReadOnlyList ReadInlineImageData() { // The ID operator should be followed by a single white-space character, and the next character is interpreted @@ -231,9 +271,14 @@ var startsAt = inputBytes.CurrentOffset - 2; + return ReadUntilEndImage(startsAt); + } + + private List ReadUntilEndImage(long startsAt) + { const byte lastPlainText = 127; const byte space = 32; - + var imageData = new List(); byte prevByte = 0; diff --git a/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs b/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs index 90bdfa82..6e5b96b9 100644 --- a/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs +++ b/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs @@ -47,6 +47,11 @@ /// Rotation in degrees clockwise. 
public PageRotationDegrees(int rotation) { + if (rotation < 0) + { + rotation = 360 + rotation; + } + while (rotation >= 360) { rotation -= 360; diff --git a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs index a08cff88..2f029409 100644 --- a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs +++ b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs @@ -383,7 +383,7 @@ var contentStream = formStream.Decode(filterProvider); - var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream)); + var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream), log); // 3. We don't respect clipping currently. diff --git a/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs b/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs index 7f01434a..49b9acb0 100644 --- a/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs +++ b/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs @@ -385,6 +385,11 @@ namespace UglyToad.PdfPig.Graphics foreach (var parameter in parameters) { + if (offset >= operands.Count) + { + throw new InvalidOperationException($"Fewer operands {operands.Count} found than required ({offset + 1}) for operator: {op.Data}."); + } + if (parameter.ParameterType == typeof(decimal)) { if (operands[offset] is NumericToken numeric) diff --git a/src/UglyToad.PdfPig/Parser/IPageContentParser.cs b/src/UglyToad.PdfPig/Parser/IPageContentParser.cs index 3f011f7f..c7817b2f 100644 --- a/src/UglyToad.PdfPig/Parser/IPageContentParser.cs +++ b/src/UglyToad.PdfPig/Parser/IPageContentParser.cs @@ -3,9 +3,11 @@ using System.Collections.Generic; using Core; using Graphics.Operations; + using Logging; internal interface IPageContentParser { - IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes); + IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes, + ILog log); } } 
\ No newline at end of file diff --git a/src/UglyToad.PdfPig/Parser/PageContentParser.cs b/src/UglyToad.PdfPig/Parser/PageContentParser.cs index c18cc2ff..d20ba750 100644 --- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs +++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs @@ -7,6 +7,7 @@ using Graphics; using Graphics.Operations; using Graphics.Operations.InlineImages; + using Logging; using Tokenization.Scanner; using Tokens; @@ -19,7 +20,8 @@ this.operationFactory = operationFactory; } - public IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes) + public IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes, + ILog log) { var scanner = new CoreTokenScanner(inputBytes); @@ -75,6 +77,8 @@ // Work out how much data we missed between the false EI operator and the actual one. var actualEndImageOffset = scanner.CurrentPosition - 3; + log.Warn($"End inline image (EI) encountered after previous EI, attempting recovery at {actualEndImageOffset}."); + var gap = (int)(actualEndImageOffset - lastEndImageOffset); var from = inputBytes.CurrentOffset; @@ -101,12 +105,52 @@ } else { - var operation = operationFactory.Create(op, precedingTokens); + IGraphicsStateOperation operation; + try + { + operation = operationFactory.Create(op, precedingTokens); + } + catch (Exception ex) + { + var lastWasEndImage = graphicsStateOperations.Count > 0 + && graphicsStateOperations[graphicsStateOperations.Count - 1] is EndInlineImage; + + // End images can cause weird state if the "EI" appears inside the inline data stream. 
+ if (lastWasEndImage) + { + log.Error($"Failed reading an operation at offset {inputBytes.CurrentOffset} for page {pageNumber}.", ex); + operation = null; + } + else + { + throw; + } + } if (operation != null) { graphicsStateOperations.Add(operation); } + else if (graphicsStateOperations.Count > 0) + { + var lastToken = graphicsStateOperations[graphicsStateOperations.Count - 1]; + + if (lastToken is EndInlineImage prevEndInlineImage && lastEndImageOffset.HasValue) + { + log.Warn($"Operator {op.Data} was not understood following end of inline image data at {lastEndImageOffset}, " + + "attempting recovery."); + + var nextByteSet = scanner.RecoverFromIncorrectEndImage(lastEndImageOffset.Value); + graphicsStateOperations.RemoveAt(graphicsStateOperations.Count - 1); + var newEndInlineImage = new EndInlineImage(prevEndInlineImage.ImageData.Concat(nextByteSet).ToList()); + graphicsStateOperations.Add(newEndInlineImage); + lastEndImageOffset = scanner.CurrentPosition - 2; + } + else + { + log.Warn($"Operator which was not understood encountered. Values was {op.Data}. Ignoring."); + } + } } precedingTokens.Clear(); diff --git a/src/UglyToad.PdfPig/Parser/PageFactory.cs b/src/UglyToad.PdfPig/Parser/PageFactory.cs index 4d086ccb..11c050ed 100644 --- a/src/UglyToad.PdfPig/Parser/PageFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PageFactory.cs @@ -141,7 +141,8 @@ PageRotationDegrees rotation, bool isLenientParsing) { - var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes)); + var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes), + log); var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, isLenientParsing, pdfScanner, pageContentParser,