diff --git a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
index c4f4af00..4c8c8cb3 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
@@ -3,6 +3,7 @@
using System;
using System.IO;
using System.Text.RegularExpressions;
+ using Logging;
using PdfPig.Core;
using PdfPig.Graphics;
using PdfPig.Graphics.Core;
@@ -19,6 +20,7 @@
public class PageContentParserTests
{
private readonly PageContentParser parser = new PageContentParser(new ReflectionGraphicsStateOperationFactory());
+ private readonly ILog log = new NoOpLog();
[Fact]
public void CorrectlyExtractsOperations()
@@ -27,7 +29,7 @@
var content = File.ReadAllText(path);
var input = StringBytesTestConverter.Convert(content, false);
- var result = parser.Parse(1, input.Bytes);
+ var result = parser.Parse(1, input.Bytes, log);
Assert.NotEmpty(result);
}
@@ -39,7 +41,7 @@
var content = File.ReadAllText(path);
var input = StringBytesTestConverter.Convert(content, false);
- var result = parser.Parse(1, input.Bytes);
+ var result = parser.Parse(1, input.Bytes, log);
var replacementRegex = new Regex(@"\s(\.\d+)\b");
@@ -72,7 +74,7 @@
ET";
var input = StringBytesTestConverter.Convert(s, false);
- var result = parser.Parse(1, input.Bytes);
+ var result = parser.Parse(1, input.Bytes, log);
using (var stream = new MemoryStream())
{
@@ -102,7 +104,7 @@ ET";
ET";
var input = StringBytesTestConverter.Convert(s, false);
- var result = parser.Parse(1, input.Bytes);
+ var result = parser.Parse(1, input.Bytes, log);
Assert.Equal(7, result.Count);
@@ -138,7 +140,7 @@ ET";
var input = StringBytesTestConverter.Convert(s, false);
- var result = parser.Parse(1, input.Bytes);
+ var result = parser.Parse(1, input.Bytes, log);
Assert.Equal(4, result.Count);
@@ -163,7 +165,7 @@ cm BT 0.0001 Tc 19 0 0 19 0 0 Tm /Tc1 1 Tf ( \(sleep 1; printf ""QUIT\\r\\n""\
var input = StringBytesTestConverter.Convert(s, false);
- var result = parser.Parse(1, input.Bytes);
+ var result = parser.Parse(1, input.Bytes, log);
Assert.Equal(9, result.Count);
diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
index 911a4000..7a73d518 100644
--- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
+++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
@@ -220,6 +220,46 @@
customTokenizers.RemoveAll(x => ReferenceEquals(x.tokenizer, tokenizer));
}
+        /// <summary>
+        /// Handles the situation where "EI" was encountered in the inline image data but was
+        /// not the end of the image.
+        /// </summary>
+        /// <param name="lastEndImageOffset">The offset of the "E" of the "EI" marker which was incorrectly read.</param>
+        /// <returns>The set of bytes from the incorrect "EI" to the correct "EI" including the incorrect "EI".</returns>
+        /// <exception cref="PdfDocumentFormatException">
+        /// Thrown when the two bytes read after seeking to <paramref name="lastEndImageOffset"/> are not 'E' followed by 'I',
+        /// or when the input cannot be advanced to read them.
+        /// </exception>
+        public IReadOnlyList<byte> RecoverFromIncorrectEndImage(long lastEndImageOffset)
+        {
+            var data = new List<byte>();
+
+            // Rewind to the marker that was wrongly treated as the end of the image.
+            inputBytes.Seek(lastEndImageOffset);
+
+            if (!inputBytes.MoveNext() || inputBytes.CurrentByte != 'E')
+            {
+                var message = $"Failed to recover the image data stream for an inline image at offset {lastEndImageOffset}. " +
+                              $"Expected to read byte 'E' instead got {inputBytes.CurrentByte}.";
+
+                throw new PdfDocumentFormatException(message);
+            }
+
+            // The false "EI" is really image data, so it is kept as part of the recovered bytes.
+            data.Add(inputBytes.CurrentByte);
+
+            if (!inputBytes.MoveNext() || inputBytes.CurrentByte != 'I')
+            {
+                var message = $"Failed to recover the image data stream for an inline image at offset {lastEndImageOffset}. " +
+                              $"Expected to read second byte 'I' following 'E' instead got {inputBytes.CurrentByte}.";
+
+                throw new PdfDocumentFormatException(message);
+            }
+
+            data.Add(inputBytes.CurrentByte);
+
+            // Continue consuming image data until the next candidate "EI" is found.
+            data.AddRange(ReadUntilEndImage(lastEndImageOffset));
+
+            // Skip beyond the 'I' in the "EI" token we just read so the scanner is in a valid position.
+            inputBytes.MoveNext();
+
+            return data;
+        }
+
+
private IReadOnlyList ReadInlineImageData()
{
// The ID operator should be followed by a single white-space character, and the next character is interpreted
@@ -231,9 +271,14 @@
var startsAt = inputBytes.CurrentOffset - 2;
+ return ReadUntilEndImage(startsAt);
+ }
+
+ private List ReadUntilEndImage(long startsAt)
+ {
const byte lastPlainText = 127;
const byte space = 32;
-
+
var imageData = new List();
byte prevByte = 0;
diff --git a/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs b/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs
index 90bdfa82..6e5b96b9 100644
--- a/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs
+++ b/src/UglyToad.PdfPig/Content/PageRotationDegrees.cs
@@ -47,6 +47,11 @@
/// Rotation in degrees clockwise.
public PageRotationDegrees(int rotation)
{
+            // Loop rather than add 360 once: a value below -360 (for example -450) would otherwise
+            // still be negative, and the "rotation >= 360" loop that follows never corrects it.
+            while (rotation < 0)
+            {
+                rotation += 360;
+            }
+
while (rotation >= 360)
{
rotation -= 360;
diff --git a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
index a08cff88..2f029409 100644
--- a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
+++ b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
@@ -383,7 +383,7 @@
var contentStream = formStream.Decode(filterProvider);
- var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream));
+ var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream), log);
// 3. We don't respect clipping currently.
diff --git a/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs b/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs
index 7f01434a..49b9acb0 100644
--- a/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs
+++ b/src/UglyToad.PdfPig/Graphics/ReflectionGraphicsStateOperationFactory.cs
@@ -385,6 +385,11 @@ namespace UglyToad.PdfPig.Graphics
foreach (var parameter in parameters)
{
+ if (offset >= operands.Count)
+ {
+ throw new InvalidOperationException($"Fewer operands {operands.Count} found than required ({offset + 1}) for operator: {op.Data}.");
+ }
+
if (parameter.ParameterType == typeof(decimal))
{
if (operands[offset] is NumericToken numeric)
diff --git a/src/UglyToad.PdfPig/Parser/IPageContentParser.cs b/src/UglyToad.PdfPig/Parser/IPageContentParser.cs
index 3f011f7f..c7817b2f 100644
--- a/src/UglyToad.PdfPig/Parser/IPageContentParser.cs
+++ b/src/UglyToad.PdfPig/Parser/IPageContentParser.cs
@@ -3,9 +3,11 @@
using System.Collections.Generic;
using Core;
using Graphics.Operations;
+ using Logging;
internal interface IPageContentParser
{
-        IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes);
+        /// <summary>
+        /// Parses the bytes of a content stream into the graphics state operations it contains.
+        /// </summary>
+        /// <param name="pageNumber">The number of the page the content belongs to.</param>
+        /// <param name="inputBytes">The content stream bytes to parse.</param>
+        /// <param name="log">Receives warnings and errors raised while parsing, for example when recovering from malformed inline image data.</param>
+        /// <returns>The operations read from <paramref name="inputBytes"/>.</returns>
+        IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes,
+            ILog log);
}
}
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig/Parser/PageContentParser.cs b/src/UglyToad.PdfPig/Parser/PageContentParser.cs
index c18cc2ff..d20ba750 100644
--- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs
+++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs
@@ -7,6 +7,7 @@
using Graphics;
using Graphics.Operations;
using Graphics.Operations.InlineImages;
+ using Logging;
using Tokenization.Scanner;
using Tokens;
@@ -19,7 +20,8 @@
this.operationFactory = operationFactory;
}
- public IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes)
+ public IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes,
+ ILog log)
{
var scanner = new CoreTokenScanner(inputBytes);
@@ -75,6 +77,8 @@
// Work out how much data we missed between the false EI operator and the actual one.
var actualEndImageOffset = scanner.CurrentPosition - 3;
+ log.Warn($"End inline image (EI) encountered after previous EI, attempting recovery at {actualEndImageOffset}.");
+
var gap = (int)(actualEndImageOffset - lastEndImageOffset);
var from = inputBytes.CurrentOffset;
@@ -101,12 +105,52 @@
}
else
{
- var operation = operationFactory.Create(op, precedingTokens);
+ IGraphicsStateOperation operation;
+ try
+ {
+ operation = operationFactory.Create(op, precedingTokens);
+ }
+ catch (Exception ex)
+ {
+ var lastWasEndImage = graphicsStateOperations.Count > 0
+ && graphicsStateOperations[graphicsStateOperations.Count - 1] is EndInlineImage;
+
+ // End images can cause weird state if the "EI" appears inside the inline data stream.
+ if (lastWasEndImage)
+ {
+ log.Error($"Failed reading an operation at offset {inputBytes.CurrentOffset} for page {pageNumber}.", ex);
+ operation = null;
+ }
+ else
+ {
+ throw;
+ }
+ }
if (operation != null)
{
graphicsStateOperations.Add(operation);
}
+                    else
+                    {
+                        // No operation was produced: either the factory returned null, or it threw directly
+                        // after an end-of-inline-image operation and the failure was logged by the catch block.
+                        var lastIndex = graphicsStateOperations.Count - 1;
+
+                        if (lastIndex >= 0
+                            && graphicsStateOperations[lastIndex] is EndInlineImage prevEndInlineImage
+                            && lastEndImageOffset.HasValue)
+                        {
+                            // The preceding "EI" was most likely part of the image data rather than its terminator,
+                            // so what followed was binary data misread as an operator.
+                            log.Warn($"Operator {op.Data} was not understood following end of inline image data at {lastEndImageOffset}, " +
+                                     "attempting recovery.");
+
+                            // Re-read from the false "EI" to the next one and extend the previous image with those bytes.
+                            var nextByteSet = scanner.RecoverFromIncorrectEndImage(lastEndImageOffset.Value);
+                            graphicsStateOperations.RemoveAt(lastIndex);
+                            var newEndInlineImage = new EndInlineImage(prevEndInlineImage.ImageData.Concat(nextByteSet).ToList());
+                            graphicsStateOperations.Add(newEndInlineImage);
+
+                            // NOTE(review): presumably the offset of the 'E' of the newly found "EI", given the scanner
+                            // has just moved past its 'I' - confirm against CurrentPosition semantics.
+                            lastEndImageOffset = scanner.CurrentPosition - 2;
+                        }
+                        else
+                        {
+                            // Also reached when the unknown operator is the very first one, so it is never dropped silently.
+                            log.Warn($"Operator which was not understood encountered. Value was {op.Data}. Ignoring.");
+                        }
+                    }
}
precedingTokens.Clear();
diff --git a/src/UglyToad.PdfPig/Parser/PageFactory.cs b/src/UglyToad.PdfPig/Parser/PageFactory.cs
index 4d086ccb..11c050ed 100644
--- a/src/UglyToad.PdfPig/Parser/PageFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PageFactory.cs
@@ -141,7 +141,8 @@
PageRotationDegrees rotation,
bool isLenientParsing)
{
- var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes));
+ var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes),
+ log);
var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, isLenientParsing, pdfScanner,
pageContentParser,