more end image recovery logic

Since inline image data may contain the end image "EI" token inside the data stream, there's no reliable way to determine whether we've read all the data. For this reason, if we end up in an invalid state while parsing operations after we've read the end image token, we try to recover by reading from the previous token to the next end image token, if any. We supply log information to let the consumer know this is what we're doing. It's still not bullet-proof, but it should be good enough.

Also support negative page rotation values by adding them to a 360 degree rotation, so -90 degrees clockwise becomes 270 degrees clockwise.
This commit is contained in:
Eliot Jones
2020-01-25 15:53:08 +00:00
parent 3ac8d7ed91
commit ba09a13d08
8 changed files with 116 additions and 12 deletions

View File

@@ -3,6 +3,7 @@
using System;
using System.IO;
using System.Text.RegularExpressions;
using Logging;
using PdfPig.Core;
using PdfPig.Graphics;
using PdfPig.Graphics.Core;
@@ -19,6 +20,7 @@
public class PageContentParserTests
{
private readonly PageContentParser parser = new PageContentParser(new ReflectionGraphicsStateOperationFactory());
private readonly ILog log = new NoOpLog();
[Fact]
public void CorrectlyExtractsOperations()
@@ -27,7 +29,7 @@
var content = File.ReadAllText(path);
var input = StringBytesTestConverter.Convert(content, false);
var result = parser.Parse(1, input.Bytes);
var result = parser.Parse(1, input.Bytes, log);
Assert.NotEmpty(result);
}
@@ -39,7 +41,7 @@
var content = File.ReadAllText(path);
var input = StringBytesTestConverter.Convert(content, false);
var result = parser.Parse(1, input.Bytes);
var result = parser.Parse(1, input.Bytes, log);
var replacementRegex = new Regex(@"\s(\.\d+)\b");
@@ -72,7 +74,7 @@
ET";
var input = StringBytesTestConverter.Convert(s, false);
var result = parser.Parse(1, input.Bytes);
var result = parser.Parse(1, input.Bytes, log);
using (var stream = new MemoryStream())
{
@@ -102,7 +104,7 @@ ET";
ET";
var input = StringBytesTestConverter.Convert(s, false);
var result = parser.Parse(1, input.Bytes);
var result = parser.Parse(1, input.Bytes, log);
Assert.Equal(7, result.Count);
@@ -138,7 +140,7 @@ ET";
var input = StringBytesTestConverter.Convert(s, false);
var result = parser.Parse(1, input.Bytes);
var result = parser.Parse(1, input.Bytes, log);
Assert.Equal(4, result.Count);
@@ -163,7 +165,7 @@ cm BT 0.0001 Tc 19 0 0 19 0 0 Tm /Tc1 1 Tf ( \(sleep 1; printf ""QUIT\\r\\n""\
var input = StringBytesTestConverter.Convert(s, false);
var result = parser.Parse(1, input.Bytes);
var result = parser.Parse(1, input.Bytes, log);
Assert.Equal(9, result.Count);

View File

@@ -220,6 +220,46 @@
customTokenizers.RemoveAll(x => ReferenceEquals(x.tokenizer, tokenizer));
}
/// <summary>
/// Recovers from the case where an "EI" sequence inside inline image data was
/// read as the end of the image when it was not.
/// </summary>
/// <param name="lastEndImageOffset">The offset of the "E" of the "EI" marker which was incorrectly treated as the end of the image.</param>
/// <returns>The bytes from the incorrect "EI" to the correct "EI", including the incorrect "EI" itself.</returns>
/// <exception cref="PdfDocumentFormatException">The bytes read after seeking to <paramref name="lastEndImageOffset"/> are not 'E' followed by 'I'.</exception>
public IReadOnlyList<byte> RecoverFromIncorrectEndImage(long lastEndImageOffset)
{
    // Rewind so the falsely detected marker is re-read as plain image data.
    inputBytes.Seek(lastEndImageOffset);

    var hasFirstByte = inputBytes.MoveNext();
    if (!hasFirstByte || inputBytes.CurrentByte != 'E')
    {
        throw new PdfDocumentFormatException(
            $"Failed to recover the image data stream for an inline image at offset {lastEndImageOffset}. " +
            $"Expected to read byte 'E' instead got {inputBytes.CurrentByte}.");
    }

    // The false marker is part of the image data, so it is kept in the result.
    var recovered = new List<byte> { inputBytes.CurrentByte };

    var hasSecondByte = inputBytes.MoveNext();
    if (!hasSecondByte || inputBytes.CurrentByte != 'I')
    {
        throw new PdfDocumentFormatException(
            $"Failed to recover the image data stream for an inline image at offset {lastEndImageOffset}. " +
            $"Expected to read second byte 'I' following 'E' instead got {inputBytes.CurrentByte}.");
    }

    recovered.Add(inputBytes.CurrentByte);

    // Continue reading image data until the next end image marker is found.
    var remainder = ReadUntilEndImage(lastEndImageOffset);
    recovered.AddRange(remainder);

    // Step past the 'I' of the "EI" token just read so the scanner is left in a valid position.
    inputBytes.MoveNext();

    return recovered;
}
private IReadOnlyList<byte> ReadInlineImageData()
{
// The ID operator should be followed by a single white-space character, and the next character is interpreted
@@ -231,9 +271,14 @@
var startsAt = inputBytes.CurrentOffset - 2;
return ReadUntilEndImage(startsAt);
}
private List<byte> ReadUntilEndImage(long startsAt)
{
const byte lastPlainText = 127;
const byte space = 32;
var imageData = new List<byte>();
byte prevByte = 0;

View File

@@ -47,6 +47,11 @@
/// <param name="rotation">Rotation in degrees clockwise.</param>
public PageRotationDegrees(int rotation)
{
if (rotation < 0)
{
rotation = 360 + rotation;
}
while (rotation >= 360)
{
rotation -= 360;

View File

@@ -383,7 +383,7 @@
var contentStream = formStream.Decode(filterProvider);
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream));
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream), log);
// 3. We don't respect clipping currently.

View File

@@ -385,6 +385,11 @@ namespace UglyToad.PdfPig.Graphics
foreach (var parameter in parameters)
{
if (offset >= operands.Count)
{
throw new InvalidOperationException($"Fewer operands {operands.Count} found than required ({offset + 1}) for operator: {op.Data}.");
}
if (parameter.ParameterType == typeof(decimal))
{
if (operands[offset] is NumericToken numeric)

View File

@@ -3,9 +3,11 @@
using System.Collections.Generic;
using Core;
using Graphics.Operations;
using Logging;
/// <summary>
/// Contract for turning the input bytes of a numbered page into a read-only list of
/// graphics state operations.
/// </summary>
// NOTE(review): two Parse signatures are listed below; this looks like the removed and the
// added line of a diff shown together - confirm which signature(s) the interface really declares.
internal interface IPageContentParser
{
IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes);
IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes,
ILog log);
}
}

View File

@@ -7,6 +7,7 @@
using Graphics;
using Graphics.Operations;
using Graphics.Operations.InlineImages;
using Logging;
using Tokenization.Scanner;
using Tokens;
@@ -19,7 +20,8 @@
this.operationFactory = operationFactory;
}
public IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes)
public IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes,
ILog log)
{
var scanner = new CoreTokenScanner(inputBytes);
@@ -75,6 +77,8 @@
// Work out how much data we missed between the false EI operator and the actual one.
var actualEndImageOffset = scanner.CurrentPosition - 3;
log.Warn($"End inline image (EI) encountered after previous EI, attempting recovery at {actualEndImageOffset}.");
var gap = (int)(actualEndImageOffset - lastEndImageOffset);
var from = inputBytes.CurrentOffset;
@@ -101,12 +105,52 @@
}
else
{
var operation = operationFactory.Create(op, precedingTokens);
IGraphicsStateOperation operation;
try
{
operation = operationFactory.Create(op, precedingTokens);
}
catch (Exception ex)
{
var lastWasEndImage = graphicsStateOperations.Count > 0
&& graphicsStateOperations[graphicsStateOperations.Count - 1] is EndInlineImage;
// End images can cause weird state if the "EI" appears inside the inline data stream.
if (lastWasEndImage)
{
log.Error($"Failed reading an operation at offset {inputBytes.CurrentOffset} for page {pageNumber}.", ex);
operation = null;
}
else
{
throw;
}
}
if (operation != null)
{
graphicsStateOperations.Add(operation);
}
else if (graphicsStateOperations.Count > 0)
{
var lastToken = graphicsStateOperations[graphicsStateOperations.Count - 1];
if (lastToken is EndInlineImage prevEndInlineImage && lastEndImageOffset.HasValue)
{
log.Warn($"Operator {op.Data} was not understood following end of inline image data at {lastEndImageOffset}, " +
"attempting recovery.");
var nextByteSet = scanner.RecoverFromIncorrectEndImage(lastEndImageOffset.Value);
graphicsStateOperations.RemoveAt(graphicsStateOperations.Count - 1);
var newEndInlineImage = new EndInlineImage(prevEndInlineImage.ImageData.Concat(nextByteSet).ToList());
graphicsStateOperations.Add(newEndInlineImage);
lastEndImageOffset = scanner.CurrentPosition - 2;
}
else
{
log.Warn($"Operator which was not understood encountered. Values was {op.Data}. Ignoring.");
}
}
}
precedingTokens.Clear();

View File

@@ -141,7 +141,8 @@
PageRotationDegrees rotation,
bool isLenientParsing)
{
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes));
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes),
log);
var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, isLenientParsing, pdfScanner,
pageContentParser,