Handle incorrectly detected end image operators

Since an inline image's data stream may contain the characters 'EI' as a result of compression, it is possible to read an end image operator mid-data. This results in the next operator also being an end image operator and the content stream being in an invalid state. To recover, when we detect this situation we remove the previous end image operator, read up to the current operator, and replace the operator and its data bytes in the list of operations.
This commit is contained in:
Eliot Jones
2020-01-08 12:17:30 +00:00
parent a083214da2
commit 4976fa1027
7 changed files with 74 additions and 26 deletions

View File

@@ -27,7 +27,7 @@
var content = File.ReadAllText(path); var content = File.ReadAllText(path);
var input = StringBytesTestConverter.Convert(content, false); var input = StringBytesTestConverter.Convert(content, false);
var result = parser.Parse(input.Bytes); var result = parser.Parse(1, input.Bytes);
Assert.NotEmpty(result); Assert.NotEmpty(result);
} }
@@ -39,7 +39,7 @@
var content = File.ReadAllText(path); var content = File.ReadAllText(path);
var input = StringBytesTestConverter.Convert(content, false); var input = StringBytesTestConverter.Convert(content, false);
var result = parser.Parse(input.Bytes); var result = parser.Parse(1, input.Bytes);
var replacementRegex = new Regex(@"\s(\.\d+)\b"); var replacementRegex = new Regex(@"\s(\.\d+)\b");
@@ -72,7 +72,7 @@
ET"; ET";
var input = StringBytesTestConverter.Convert(s, false); var input = StringBytesTestConverter.Convert(s, false);
var result = parser.Parse(input.Bytes); var result = parser.Parse(1, input.Bytes);
using (var stream = new MemoryStream()) using (var stream = new MemoryStream())
{ {
@@ -102,7 +102,7 @@ ET";
ET"; ET";
var input = StringBytesTestConverter.Convert(s, false); var input = StringBytesTestConverter.Convert(s, false);
var result = parser.Parse(input.Bytes); var result = parser.Parse(1, input.Bytes);
Assert.Equal(7, result.Count); Assert.Equal(7, result.Count);
@@ -138,7 +138,7 @@ ET";
var input = StringBytesTestConverter.Convert(s, false); var input = StringBytesTestConverter.Convert(s, false);
var result = parser.Parse(input.Bytes); var result = parser.Parse(1, input.Bytes);
Assert.Equal(4, result.Count); Assert.Equal(4, result.Count);
@@ -163,7 +163,7 @@ cm BT 0.0001 Tc 19 0 0 19 0 0 Tm /Tc1 1 Tf ( \(sleep 1; printf ""QUIT\\r\\n""\
var input = StringBytesTestConverter.Convert(s, false); var input = StringBytesTestConverter.Convert(s, false);
var result = parser.Parse(input.Bytes); var result = parser.Parse(1, input.Bytes);
Assert.Equal(9, result.Count); Assert.Equal(9, result.Count);

View File

@@ -4,14 +4,10 @@
using System.Collections.Generic; using System.Collections.Generic;
using System.Diagnostics; using System.Diagnostics;
using System.Linq; using System.Linq;
using Colors;
using Content; using Content;
using Core; using Core;
using Exceptions;
using Filters; using Filters;
using Fonts;
using Geometry; using Geometry;
using IO;
using Logging; using Logging;
using Operations; using Operations;
using Parser; using Parser;
@@ -19,7 +15,6 @@
using PdfPig.Core; using PdfPig.Core;
using Tokenization.Scanner; using Tokenization.Scanner;
using Tokens; using Tokens;
using Util;
using XObjects; using XObjects;
internal class ContentStreamProcessor : IOperationContext internal class ContentStreamProcessor : IOperationContext
@@ -52,6 +47,7 @@
private IFont activeExtendedGraphicsStateFont; private IFont activeExtendedGraphicsStateFont;
private InlineImageBuilder inlineImageBuilder; private InlineImageBuilder inlineImageBuilder;
private bool currentPathAdded; private bool currentPathAdded;
private int pageNumber;
/// <summary> /// <summary>
/// A counter to track individual calls to <see cref="ShowText"/> operations used to determine if letters are likely to be /// A counter to track individual calls to <see cref="ShowText"/> operations used to determine if letters are likely to be
@@ -97,8 +93,9 @@
ColorSpaceContext = new ColorSpaceContext(GetCurrentState, resourceStore); ColorSpaceContext = new ColorSpaceContext(GetCurrentState, resourceStore);
} }
public PageContent Process(IReadOnlyList<IGraphicsStateOperation> operations) public PageContent Process(int pageNumberCurrent, IReadOnlyList<IGraphicsStateOperation> operations)
{ {
pageNumber = pageNumberCurrent;
CloneAllStates(); CloneAllStates();
ProcessOperations(operations); ProcessOperations(operations);
@@ -364,7 +361,7 @@
var contentStream = formStream.Decode(filterProvider); var contentStream = formStream.Decode(filterProvider);
var operations = pageContentParser.Parse(new ByteArrayInputBytes(contentStream)); var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream));
// 3. We don't respect clipping currently. // 3. We don't respect clipping currently.

View File

@@ -1,7 +1,6 @@
namespace UglyToad.PdfPig.Graphics namespace UglyToad.PdfPig.Graphics
{ {
using System.Collections.Generic; using System.Collections.Generic;
using Geometry;
using Tokens; using Tokens;
using PdfPig.Core; using PdfPig.Core;
using Util.JetBrains.Annotations; using Util.JetBrains.Annotations;

View File

@@ -6,7 +6,6 @@
using Colors; using Colors;
using Content; using Content;
using Core; using Core;
using Exceptions;
using Filters; using Filters;
using PdfPig.Core; using PdfPig.Core;
using Tokenization.Scanner; using Tokenization.Scanner;

View File

@@ -6,6 +6,6 @@
internal interface IPageContentParser internal interface IPageContentParser
{ {
IReadOnlyList<IGraphicsStateOperation> Parse(IInputBytes inputBytes); IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes);
} }
} }

View File

@@ -1,6 +1,8 @@
namespace UglyToad.PdfPig.Parser namespace UglyToad.PdfPig.Parser
{ {
using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq;
using Core; using Core;
using Graphics; using Graphics;
using Graphics.Operations; using Graphics.Operations;
@@ -17,13 +19,15 @@
this.operationFactory = operationFactory; this.operationFactory = operationFactory;
} }
public IReadOnlyList<IGraphicsStateOperation> Parse(IInputBytes inputBytes) public IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes)
{ {
var scanner = new CoreTokenScanner(inputBytes); var scanner = new CoreTokenScanner(inputBytes);
var precedingTokens = new List<IToken>(); var precedingTokens = new List<IToken>();
var graphicsStateOperations = new List<IGraphicsStateOperation>(); var graphicsStateOperations = new List<IGraphicsStateOperation>();
var lastEndImageOffset = new long?();
while (scanner.MoveNext()) while (scanner.MoveNext())
{ {
var token = scanner.CurrentToken; var token = scanner.CurrentToken;
@@ -47,9 +51,55 @@
graphicsStateOperations.Add(new BeginInlineImageData(dictionary)); graphicsStateOperations.Add(new BeginInlineImageData(dictionary));
graphicsStateOperations.Add(new EndInlineImage(inlineImageData.Data)); graphicsStateOperations.Add(new EndInlineImage(inlineImageData.Data));
lastEndImageOffset = scanner.CurrentPosition - 2;
precedingTokens.Clear(); precedingTokens.Clear();
} }
else if (token is OperatorToken op) else if (token is OperatorToken op)
{
// Handle an end image where the stream of image data contained EI but was not actually a real end image operator.
if (op.Data == "EI")
{
// Check an end image operation was the last thing that happened.
IGraphicsStateOperation lastOperation = graphicsStateOperations.Count > 0
? graphicsStateOperations[graphicsStateOperations.Count - 1]
: null;
if (lastEndImageOffset == null || lastOperation == null || !(lastOperation is EndInlineImage lastEndImage))
{
throw new PdfDocumentFormatException("Encountered End Image token outside an inline image on " +
$"page {pageNumber} at offset in content: {scanner.CurrentPosition}.");
}
// Work out how much data we missed between the false EI operator and the actual one.
var actualEndImageOffset = scanner.CurrentPosition - 3;
var gap = (int)(actualEndImageOffset - lastEndImageOffset);
var from = inputBytes.CurrentOffset;
inputBytes.Seek(lastEndImageOffset.Value);
// Recover the full image data.
{
var missingData = new byte[gap];
var read = inputBytes.Read(missingData);
if (read != gap)
{
throw new InvalidOperationException($"Failed to read expected buffer length {gap} on page {pageNumber} " +
$"when reading inline image at offset in content: {lastEndImageOffset.Value}.");
}
// Replace the last end image operator with one containing the full set of data.
graphicsStateOperations.Remove(lastEndImage);
graphicsStateOperations.Add(new EndInlineImage(lastEndImage.ImageData.Concat(missingData).ToArray()));
}
lastEndImageOffset = actualEndImageOffset;
inputBytes.Seek(from);
}
else
{ {
var operation = operationFactory.Create(op, precedingTokens); var operation = operationFactory.Create(op, precedingTokens);
@@ -57,6 +107,7 @@
{ {
graphicsStateOperations.Add(operation); graphicsStateOperations.Add(operation);
} }
}
precedingTokens.Clear(); precedingTokens.Clear();
} }

View File

@@ -109,7 +109,7 @@
} }
} }
content = GetContent(bytes, cropBox, userSpaceUnit, rotation, isLenientParsing); content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
} }
else else
{ {
@@ -122,7 +122,7 @@
var bytes = contentStream.Decode(filterProvider); var bytes = contentStream.Decode(filterProvider);
content = GetContent(bytes, cropBox, userSpaceUnit, rotation, isLenientParsing); content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
} }
var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content,
@@ -137,16 +137,18 @@
return page; return page;
} }
private PageContent GetContent(IReadOnlyList<byte> contentBytes, CropBox cropBox, UserSpaceUnit userSpaceUnit, PageRotationDegrees rotation, bool isLenientParsing) private PageContent GetContent(int pageNumber, IReadOnlyList<byte> contentBytes, CropBox cropBox, UserSpaceUnit userSpaceUnit,
PageRotationDegrees rotation,
bool isLenientParsing)
{ {
var operations = pageContentParser.Parse(new ByteArrayInputBytes(contentBytes)); var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes));
var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, isLenientParsing, pdfScanner, var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, isLenientParsing, pdfScanner,
pageContentParser, pageContentParser,
filterProvider, filterProvider,
log); log);
return context.Process(operations); return context.Process(pageNumber, operations);
} }
private static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary) private static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary)