#48 Add handling of inline image data to PDF content parsing

An inline image in a PDF content stream starts with the BI operator; the ID operator then marks the start of the image data and EI marks its end. Attempting to parse the bytes after the ID operator as ordinary tokens resulted in errors. This change adds special-case handling for inline images.
This commit is contained in:
Eliot Jones
2019-08-03 15:42:19 +01:00
parent 5ee9c49f8a
commit 364bd25fa8
8 changed files with 194 additions and 27 deletions

View File

@@ -6,6 +6,7 @@
using System.Linq;
using System.Reflection;
using PdfPig.Graphics.Operations;
using PdfPig.Graphics.Operations.InlineImages;
using PdfPig.Tokens;
using Xunit;
@@ -41,6 +42,10 @@
operation = (IGraphicsStateOperation)field.GetValue(null);
}
else if (operationType == typeof(EndInlineImage))
{
operation = new EndInlineImage(new List<IToken>(), new List<byte>());
}
else
{
var constructor = constructors[0];

View File

@@ -174,6 +174,7 @@
"UglyToad.PdfPig.Tokens.HexToken",
"UglyToad.PdfPig.Tokens.IDataToken`1",
"UglyToad.PdfPig.Tokens.IndirectReferenceToken",
"UglyToad.PdfPig.Tokens.InlineImageDataToken",
"UglyToad.PdfPig.Tokens.IToken",
"UglyToad.PdfPig.Tokens.NameToken",
"UglyToad.PdfPig.Tokens.NullToken",

View File

@@ -9,7 +9,7 @@
using Util;
using Util.JetBrains.Annotations;
using XObjects;
using UglyToad.PdfPig.Geometry;
using Geometry;
/// <summary>
/// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>.
@@ -30,23 +30,18 @@
internal CropBox CropBox { get; }
internal PageContent Content { get; }
/// <summary>
/// The rotation of the page in degrees (clockwise). Valid values are 0, 90, 180 and 270.
/// </summary>
public PageRotationDegrees Rotation { get; }
internal PageContent Content { get; }
/// <summary>
/// The set of <see cref="Letter"/>s drawn by the PDF content.
/// </summary>
public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0];
/// <summary>
/// The set of <see cref="PdfPath"/>s drawn by the PDF content.
/// </summary>
public IReadOnlyList<PdfPath> Paths => Content?.Paths ?? new List<PdfPath>();
/// <summary>
/// The full text of all characters on the page in the order they are presented in the PDF content.
/// </summary>
@@ -136,6 +131,11 @@
private readonly Page page;
private readonly AnnotationProvider annotationProvider;
/// <summary>
/// The set of <see cref="PdfPath"/>s drawn by the PDF content.
/// </summary>
public IReadOnlyList<PdfPath> Paths => page.Content?.Paths ?? new List<PdfPath>();
internal Experimental(Page page, AnnotationProvider annotationProvider)
{
this.page = page;

View File

@@ -5,19 +5,28 @@ namespace UglyToad.PdfPig.Geometry
using System.Collections.Generic;
using System.Linq;
using System.Text;
using UglyToad.PdfPig.Core;
using Core;
/// <summary>
/// A path in a PDF document, used by glyphs and page content.
/// A path in a PDF document, used by glyphs and page content. Can contain multiple sub-paths.
/// </summary>
public class PdfPath
{
private readonly List<IPathCommand> commands = new List<IPathCommand>();
/// <summary>
/// The sequence of sub-paths which form this <see cref="PdfPath"/>.
/// </summary>
public IReadOnlyList<IPathCommand> Commands => commands;
private PdfPoint? currentPosition;
private TransformationMatrix currentTransformationMatrix = TransformationMatrix.Identity;
private readonly TransformationMatrix currentTransformationMatrix;
/// <summary>
/// Create a new <see cref="PdfPath"/>.
/// </summary>
/// <param name="transformationMatrix">The transformation to apply to all points in this path.</param>
public PdfPath(TransformationMatrix transformationMatrix)
{
currentTransformationMatrix = transformationMatrix;
@@ -162,79 +171,140 @@ namespace UglyToad.PdfPig.Geometry
return result;
}
/// <summary>
/// A command in a <see cref="PdfPath"/>.
/// </summary>
public interface IPathCommand
{
/// <summary>
/// Returns the smallest rectangle which contains the path region given by this command.
/// </summary>
/// <returns>
/// The bounding rectangle, or <see langword="null"/> when the command has no region to bound
/// (for example, the close and move commands return <see langword="null"/>).
/// </returns>
PdfRectangle? GetBoundingRectangle();
/// <summary>
/// Converts from the path command to an SVG string representing the path operation.
/// </summary>
/// <param name="builder">The builder which the SVG fragment for this command is appended to.</param>
void WriteSvg(StringBuilder builder);
}
private class Close : IPathCommand
/// <summary>
/// Path command which closes the current <see cref="PdfPath"/>.
/// </summary>
public class Close : IPathCommand
{
    /// <summary>
    /// Always <see langword="null"/>; closing a path reports no region of its own.
    /// </summary>
    public PdfRectangle? GetBoundingRectangle() => null;

    /// <summary>
    /// Appends the SVG close-path command ("Z ") to <paramref name="builder"/>.
    /// </summary>
    public void WriteSvg(StringBuilder builder) => builder.Append("Z ");
}
/// <summary>
/// Move drawing of the current <see cref="PdfPath"/> to the specified location.
/// </summary>
public class Move : IPathCommand
{
    /// <summary>
    /// The location to move to.
    /// </summary>
    public PdfPoint Location { get; }

    /// <summary>
    /// Create a new <see cref="Move"/> path command.
    /// </summary>
    /// <param name="location">The location to move to.</param>
    public Move(PdfPoint location)
    {
        Location = location;
    }

    /// <summary>
    /// Returns <see langword="null"/> since this generates no visible path.
    /// </summary>
    public PdfRectangle? GetBoundingRectangle()
    {
        return null;
    }

    /// <summary>
    /// Appends the SVG move-to command ("M x y ") for <see cref="Location"/> to <paramref name="builder"/>.
    /// </summary>
    /// <param name="builder">The builder which the SVG fragment is appended to.</param>
    public void WriteSvg(StringBuilder builder)
    {
        // SVG path data only accepts '.' as the decimal separator, so the coordinates must not be
        // formatted with the current culture (which may use ',' instead).
        builder.AppendFormat(System.Globalization.CultureInfo.InvariantCulture, "M {0} {1} ", Location.X, Location.Y);
    }
}
/// <summary>
/// Draw a straight line between two points.
/// </summary>
public class Line : IPathCommand
{
    /// <summary>
    /// The start of the line.
    /// </summary>
    public PdfPoint From { get; }

    /// <summary>
    /// The end of the line.
    /// </summary>
    public PdfPoint To { get; }

    /// <summary>
    /// Create a new <see cref="Line"/>.
    /// </summary>
    /// <param name="from">The start of the line.</param>
    /// <param name="to">The end of the line.</param>
    public Line(PdfPoint from, PdfPoint to)
    {
        From = from;
        To = to;
    }

    /// <inheritdoc />
    public PdfRectangle? GetBoundingRectangle()
    {
        return new PdfRectangle(From, To);
    }

    /// <summary>
    /// Appends the SVG line-to command ("L x y ") for <see cref="To"/> to <paramref name="builder"/>.
    /// </summary>
    /// <param name="builder">The builder which the SVG fragment is appended to.</param>
    public void WriteSvg(StringBuilder builder)
    {
        // SVG path data only accepts '.' as the decimal separator, so the coordinates must not be
        // formatted with the current culture (which may use ',' instead).
        builder.AppendFormat(System.Globalization.CultureInfo.InvariantCulture, "L {0} {1} ", To.X, To.Y);
    }
}
/// <summary>
/// Draw a Bezier curve given by the start, control and end points.
/// </summary>
public class BezierCurve : IPathCommand
{
/// <summary>
/// The start point of the Bezier curve.
/// </summary>
public PdfPoint StartPoint { get; }
/// <summary>
/// The first control point of the curve.
/// </summary>
public PdfPoint FirstControlPoint { get; }
/// <summary>
/// The second control point of the curve.
/// </summary>
public PdfPoint SecondControlPoint { get; }
/// <summary>
/// The end point of the curve.
/// </summary>
public PdfPoint EndPoint { get; }
/// <summary>
/// Create a Bezier curve at the provided points.
/// </summary>
public BezierCurve(PdfPoint startPoint, PdfPoint firstControlPoint, PdfPoint secondControlPoint, PdfPoint endPoint)
{
StartPoint = startPoint;
@@ -243,6 +313,7 @@ namespace UglyToad.PdfPig.Geometry
EndPoint = endPoint;
}
/// <inheritdoc />
public PdfRectangle? GetBoundingRectangle()
{
// Optimised
@@ -287,6 +358,13 @@ namespace UglyToad.PdfPig.Geometry
return new PdfRectangle((decimal)minX, (decimal)minY, (decimal)maxX, (decimal)maxY);
}
/// <summary>
/// Appends the SVG cubic Bezier command ("C x1 y1, x2 y2, x y ") for this curve to <paramref name="builder"/>,
/// using the two control points followed by the end point.
/// </summary>
/// <param name="builder">The builder which the SVG fragment is appended to.</param>
public void WriteSvg(StringBuilder builder)
{
    // SVG path data only accepts '.' as the decimal separator, so the coordinates must not be
    // formatted with the current culture (which may use ',' instead).
    builder.AppendFormat(
        System.Globalization.CultureInfo.InvariantCulture,
        "C {0} {1}, {2} {3}, {4} {5} ",
        FirstControlPoint.X,
        FirstControlPoint.Y,
        SecondControlPoint.X,
        SecondControlPoint.Y,
        EndPoint.X,
        EndPoint.Y);
}
private bool TrySolveQuadratic(bool isX, double currentMin, double currentMax, out (double min, double max) solutions)
{
@@ -378,12 +456,6 @@ namespace UglyToad.PdfPig.Geometry
return p;
}
public void WriteSvg(StringBuilder builder)
{
builder.AppendFormat("C {0} {1}, {2} {3}, {4} {5} ", FirstControlPoint.X, FirstControlPoint.Y, SecondControlPoint.X, SecondControlPoint.Y,
EndPoint.X, EndPoint.Y);
}
}
internal void Rectangle(decimal x, decimal y, decimal width, decimal height)

View File

@@ -1,6 +1,9 @@
namespace UglyToad.PdfPig.Graphics.Operations.InlineImages
{
using System;
using System.Collections.Generic;
using System.IO;
using Tokens;
/// <inheritdoc />
/// <summary>
@@ -14,15 +17,27 @@
public const string Symbol = "EI";
/// <summary>
/// The instance of the <see cref="EndInlineImage"/> operation.
/// The tokens declared in order for this inline image object.
/// </summary>
public static readonly EndInlineImage Value = new EndInlineImage();
public IReadOnlyList<IToken> ImageTokens { get; }
/// <summary>
/// The raw data for the inline image which should be interpreted according to the <see cref="ImageTokens"/>.
/// </summary>
public IReadOnlyList<byte> ImageData { get; }
/// <inheritdoc />
public string Operator => Symbol;
private EndInlineImage()
/// <summary>
/// Create a new <see cref="EndInlineImage"/> operation.
/// </summary>
/// <param name="imageTokens">The tokens which were set during the declaration of this image.</param>
/// <param name="imageData">The raw byte data of this image.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="imageTokens"/> or <paramref name="imageData"/> is <see langword="null"/>.
/// </exception>
public EndInlineImage(IReadOnlyList<IToken> imageTokens, IReadOnlyList<byte> imageData)
{
    if (imageTokens == null)
    {
        throw new ArgumentNullException(nameof(imageTokens));
    }

    // Take a snapshot of the tokens rather than keeping the caller's list: a caller which reuses
    // its working list (for example clearing it right after constructing this operation) would
    // otherwise empty or overwrite the tokens recorded for this image.
    ImageTokens = new List<IToken>(imageTokens);
    ImageData = imageData ?? throw new ArgumentNullException(nameof(imageData));
}
/// <inheritdoc />

View File

@@ -3,6 +3,7 @@
using System.Collections.Generic;
using Graphics;
using Graphics.Operations;
using Graphics.Operations.InlineImages;
using IO;
using Tokenization.Scanner;
using Tokens;
@@ -27,7 +28,13 @@
{
var token = scanner.CurrentToken;
if (token is OperatorToken op)
if (token is InlineImageDataToken inlineImageData)
{
graphicsStateOperations.Add(BeginInlineImageData.Value);
graphicsStateOperations.Add(new EndInlineImage(precedingTokens, inlineImageData.Data));
precedingTokens.Clear();
}
else if (token is OperatorToken op)
{
var operation = operationFactory.Create(op, precedingTokens);

View File

@@ -20,11 +20,12 @@
private readonly ScannerScope scope;
private readonly IInputBytes inputBytes;
private readonly List<byte> currentBuffer = new List<byte>();
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
internal long CurrentTokenStart { get; private set; }
public IToken CurrentToken { get; private set; }
public bool TryReadToken<T>(out T token) where T : class, IToken
{
token = default(T);
@@ -51,6 +52,7 @@
public long CurrentPosition => inputBytes.CurrentOffset;
private bool hasBytePreRead;
private bool isInInlineImage;
internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None)
{
@@ -60,8 +62,6 @@
public bool MoveNext()
{
currentBuffer.Clear();
var endAngleBracesRead = 0;
bool isSkippingSymbol = false;
@@ -89,7 +89,6 @@
continue;
}
// If we failed to read the symbol for whatever reason we pass over it.
if (isSkippingSymbol && c != '>')
{
@@ -161,6 +160,23 @@
continue;
}
if (token is OperatorToken op)
{
if (op.Data == "BI")
{
isInInlineImage = true;
}
else if (isInInlineImage && op.Data == "ID")
{
// Special case handling for inline images.
var imageData = ReadInlineImageData();
isInInlineImage = false;
CurrentToken = new InlineImageDataToken(imageData);
hasBytePreRead = false;
return true;
}
}
CurrentToken = token;
/*
@@ -190,6 +206,35 @@
customTokenizers.RemoveAll(x => ReferenceEquals(x.tokenizer, tokenizer));
}
/// <summary>
/// Reads the raw bytes of an inline image, starting after the white-space which follows the ID operator
/// and stopping at the first 'E' byte immediately followed by an 'I' byte.
/// </summary>
/// <returns>The bytes read before the terminating "EI" pair (the pair itself is not included).</returns>
/// <exception cref="PdfDocumentFormatException">
/// The current byte is not a white-space character, or the input ended before "EI" was found.
/// </exception>
private IReadOnlyList<byte> ReadInlineImageData()
{
    // The ID operator should be followed by a single white-space character, and the next character is interpreted
    // as the first byte of image data. Any PDF white-space character is accepted here (NUL, tab, line feed,
    // form feed, carriage return or space) since content streams frequently place a line break after ID.
    var separator = inputBytes.CurrentByte;
    var isWhitespace = separator == ' '
                       || separator == '\n'
                       || separator == '\r'
                       || separator == '\t'
                       || separator == '\f'
                       || separator == 0;

    if (!isWhitespace)
    {
        throw new PdfDocumentFormatException($"No whitespace character following the image data (ID) operator. Position: {inputBytes.CurrentOffset}.");
    }

    // Recorded only for the error message below.
    var startsAt = inputBytes.CurrentOffset - 2;

    var imageData = new List<byte>();
    byte prevByte = 0;

    // NOTE(review): the first "EI" byte pair ends the image, so binary image data which happens to
    // contain those two bytes in sequence is truncated at that point.
    while (inputBytes.MoveNext())
    {
        var current = inputBytes.CurrentByte;

        if (current == 'I' && prevByte == 'E')
        {
            // The 'E' of the end marker was buffered on the previous iteration, drop it again.
            imageData.RemoveAt(imageData.Count - 1);
            return imageData;
        }

        imageData.Add(current);
        prevByte = current;
    }

    throw new PdfDocumentFormatException($"No end of inline image data (EI) was found for image data at position {startsAt}.");
}
private static bool IsEmpty(byte b)
{
return b == ' ' || b == '\r' || b == '\n' || b == 0;

View File

@@ -0,0 +1,22 @@
namespace UglyToad.PdfPig.Tokens
{
    using System;
    using System.Collections.Generic;

    /// <summary>
    /// Inline image data is used to embed images in PDF content streams. The content is wrapped by ID and EI tags in a BI operation.
    /// </summary>
    public class InlineImageDataToken : IDataToken<IReadOnlyList<byte>>
    {
        /// <summary>
        /// The raw bytes of the inline image. Never <see langword="null"/>.
        /// </summary>
        public IReadOnlyList<byte> Data { get; }

        /// <summary>
        /// Create a new <see cref="InlineImageDataToken"/>.
        /// </summary>
        /// <param name="data">The raw bytes of the inline image.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is <see langword="null"/>.</exception>
        public InlineImageDataToken(IReadOnlyList<byte> data)
        {
            Data = data ?? throw new ArgumentNullException(nameof(data));
        }
    }
}