mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-14 10:55:04 +08:00
@@ -42,7 +42,7 @@ New in v0.0.5 - To create documents use the class ```PdfDocumentBuilder```. Thou
|
||||
|
||||
byte[] documentBytes = builder.Build();
|
||||
|
||||
File.WriteAllBytes(@"C:\git\newPdf.pdf");
|
||||
File.WriteAllBytes(@"C:\git\newPdf.pdf", documentBytes);
|
||||
|
||||
Each font must be registered with the PdfDocumentBuilder prior to use, to enable pages to share the font resources. Currently only Standard 14 fonts and TrueType fonts (.ttf) are supported.
|
||||
|
||||
|
@@ -6,6 +6,7 @@
|
||||
using System.Linq;
|
||||
using System.Reflection;
|
||||
using PdfPig.Graphics.Operations;
|
||||
using PdfPig.Graphics.Operations.InlineImages;
|
||||
using PdfPig.Tokens;
|
||||
using Xunit;
|
||||
|
||||
@@ -41,6 +42,10 @@
|
||||
|
||||
operation = (IGraphicsStateOperation)field.GetValue(null);
|
||||
}
|
||||
else if (operationType == typeof(EndInlineImage))
|
||||
{
|
||||
operation = new EndInlineImage(new List<IToken>(), new List<byte>());
|
||||
}
|
||||
else
|
||||
{
|
||||
var constructor = constructors[0];
|
||||
|
@@ -64,6 +64,12 @@
|
||||
|
||||
Assert.True(stream.IsAtEnd());
|
||||
Assert.True(array.IsAtEnd());
|
||||
|
||||
stream.Seek(0);
|
||||
array.Seek(0);
|
||||
|
||||
Assert.False(stream.IsAtEnd());
|
||||
Assert.False(array.IsAtEnd());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -65,6 +65,8 @@
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
|
||||
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
|
||||
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
|
||||
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
|
||||
@@ -174,6 +176,7 @@
|
||||
"UglyToad.PdfPig.Tokens.HexToken",
|
||||
"UglyToad.PdfPig.Tokens.IDataToken`1",
|
||||
"UglyToad.PdfPig.Tokens.IndirectReferenceToken",
|
||||
"UglyToad.PdfPig.Tokens.InlineImageDataToken",
|
||||
"UglyToad.PdfPig.Tokens.IToken",
|
||||
"UglyToad.PdfPig.Tokens.NameToken",
|
||||
"UglyToad.PdfPig.Tokens.NullToken",
|
||||
|
@@ -9,7 +9,7 @@
|
||||
using Util;
|
||||
using Util.JetBrains.Annotations;
|
||||
using XObjects;
|
||||
using UglyToad.PdfPig.Geometry;
|
||||
using Geometry;
|
||||
|
||||
/// <summary>
|
||||
/// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>.
|
||||
@@ -30,23 +30,18 @@
|
||||
|
||||
internal CropBox CropBox { get; }
|
||||
|
||||
internal PageContent Content { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The rotation of the page in degrees (clockwise). Valid values are 0, 90, 180 and 270.
|
||||
/// </summary>
|
||||
public PageRotationDegrees Rotation { get; }
|
||||
|
||||
internal PageContent Content { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The set of <see cref="Letter"/>s drawn by the PDF content.
|
||||
/// </summary>
|
||||
public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0];
|
||||
|
||||
/// <summary>
|
||||
/// The set of <see cref="PdfPath"/>s drawn by the PDF content.
|
||||
/// </summary>
|
||||
public IReadOnlyList<PdfPath> Paths => Content?.Paths ?? new List<PdfPath>();
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// The full text of all characters on the page in the order they are presented in the PDF content.
|
||||
/// </summary>
|
||||
@@ -136,6 +131,11 @@
|
||||
private readonly Page page;
|
||||
private readonly AnnotationProvider annotationProvider;
|
||||
|
||||
/// <summary>
|
||||
/// The set of <see cref="PdfPath"/>s drawn by the PDF content.
|
||||
/// </summary>
|
||||
public IReadOnlyList<PdfPath> Paths => page.Content?.Paths ?? new List<PdfPath>();
|
||||
|
||||
internal Experimental(Page page, AnnotationProvider annotationProvider)
|
||||
{
|
||||
this.page = page;
|
||||
|
@@ -11,7 +11,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
|
||||
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
|
||||
/// </summary>
|
||||
public class RecursiveXYCut
|
||||
public static class RecursiveXYCut
|
||||
{
|
||||
/// <summary>
|
||||
/// Get the blocks.
|
||||
|
109
src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
Normal file
109
src/UglyToad.PdfPig/DocumentLayoutAnalysis/TextEdgesExtractor.cs
Normal file
@@ -0,0 +1,109 @@
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using UglyToad.PdfPig.Content;
|
||||
using UglyToad.PdfPig.Geometry;
|
||||
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
/// <summary>
/// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line.
/// <para>Useful to detect text columns, tables, justified text, lists, etc.</para>
/// </summary>
public static class TextEdgesExtractor
{
    /// <summary>
    /// Functions used to define left, middle and right edges.
    /// Each coordinate is rounded to zero decimal places so that words are grouped by whole-unit position.
    /// </summary>
    private static readonly Tuple<EdgeType, Func<PdfRectangle, decimal>>[] edgesFuncs = new Tuple<EdgeType, Func<PdfRectangle, decimal>>[]
    {
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Left, x => Math.Round(x.Left, 0)),                 // use BoundingBox's left coordinate
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Mid, x => Math.Round(x.Left + x.Width / 2, 0)),    // use BoundingBox's mid coordinate
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Right, x => Math.Round(x.Right, 0))                // use BoundingBox's right coordinate
    };

    /// <summary>
    /// Get the text edges.
    /// </summary>
    /// <param name="pageWords">The words in the page.</param>
    /// <param name="minimumElements">The minimum number of elements to define a text edge.</param>
    /// <returns>For each <see cref="EdgeType"/>, the vertical lines on which enough words are aligned.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pageWords"/> is <see langword="null"/>.</exception>
    /// <exception cref="ArgumentException"><paramref name="minimumElements"/> is negative.</exception>
    public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4)
    {
        if (pageWords == null)
        {
            throw new ArgumentNullException(nameof(pageWords));
        }

        if (minimumElements < 0)
        {
            throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", nameof(minimumElements));
        }

        // Materialise once: the words are read by several parallel workers and enumerated many times each,
        // so a lazy query over the caller's sequence would be re-run (concurrently) on every pass.
        // IsNullOrWhiteSpace already covers null, empty and all-whitespace text, so no Trim() is needed.
        var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();

        ConcurrentDictionary<EdgeType, List<PdfLine>> dictionary = new ConcurrentDictionary<EdgeType, List<PdfLine>>();

        Parallel.ForEach(edgesFuncs, f =>
        {
            dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
        });

        return dictionary.ToDictionary(x => x.Key, x => x.Value);
    }

    /// <summary>
    /// Groups the words by the coordinate returned by <paramref name="func"/> and converts each group
    /// into one or more vertical lines, splitting a group wherever another word crosses the line.
    /// </summary>
    private static List<PdfLine> GetVerticalEdges(IEnumerable<Word> pageWords, Func<PdfRectangle, decimal> func, int minimumElements)
    {
        var lines = new List<PdfLine>();

        var candidateEdges = pageWords.GroupBy(x => func(x.BoundingBox))
            .Where(x => x.Count() >= minimumElements);

        foreach (var candidate in candidateEdges)
        {
            var edgeX = candidate.Key;
            var edgeWords = candidate.ToList();

            // Vertical extent of the whole candidate edge, computed once rather than once per tested word.
            var edgeBottom = edgeWords.Min(z => z.BoundingBox.Bottom);
            var edgeTop = edgeWords.Max(z => z.BoundingBox.Top);

            var remaining = edgeWords.OrderBy(x => x.BoundingBox.Bottom).ToList();

            var cuttings = pageWords.Except(edgeWords) // remove selected words
                // words that cut the vertical line
                .Where(x => x.BoundingBox.Left < edgeX && x.BoundingBox.Right > edgeX)
                // and that are within the boundaries of the edge
                .Where(k => k.BoundingBox.Bottom > edgeBottom && k.BoundingBox.Top < edgeTop)
                .OrderBy(x => x.BoundingBox.Bottom).ToList();

            foreach (var cut in cuttings)
            {
                // Everything still unassigned that sits entirely below this cutting word forms one segment.
                var below = remaining.Where(x => x.BoundingBox.Top < cut.BoundingBox.Bottom).ToList();

                // The Count > 0 guard matters when minimumElements is 0: an empty segment has no extent.
                if (below.Count > 0 && below.Count >= minimumElements)
                {
                    lines.Add(ToVerticalLine(edgeX, below));
                }

                remaining = remaining.Except(below).ToList();
            }

            // Whatever is left above the last cut (or the whole edge when nothing cuts it).
            if (remaining.Count > 0 && remaining.Count >= minimumElements)
            {
                lines.Add(ToVerticalLine(edgeX, remaining));
            }
        }

        return lines;
    }

    /// <summary>
    /// Builds the vertical line at <paramref name="x"/> spanning from the lowest bottom to the highest top of <paramref name="words"/>.
    /// </summary>
    private static PdfLine ToVerticalLine(decimal x, List<Word> words)
    {
        return new PdfLine(x, words.Min(w => w.BoundingBox.Bottom), x, words.Max(w => w.BoundingBox.Top));
    }
}
|
||||
|
||||
/// <summary>
/// The type of text edge.
/// </summary>
public enum EdgeType
{
    /// <summary>
    /// Words whose BoundingBox's left coordinates lie on the same vertical line.
    /// </summary>
    Left = 0,

    /// <summary>
    /// Words whose BoundingBox's mid coordinates lie on the same vertical line.
    /// </summary>
    Mid = 1,

    /// <summary>
    /// Words whose BoundingBox's right coordinates lie on the same vertical line.
    /// </summary>
    Right = 2
}
|
||||
}
|
@@ -48,6 +48,11 @@
|
||||
|
||||
public decimal GetWidthFromFont(int characterIdentifier)
|
||||
{
|
||||
if (fontProgram == null)
|
||||
{
|
||||
return GetWidthFromDictionary(characterIdentifier);
|
||||
}
|
||||
|
||||
if (fontProgram.TryGetBoundingAdvancedWidth(characterIdentifier, cidToGid.GetGlyphIndex, out var width))
|
||||
{
|
||||
return width;
|
||||
|
@@ -5,19 +5,28 @@ namespace UglyToad.PdfPig.Geometry
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using UglyToad.PdfPig.Core;
|
||||
using Core;
|
||||
|
||||
/// <summary>
|
||||
/// A path in a PDF document, used by glyphs and page content.
|
||||
/// A path in a PDF document, used by glyphs and page content. Can contain multiple sub-paths.
|
||||
/// </summary>
|
||||
public class PdfPath
|
||||
{
|
||||
private readonly List<IPathCommand> commands = new List<IPathCommand>();
|
||||
|
||||
/// <summary>
|
||||
/// The sequence of sub-paths which form this <see cref="PdfPath"/>.
|
||||
/// </summary>
|
||||
public IReadOnlyList<IPathCommand> Commands => commands;
|
||||
|
||||
private PdfPoint? currentPosition;
|
||||
private TransformationMatrix currentTransformationMatrix = TransformationMatrix.Identity;
|
||||
|
||||
private readonly TransformationMatrix currentTransformationMatrix;
|
||||
|
||||
/// <summary>
|
||||
/// Create a new <see cref="PdfPath"/>.
|
||||
/// </summary>
|
||||
/// <param name="transformationMatrix">The transformation to apply to all points in this path.</param>
|
||||
public PdfPath(TransformationMatrix transformationMatrix)
|
||||
{
|
||||
currentTransformationMatrix = transformationMatrix;
|
||||
@@ -162,79 +171,140 @@ namespace UglyToad.PdfPig.Geometry
|
||||
return result;
|
||||
}
|
||||
|
||||
/// <summary>
/// A command in a <see cref="PdfPath"/>.
/// </summary>
public interface IPathCommand
{
    /// <summary>
    /// Returns the smallest rectangle which contains the path region given by this command.
    /// </summary>
    /// <returns>The bounding rectangle, or <see langword="null"/> when the command has none to report.</returns>
    PdfRectangle? GetBoundingRectangle();

    /// <summary>
    /// Converts from the path command to an SVG string representing the path operation.
    /// </summary>
    /// <param name="builder">The builder the SVG path fragment is appended to.</param>
    void WriteSvg(StringBuilder builder);
}
|
||||
|
||||
private class Close : IPathCommand
|
||||
/// <summary>
/// Close the current <see cref="PdfPath"/>.
/// </summary>
public class Close : IPathCommand
{
    /// <inheritdoc />
    public PdfRectangle? GetBoundingRectangle() => null;

    /// <inheritdoc />
    public void WriteSvg(StringBuilder builder) => builder.Append("Z ");
}
|
||||
|
||||
/// <summary>
/// Move drawing of the current <see cref="PdfPath"/> to the specified location.
/// </summary>
public class Move : IPathCommand
{
    /// <summary>
    /// The location to move to.
    /// </summary>
    public PdfPoint Location { get; }

    /// <summary>
    /// Create a new <see cref="Move"/> path command.
    /// </summary>
    /// <param name="location">The location to move to.</param>
    public Move(PdfPoint location)
    {
        Location = location;
    }

    /// <summary>
    /// Returns <see langword="null"/> since this generates no visible path.
    /// </summary>
    public PdfRectangle? GetBoundingRectangle()
    {
        return null;
    }

    /// <inheritdoc />
    public void WriteSvg(StringBuilder builder)
    {
        // Format with the invariant culture: SVG path data requires '.' as the decimal separator,
        // whereas the culture-less overloads use the current thread culture (e.g. ',' in many locales).
        builder.AppendFormat(System.Globalization.CultureInfo.InvariantCulture, "M {0} {1} ", Location.X, Location.Y);
    }
}
|
||||
|
||||
/// <summary>
/// Draw a straight line between two points.
/// </summary>
public class Line : IPathCommand
{
    /// <summary>
    /// The start of the line.
    /// </summary>
    public PdfPoint From { get; }

    /// <summary>
    /// The end of the line.
    /// </summary>
    public PdfPoint To { get; }

    /// <summary>
    /// Create a new <see cref="Line"/>.
    /// </summary>
    /// <param name="from">The start of the line.</param>
    /// <param name="to">The end of the line.</param>
    public Line(PdfPoint from, PdfPoint to)
    {
        From = from;
        To = to;
    }

    /// <inheritdoc />
    public PdfRectangle? GetBoundingRectangle()
    {
        return new PdfRectangle(From, To);
    }

    /// <inheritdoc />
    public void WriteSvg(StringBuilder builder)
    {
        // Only the end point is written: the SVG "L" command draws from the current point.
        // Invariant culture keeps '.' as the decimal separator regardless of the thread's locale.
        builder.AppendFormat(System.Globalization.CultureInfo.InvariantCulture, "L {0} {1} ", To.X, To.Y);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Draw a Bezier curve given by the start, control and end points.
|
||||
/// </summary>
|
||||
public class BezierCurve : IPathCommand
|
||||
{
|
||||
/// <summary>
|
||||
/// The start point of the Bezier curve.
|
||||
/// </summary>
|
||||
public PdfPoint StartPoint { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The first control point of the curve.
|
||||
/// </summary>
|
||||
public PdfPoint FirstControlPoint { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The second control point of the curve.
|
||||
/// </summary>
|
||||
public PdfPoint SecondControlPoint { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The end point of the curve.
|
||||
/// </summary>
|
||||
public PdfPoint EndPoint { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Create a Bezier curve at the provided points.
|
||||
/// </summary>
|
||||
public BezierCurve(PdfPoint startPoint, PdfPoint firstControlPoint, PdfPoint secondControlPoint, PdfPoint endPoint)
|
||||
{
|
||||
StartPoint = startPoint;
|
||||
@@ -243,6 +313,7 @@ namespace UglyToad.PdfPig.Geometry
|
||||
EndPoint = endPoint;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public PdfRectangle? GetBoundingRectangle()
|
||||
{
|
||||
// Optimised
|
||||
@@ -287,6 +358,13 @@ namespace UglyToad.PdfPig.Geometry
|
||||
return new PdfRectangle((decimal)minX, (decimal)minY, (decimal)maxX, (decimal)maxY);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
public void WriteSvg(StringBuilder builder)
{
    // Invariant culture keeps '.' as the decimal separator; the culture-less overload would
    // emit ',' under many locales, which also collides with the ',' separators in the format string.
    builder.AppendFormat(System.Globalization.CultureInfo.InvariantCulture,
        "C {0} {1}, {2} {3}, {4} {5} ",
        FirstControlPoint.X, FirstControlPoint.Y,
        SecondControlPoint.X, SecondControlPoint.Y,
        EndPoint.X, EndPoint.Y);
}
|
||||
|
||||
|
||||
private bool TrySolveQuadratic(bool isX, double currentMin, double currentMax, out (double min, double max) solutions)
|
||||
{
|
||||
@@ -378,12 +456,6 @@ namespace UglyToad.PdfPig.Geometry
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
public void WriteSvg(StringBuilder builder)
|
||||
{
|
||||
builder.AppendFormat("C {0} {1}, {2} {3}, {4} {5} ", FirstControlPoint.X, FirstControlPoint.Y, SecondControlPoint.X, SecondControlPoint.Y,
|
||||
EndPoint.X, EndPoint.Y);
|
||||
}
|
||||
}
|
||||
|
||||
internal void Rectangle(decimal x, decimal y, decimal width, decimal height)
|
||||
|
@@ -82,6 +82,27 @@
|
||||
return new PdfVector(X, Y);
|
||||
}
|
||||
|
||||
/// <summary>
/// Returns a value indicating whether this <see cref="PdfPoint"/> is equal to a specified <see cref="PdfPoint"/>.
/// </summary>
/// <param name="obj">The object to compare with; anything other than a <see cref="PdfPoint"/> is never equal.</param>
public override bool Equals(object obj)
{
    return obj is PdfPoint other && other.X == X && other.Y == Y;
}

/// <summary>
/// Returns the hash code for this <see cref="PdfPoint"/>, derived from both coordinates.
/// </summary>
public override int GetHashCode() => (X, Y).GetHashCode();
|
||||
|
||||
/// <summary>
|
||||
/// Get a string representation of this point.
|
||||
/// </summary>
|
||||
|
@@ -30,6 +30,11 @@
|
||||
/// </summary>
|
||||
public PdfPoint BottomLeft { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Centroid point of the rectangle.
|
||||
/// </summary>
|
||||
public PdfPoint Centroid { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Width of the rectangle.
|
||||
/// </summary>
|
||||
@@ -105,15 +110,14 @@
|
||||
|
||||
BottomLeft = new PdfPoint(left, bottom);
|
||||
BottomRight = new PdfPoint(right, bottom);
|
||||
|
||||
Centroid = new PdfPoint(left + (right - left) / 2, bottom + (top - bottom) / 2);
|
||||
}
|
||||
|
||||
internal PdfRectangle(PdfVector topLeft, PdfVector topRight, PdfVector bottomLeft, PdfVector bottomRight)
|
||||
: this(topLeft.ToPoint(), topRight.ToPoint(), bottomLeft.ToPoint(), bottomRight.ToPoint())
|
||||
{
|
||||
TopLeft = topLeft.ToPoint();
|
||||
TopRight = topRight.ToPoint();
|
||||
|
||||
BottomLeft = bottomLeft.ToPoint();
|
||||
BottomRight = bottomRight.ToPoint();
|
||||
}
|
||||
|
||||
internal PdfRectangle(PdfPoint topLeft, PdfPoint topRight, PdfPoint bottomLeft, PdfPoint bottomRight)
|
||||
@@ -123,6 +127,8 @@
|
||||
|
||||
BottomLeft = bottomLeft;
|
||||
BottomRight = bottomRight;
|
||||
|
||||
Centroid = new PdfPoint(topLeft.X + (topRight.X - topLeft.X) / 2, bottomLeft.Y + (topLeft.Y - bottomLeft.Y) / 2);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@@ -1,6 +1,9 @@
|
||||
namespace UglyToad.PdfPig.Graphics.Operations.InlineImages
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using Tokens;
|
||||
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
@@ -14,15 +17,27 @@
|
||||
public const string Symbol = "EI";
|
||||
|
||||
/// <summary>
|
||||
/// The instance of the <see cref="EndInlineImage"/> operation.
|
||||
/// The tokens declared in order for this inline image object.
|
||||
/// </summary>
|
||||
public static readonly EndInlineImage Value = new EndInlineImage();
|
||||
public IReadOnlyList<IToken> ImageTokens { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The raw data for the inline image which should be interpreted according to the <see cref="ImageTokens"/>.
|
||||
/// </summary>
|
||||
public IReadOnlyList<byte> ImageData { get; }
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Operator => Symbol;
|
||||
|
||||
private EndInlineImage()
|
||||
/// <summary>
/// Create a new <see cref="EndInlineImage"/> operation.
/// </summary>
/// <param name="imageTokens">The tokens which were set during the declaration of this image.</param>
/// <param name="imageData">The raw byte data of this image.</param>
/// <exception cref="ArgumentNullException">Either argument is <see langword="null"/>.</exception>
public EndInlineImage(IReadOnlyList<IToken> imageTokens, IReadOnlyList<byte> imageData)
{
    if (imageTokens == null)
    {
        throw new ArgumentNullException(nameof(imageTokens));
    }

    // Take a snapshot of the tokens: the content stream parser hands over its working list and
    // clears it straight afterwards, which would otherwise empty ImageTokens through the shared reference.
    ImageTokens = new List<IToken>(imageTokens);
    ImageData = imageData ?? throw new ArgumentNullException(nameof(imageData));
}
|
||||
|
||||
/// <inheritdoc />
|
||||
|
@@ -75,6 +75,8 @@
|
||||
|
||||
public void Seek(long position)
|
||||
{
|
||||
isAtEnd = false;
|
||||
|
||||
if (position == 0)
|
||||
{
|
||||
stream.Seek(0, SeekOrigin.Begin);
|
||||
|
@@ -3,6 +3,7 @@
|
||||
using System.Collections.Generic;
|
||||
using Graphics;
|
||||
using Graphics.Operations;
|
||||
using Graphics.Operations.InlineImages;
|
||||
using IO;
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
@@ -27,7 +28,13 @@
|
||||
{
|
||||
var token = scanner.CurrentToken;
|
||||
|
||||
if (token is OperatorToken op)
|
||||
if (token is InlineImageDataToken inlineImageData)
|
||||
{
|
||||
graphicsStateOperations.Add(BeginInlineImageData.Value);
|
||||
graphicsStateOperations.Add(new EndInlineImage(precedingTokens, inlineImageData.Data));
|
||||
precedingTokens.Clear();
|
||||
}
|
||||
else if (token is OperatorToken op)
|
||||
{
|
||||
var operation = operationFactory.Create(op, precedingTokens);
|
||||
|
||||
|
@@ -20,11 +20,12 @@
|
||||
|
||||
private readonly ScannerScope scope;
|
||||
private readonly IInputBytes inputBytes;
|
||||
private readonly List<byte> currentBuffer = new List<byte>();
|
||||
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
|
||||
|
||||
internal long CurrentTokenStart { get; private set; }
|
||||
|
||||
public IToken CurrentToken { get; private set; }
|
||||
|
||||
public bool TryReadToken<T>(out T token) where T : class, IToken
|
||||
{
|
||||
token = default(T);
|
||||
@@ -51,6 +52,7 @@
|
||||
public long CurrentPosition => inputBytes.CurrentOffset;
|
||||
|
||||
private bool hasBytePreRead;
|
||||
private bool isInInlineImage;
|
||||
|
||||
internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None)
|
||||
{
|
||||
@@ -60,8 +62,6 @@
|
||||
|
||||
public bool MoveNext()
|
||||
{
|
||||
currentBuffer.Clear();
|
||||
|
||||
var endAngleBracesRead = 0;
|
||||
|
||||
bool isSkippingSymbol = false;
|
||||
@@ -89,7 +89,6 @@
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// If we failed to read the symbol for whatever reason we pass over it.
|
||||
if (isSkippingSymbol && c != '>')
|
||||
{
|
||||
@@ -161,6 +160,23 @@
|
||||
continue;
|
||||
}
|
||||
|
||||
if (token is OperatorToken op)
|
||||
{
|
||||
if (op.Data == "BI")
|
||||
{
|
||||
isInInlineImage = true;
|
||||
}
|
||||
else if (isInInlineImage && op.Data == "ID")
|
||||
{
|
||||
// Special case handling for inline images.
|
||||
var imageData = ReadInlineImageData();
|
||||
isInInlineImage = false;
|
||||
CurrentToken = new InlineImageDataToken(imageData);
|
||||
hasBytePreRead = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
CurrentToken = token;
|
||||
|
||||
/*
|
||||
@@ -190,6 +206,35 @@
|
||||
customTokenizers.RemoveAll(x => ReferenceEquals(x.tokenizer, tokenizer));
|
||||
}
|
||||
|
||||
/// <summary>
/// Reads the raw bytes of an inline image, starting after the image data (ID) operator and stopping
/// at the first "EI" byte pair, which is not included in the result.
/// </summary>
/// <returns>The bytes between the ID operator's trailing white-space and the EI marker.</returns>
/// <exception cref="PdfDocumentFormatException">
/// The ID operator is not followed by white-space, or the input ends before an EI marker is found.
/// </exception>
private IReadOnlyList<byte> ReadInlineImageData()
{
    // The ID operator should be followed by a single white-space character, and the next character is interpreted
    // as the first byte of image data. Any PDF white-space byte is accepted (NUL, tab, line feed, form feed,
    // carriage return, space), not only the space character.
    // NOTE(review): for a CR LF pair after ID the LF is currently kept as the first data byte - confirm desired handling.
    var delimiter = inputBytes.CurrentByte;
    var isWhitespace = delimiter == 0 || delimiter == '\t' || delimiter == '\n'
                       || delimiter == '\f' || delimiter == '\r' || delimiter == ' ';

    if (!isWhitespace)
    {
        throw new PdfDocumentFormatException($"No whitespace character following the image data (ID) operator. Position: {inputBytes.CurrentOffset}.");
    }

    // Used for error reporting only.
    var startsAt = inputBytes.CurrentOffset - 2;

    var imageData = new List<byte>();
    byte prevByte = 0;
    while (inputBytes.MoveNext())
    {
        // NOTE(review): the first "EI" byte pair ends the image even if it occurs inside binary image data;
        // checking the surrounding bytes for white-space would make this more robust - confirm before changing.
        if (inputBytes.CurrentByte == 'I' && prevByte == 'E')
        {
            // The 'E' was already stored on the previous iteration; drop it so only image bytes remain.
            imageData.RemoveAt(imageData.Count - 1);
            return imageData;
        }

        imageData.Add(inputBytes.CurrentByte);

        prevByte = inputBytes.CurrentByte;
    }

    throw new PdfDocumentFormatException($"No end of inline image data (EI) was found for image data at position {startsAt}.");
}
|
||||
|
||||
private static bool IsEmpty(byte b)
|
||||
{
|
||||
return b == ' ' || b == '\r' || b == '\n' || b == 0;
|
||||
|
22
src/UglyToad.PdfPig/Tokens/InlineImageDataToken.cs
Normal file
22
src/UglyToad.PdfPig/Tokens/InlineImageDataToken.cs
Normal file
@@ -0,0 +1,22 @@
|
||||
namespace UglyToad.PdfPig.Tokens
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// Inline image data is used to embed images in PDF content streams. The content is wrapped by ID and EI tags in a BI operation.
/// </summary>
public class InlineImageDataToken : IDataToken<IReadOnlyList<byte>>
{
    /// <inheritdoc />
    public IReadOnlyList<byte> Data { get; }

    /// <summary>
    /// Create a new <see cref="InlineImageDataToken"/>.
    /// </summary>
    /// <param name="data">The raw bytes of the inline image.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="data"/> is <see langword="null"/>.</exception>
    public InlineImageDataToken(IReadOnlyList<byte> data)
    {
        // Fail here rather than later, when the data is handed to an operation that rejects null.
        Data = data ?? throw new System.ArgumentNullException(nameof(data));
    }
}
|
||||
}
|
@@ -6,15 +6,15 @@
|
||||
<DebugType>full</DebugType>
|
||||
<Authors>UglyToad</Authors>
|
||||
<Title>PdfPig</Title>
|
||||
<Description>Reads text content from PDF documents and supports document creation.</Description>
|
||||
<Description>Reads text content from PDF documents and supports document creation. Apache 2.0 licensed.</Description>
|
||||
<PackageLicenseUrl>https://raw.githubusercontent.com/UglyToad/PdfPig/master/LICENSE</PackageLicenseUrl>
|
||||
<PackageProjectUrl>https://github.com/UglyToad/PdfPig</PackageProjectUrl>
|
||||
<PackageTags>PDF;Reader;Document;Adobe;PDFBox;PdfPig;pdf-extract</PackageTags>
|
||||
<RepositoryUrl>https://github.com/UglyToad/PdfPig</RepositoryUrl>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<Version>0.0.6</Version>
|
||||
<AssemblyVersion>0.0.6.0</AssemblyVersion>
|
||||
<FileVersion>0.0.6.0</FileVersion>
|
||||
<Version>0.0.7</Version>
|
||||
<AssemblyVersion>0.0.7.0</AssemblyVersion>
|
||||
<FileVersion>0.0.7.0</FileVersion>
|
||||
<PackageIconUrl>https://raw.githubusercontent.com/UglyToad/PdfPig/master/documentation/pdfpig.png</PackageIconUrl>
|
||||
<Product>PdfPig</Product>
|
||||
<PublishRepositoryUrl>true</PublishRepositoryUrl>
|
||||
|
Reference in New Issue
Block a user