Merge pull request #1 from UglyToad/master

merging updates
This commit is contained in:
vadik299
2019-08-10 10:10:44 -04:00
committed by GitHub
17 changed files with 355 additions and 37 deletions

View File

@@ -42,7 +42,7 @@ New in v0.0.5 - To create documents use the class ```PdfDocumentBuilder```. Thou
byte[] documentBytes = builder.Build();
File.WriteAllBytes(@"C:\git\newPdf.pdf");
File.WriteAllBytes(@"C:\git\newPdf.pdf", documentBytes);
Each font must be registered with the PdfDocumentBuilder prior to use; this enables pages to share the font resources. Currently only Standard 14 fonts and TrueType fonts (.ttf) are supported.

View File

@@ -6,6 +6,7 @@
using System.Linq;
using System.Reflection;
using PdfPig.Graphics.Operations;
using PdfPig.Graphics.Operations.InlineImages;
using PdfPig.Tokens;
using Xunit;
@@ -41,6 +42,10 @@
operation = (IGraphicsStateOperation)field.GetValue(null);
}
else if (operationType == typeof(EndInlineImage))
{
operation = new EndInlineImage(new List<IToken>(), new List<byte>());
}
else
{
var constructor = constructors[0];

View File

@@ -64,6 +64,12 @@
Assert.True(stream.IsAtEnd());
Assert.True(array.IsAtEnd());
stream.Seek(0);
array.Seek(0);
Assert.False(stream.IsAtEnd());
Assert.False(array.IsAtEnd());
}
}
}

View File

@@ -65,6 +65,8 @@
"UglyToad.PdfPig.DocumentLayoutAnalysis.RecursiveXYCut",
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYNode",
"UglyToad.PdfPig.DocumentLayoutAnalysis.XYLeaf",
"UglyToad.PdfPig.DocumentLayoutAnalysis.TextEdgesExtractor",
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
@@ -174,6 +176,7 @@
"UglyToad.PdfPig.Tokens.HexToken",
"UglyToad.PdfPig.Tokens.IDataToken`1",
"UglyToad.PdfPig.Tokens.IndirectReferenceToken",
"UglyToad.PdfPig.Tokens.InlineImageDataToken",
"UglyToad.PdfPig.Tokens.IToken",
"UglyToad.PdfPig.Tokens.NameToken",
"UglyToad.PdfPig.Tokens.NullToken",

View File

@@ -9,7 +9,7 @@
using Util;
using Util.JetBrains.Annotations;
using XObjects;
using UglyToad.PdfPig.Geometry;
using Geometry;
/// <summary>
/// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>.
@@ -30,23 +30,18 @@
internal CropBox CropBox { get; }
internal PageContent Content { get; }
/// <summary>
/// The rotation of the page in degrees (clockwise). Valid values are 0, 90, 180 and 270.
/// </summary>
public PageRotationDegrees Rotation { get; }
internal PageContent Content { get; }
/// <summary>
/// The set of <see cref="Letter"/>s drawn by the PDF content.
/// </summary>
public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0];
/// <summary>
/// The set of <see cref="PdfPath"/>s drawn by the PDF content.
/// </summary>
public IReadOnlyList<PdfPath> Paths => Content?.Paths ?? new List<PdfPath>();
/// <summary>
/// The full text of all characters on the page in the order they are presented in the PDF content.
/// </summary>
@@ -136,6 +131,11 @@
private readonly Page page;
private readonly AnnotationProvider annotationProvider;
/// <summary>
/// The set of <see cref="PdfPath"/>s drawn by the PDF content.
/// </summary>
public IReadOnlyList<PdfPath> Paths => page.Content?.Paths ?? new List<PdfPath>();
internal Experimental(Page page, AnnotationProvider annotationProvider)
{
this.page = page;

View File

@@ -11,7 +11,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// https://en.wikipedia.org/wiki/Recursive_X-Y_cut
/// <para>See 'Recursive X-Y Cut using Bounding Boxes of Connected Components' by Jaekyu Ha, Robert M.Haralick and Ihsin T. Phillips</para>
/// </summary>
public class RecursiveXYCut
public static class RecursiveXYCut
{
/// <summary>
/// Get the blocks.

View File

@@ -0,0 +1,109 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Geometry;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
/// <summary>
/// Text edges extractor. Text edges are where words have either their BoundingBox's left, right or mid coordinates aligned on the same vertical line.
/// <para>Useful to detect text columns, tables, justified text, lists, etc.</para>
/// </summary>
public static class TextEdgesExtractor
{
    /// <summary>
    /// Functions used to define left, middle and right edges. Each function maps a bounding box to the
    /// x-coordinate (rounded to zero decimal places) that words must share to belong to the same edge.
    /// </summary>
    private static readonly Tuple<EdgeType, Func<PdfRectangle, decimal>>[] edgesFuncs = new Tuple<EdgeType, Func<PdfRectangle, decimal>>[]
    {
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Left, x => Math.Round(x.Left, 0)),                 // use BoundingBox's left coordinate
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Mid, x => Math.Round(x.Left + x.Width / 2, 0)),    // use BoundingBox's mid coordinate
        Tuple.Create<EdgeType, Func<PdfRectangle, decimal>>(EdgeType.Right, x => Math.Round(x.Right, 0))                // use BoundingBox's right coordinate
    };

    /// <summary>
    /// Get the text edges.
    /// </summary>
    /// <param name="pageWords">The words in the page. Words whose text is null or white-space are ignored.</param>
    /// <param name="minimumElements">The minimum number of elements to define a text edge. Must not be negative.</param>
    /// <returns>For each <see cref="EdgeType"/>, the vertical lines found for that type of alignment.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pageWords"/> is <see langword="null"/>.</exception>
    /// <exception cref="ArgumentException"><paramref name="minimumElements"/> is negative.</exception>
    public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4)
    {
        if (pageWords == null)
        {
            throw new ArgumentNullException(nameof(pageWords));
        }

        if (minimumElements < 0)
        {
            throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", nameof(minimumElements));
        }

        // Materialise the filtered words once: the sequence is read concurrently by the three workers below
        // and repeatedly inside GetVerticalEdges, so a lazy query would re-run the caller's enumerable each time.
        var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();

        var dictionary = new ConcurrentDictionary<EdgeType, List<PdfLine>>();

        Parallel.ForEach(edgesFuncs, f =>
        {
            dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
        });

        return dictionary.ToDictionary(x => x.Key, x => x.Value);
    }

    /// <summary>
    /// Finds the vertical edges for a single alignment function.
    /// </summary>
    /// <param name="pageWords">The (already filtered) words of the page.</param>
    /// <param name="func">Maps a word's bounding box to the x-coordinate used for grouping.</param>
    /// <param name="minimumElements">The minimum number of words a group needs to be kept as an edge.</param>
    /// <returns>One vertical <see cref="PdfLine"/> per retained group of aligned words.</returns>
    private static List<PdfLine> GetVerticalEdges(IEnumerable<Word> pageWords, Func<PdfRectangle, decimal> func, int minimumElements)
    {
        // A group can never be empty when a line is built from it (Min/Max would throw),
        // so at least one word is required even if the caller passed zero.
        var requiredCount = Math.Max(1, minimumElements);

        Dictionary<decimal, List<Word>> edges = pageWords.GroupBy(x => func(x.BoundingBox))
            .Where(x => x.Count() >= minimumElements)
            .ToDictionary(gdc => gdc.Key, gdc => gdc.ToList());

        var cleanEdges = new Dictionary<decimal, List<List<Word>>>();

        foreach (var edge in edges)
        {
            var sortedEdges = edge.Value.OrderBy(x => x.BoundingBox.Bottom).ToList();
            var groups = new List<List<Word>>();
            cleanEdges.Add(edge.Key, groups);

            // Vertical extent of the candidate edge, computed once rather than per candidate word.
            var edgeBottom = edge.Value.Min(z => z.BoundingBox.Bottom);
            var edgeTop = edge.Value.Max(z => z.BoundingBox.Top);

            var cuttings = pageWords.Except(edge.Value) // remove selected words
                // words that cut the vertical line
                .Where(x => x.BoundingBox.Left < edge.Key && x.BoundingBox.Right > edge.Key)
                // and that are within the boundaries of the edge
                .Where(k => k.BoundingBox.Bottom > edgeBottom && k.BoundingBox.Top < edgeTop)
                .OrderBy(x => x.BoundingBox.Bottom)
                .ToList();

            if (cuttings.Count > 0)
            {
                // Split the edge at every word crossing it; each piece is kept only if it is still large enough.
                foreach (var cut in cuttings)
                {
                    var group1 = sortedEdges.Where(x => x.BoundingBox.Top < cut.BoundingBox.Bottom).ToList();
                    if (group1.Count >= requiredCount)
                    {
                        groups.Add(group1);
                    }

                    sortedEdges = sortedEdges.Except(group1).ToList();
                }

                if (sortedEdges.Count >= requiredCount)
                {
                    groups.Add(sortedEdges);
                }
            }
            else
            {
                groups.Add(sortedEdges);
            }
        }

        return cleanEdges
            .SelectMany(x => x.Value.Select(y => new PdfLine(x.Key, y.Min(w => w.BoundingBox.Bottom), x.Key, y.Max(w => w.BoundingBox.Top))))
            .ToList();
    }
}
/// <summary>
/// The type of text edge, i.e. which coordinate of the words' bounding boxes is aligned on a shared vertical line.
/// </summary>
public enum EdgeType
{
/// <summary>
/// Text edges where words have their BoundingBox's left coordinate aligned on the same vertical line.
/// </summary>
Left = 0,
/// <summary>
/// Text edges where words have their BoundingBox's mid coordinate (left plus half the width) aligned on the same vertical line.
/// </summary>
Mid = 1,
/// <summary>
/// Text edges where words have their BoundingBox's right coordinate aligned on the same vertical line.
/// </summary>
Right = 2
}
}

View File

@@ -48,6 +48,11 @@
public decimal GetWidthFromFont(int characterIdentifier)
{
if (fontProgram == null)
{
return GetWidthFromDictionary(characterIdentifier);
}
if (fontProgram.TryGetBoundingAdvancedWidth(characterIdentifier, cidToGid.GetGlyphIndex, out var width))
{
return width;

View File

@@ -5,19 +5,28 @@ namespace UglyToad.PdfPig.Geometry
using System.Collections.Generic;
using System.Linq;
using System.Text;
using UglyToad.PdfPig.Core;
using Core;
/// <summary>
/// A path in a PDF document, used by glyphs and page content.
/// A path in a PDF document, used by glyphs and page content. Can contain multiple sub-paths.
/// </summary>
public class PdfPath
{
private readonly List<IPathCommand> commands = new List<IPathCommand>();
/// <summary>
/// The sequence of sub-paths which form this <see cref="PdfPath"/>.
/// </summary>
public IReadOnlyList<IPathCommand> Commands => commands;
private PdfPoint? currentPosition;
private TransformationMatrix currentTransformationMatrix = TransformationMatrix.Identity;
private readonly TransformationMatrix currentTransformationMatrix;
/// <summary>
/// Create a new <see cref="PdfPath"/>.
/// </summary>
/// <param name="transformationMatrix">The transformation to apply to all points in this path.</param>
public PdfPath(TransformationMatrix transformationMatrix)
{
currentTransformationMatrix = transformationMatrix;
@@ -162,79 +171,140 @@ namespace UglyToad.PdfPig.Geometry
return result;
}
/// <summary>
/// A command in a <see cref="PdfPath"/>.
/// </summary>
public interface IPathCommand
{
/// <summary>
/// Returns the smallest rectangle which contains the path region given by this command.
/// </summary>
/// <returns>The bounding rectangle, or <see langword="null"/> for commands which generate no visible path.</returns>
PdfRectangle? GetBoundingRectangle();
/// <summary>
/// Converts from the path command to an SVG string representing the path operation.
/// </summary>
/// <param name="builder">The builder to which the SVG text for this command is appended.</param>
void WriteSvg(StringBuilder builder);
}
private class Close : IPathCommand
/// <summary>
/// Path command which closes the current <see cref="PdfPath"/>.
/// </summary>
public class Close : IPathCommand
{
    /// <summary>
    /// Always <see langword="null"/>: this command has no bounding rectangle of its own.
    /// </summary>
    public PdfRectangle? GetBoundingRectangle() => null;

    /// <summary>
    /// Appends the SVG close-path instruction ("Z ") to the builder.
    /// </summary>
    public void WriteSvg(StringBuilder builder) => builder.Append("Z ");
}
/// <summary>
/// Move drawing of the current <see cref="PdfPath"/> to the specified location.
/// </summary>
public class Move : IPathCommand
{
    /// <summary>
    /// The location to move to.
    /// </summary>
    public PdfPoint Location { get; }

    /// <summary>
    /// Create a new <see cref="Move"/> path command.
    /// </summary>
    /// <param name="location">The location to move to.</param>
    public Move(PdfPoint location)
    {
        Location = location;
    }

    /// <summary>
    /// Returns <see langword="null"/> since this generates no visible path.
    /// </summary>
    public PdfRectangle? GetBoundingRectangle()
    {
        return null;
    }

    /// <summary>
    /// Appends the SVG move-to instruction ("M x y ") for <see cref="Location"/> to the builder.
    /// </summary>
    /// <param name="builder">The builder to which the SVG text is appended.</param>
    public void WriteSvg(StringBuilder builder)
    {
        // SVG path data requires '.' as the decimal separator, so the coordinates must not be
        // formatted with the current culture (which may use ',').
        builder.AppendFormat(System.Globalization.CultureInfo.InvariantCulture, "M {0} {1} ", Location.X, Location.Y);
    }
}
/// <summary>
/// Draw a straight line between two points.
/// </summary>
public class Line : IPathCommand
{
    /// <summary>
    /// The start of the line.
    /// </summary>
    public PdfPoint From { get; }

    /// <summary>
    /// The end of the line.
    /// </summary>
    public PdfPoint To { get; }

    /// <summary>
    /// Create a new <see cref="Line"/>.
    /// </summary>
    /// <param name="from">The start of the line.</param>
    /// <param name="to">The end of the line.</param>
    public Line(PdfPoint from, PdfPoint to)
    {
        From = from;
        To = to;
    }

    /// <summary>
    /// Returns the rectangle constructed from <see cref="From"/> and <see cref="To"/>.
    /// </summary>
    public PdfRectangle? GetBoundingRectangle()
    {
        return new PdfRectangle(From, To);
    }

    /// <summary>
    /// Appends the SVG line-to instruction ("L x y ") for <see cref="To"/> to the builder.
    /// </summary>
    /// <param name="builder">The builder to which the SVG text is appended.</param>
    public void WriteSvg(StringBuilder builder)
    {
        // SVG path data requires '.' as the decimal separator, so the coordinates must not be
        // formatted with the current culture (which may use ',').
        builder.AppendFormat(System.Globalization.CultureInfo.InvariantCulture, "L {0} {1} ", To.X, To.Y);
    }
}
/// <summary>
/// Draw a Bezier curve given by the start, control and end points.
/// </summary>
public class BezierCurve : IPathCommand
{
/// <summary>
/// The start point of the Bezier curve.
/// </summary>
public PdfPoint StartPoint { get; }
/// <summary>
/// The first control point of the curve.
/// </summary>
public PdfPoint FirstControlPoint { get; }
/// <summary>
/// The second control point of the curve.
/// </summary>
public PdfPoint SecondControlPoint { get; }
/// <summary>
/// The end point of the curve.
/// </summary>
public PdfPoint EndPoint { get; }
/// <summary>
/// Create a Bezier curve at the provided points.
/// </summary>
public BezierCurve(PdfPoint startPoint, PdfPoint firstControlPoint, PdfPoint secondControlPoint, PdfPoint endPoint)
{
StartPoint = startPoint;
@@ -243,6 +313,7 @@ namespace UglyToad.PdfPig.Geometry
EndPoint = endPoint;
}
/// <inheritdoc />
public PdfRectangle? GetBoundingRectangle()
{
// Optimised
@@ -287,6 +358,13 @@ namespace UglyToad.PdfPig.Geometry
return new PdfRectangle((decimal)minX, (decimal)minY, (decimal)maxX, (decimal)maxY);
}
/// <summary>
/// Appends the SVG cubic Bezier instruction ("C x1 y1, x2 y2, x y ") for this curve to the builder,
/// using the two control points and the end point.
/// </summary>
/// <param name="builder">The builder to which the SVG text is appended.</param>
public void WriteSvg(StringBuilder builder)
{
    // SVG path data requires '.' as the decimal separator, so the coordinates must not be
    // formatted with the current culture (which may use ',' and would collide with the separators here).
    builder.AppendFormat(
        System.Globalization.CultureInfo.InvariantCulture,
        "C {0} {1}, {2} {3}, {4} {5} ",
        FirstControlPoint.X,
        FirstControlPoint.Y,
        SecondControlPoint.X,
        SecondControlPoint.Y,
        EndPoint.X,
        EndPoint.Y);
}
private bool TrySolveQuadratic(bool isX, double currentMin, double currentMax, out (double min, double max) solutions)
{
@@ -378,12 +456,6 @@ namespace UglyToad.PdfPig.Geometry
return p;
}
public void WriteSvg(StringBuilder builder)
{
builder.AppendFormat("C {0} {1}, {2} {3}, {4} {5} ", FirstControlPoint.X, FirstControlPoint.Y, SecondControlPoint.X, SecondControlPoint.Y,
EndPoint.X, EndPoint.Y);
}
}
internal void Rectangle(decimal x, decimal y, decimal width, decimal height)

View File

@@ -82,6 +82,27 @@
return new PdfVector(X, Y);
}
/// <summary>
/// Determines whether the given object is a <see cref="PdfPoint"/> with the same X and Y values as this one.
/// </summary>
/// <param name="obj">The object to compare against.</param>
/// <returns><see langword="true"/> when <paramref name="obj"/> is a <see cref="PdfPoint"/> with equal coordinates; otherwise <see langword="false"/>.</returns>
public override bool Equals(object obj)
{
    return obj is PdfPoint other && other.X == X && other.Y == Y;
}
/// <summary>
/// Computes a hash code for this <see cref="PdfPoint"/> from its X and Y values.
/// </summary>
public override int GetHashCode()
{
    var coordinates = (X, Y);
    return coordinates.GetHashCode();
}
/// <summary>
/// Get a string representation of this point.
/// </summary>

View File

@@ -30,6 +30,11 @@
/// </summary>
public PdfPoint BottomLeft { get; }
/// <summary>
/// Centroid point of the rectangle.
/// </summary>
public PdfPoint Centroid { get; }
/// <summary>
/// Width of the rectangle.
/// </summary>
@@ -105,15 +110,14 @@
BottomLeft = new PdfPoint(left, bottom);
BottomRight = new PdfPoint(right, bottom);
Centroid = new PdfPoint(left + (right - left) / 2, bottom + (top - bottom) / 2);
}
internal PdfRectangle(PdfVector topLeft, PdfVector topRight, PdfVector bottomLeft, PdfVector bottomRight)
: this(topLeft.ToPoint(), topRight.ToPoint(), bottomLeft.ToPoint(), bottomRight.ToPoint())
{
TopLeft = topLeft.ToPoint();
TopRight = topRight.ToPoint();
BottomLeft = bottomLeft.ToPoint();
BottomRight = bottomRight.ToPoint();
}
internal PdfRectangle(PdfPoint topLeft, PdfPoint topRight, PdfPoint bottomLeft, PdfPoint bottomRight)
@@ -123,6 +127,8 @@
BottomLeft = bottomLeft;
BottomRight = bottomRight;
Centroid = new PdfPoint(topLeft.X + (topRight.X - topLeft.X) / 2, bottomLeft.Y + (topLeft.Y - bottomLeft.Y) / 2);
}
/// <summary>

View File

@@ -1,6 +1,9 @@
namespace UglyToad.PdfPig.Graphics.Operations.InlineImages
{
using System;
using System.Collections.Generic;
using System.IO;
using Tokens;
/// <inheritdoc />
/// <summary>
@@ -14,15 +17,27 @@
public const string Symbol = "EI";
/// <summary>
/// The instance of the <see cref="EndInlineImage"/> operation.
/// The tokens declared in order for this inline image object.
/// </summary>
public static readonly EndInlineImage Value = new EndInlineImage();
public IReadOnlyList<IToken> ImageTokens { get; }
/// <summary>
/// The raw data for the inline image which should be interpreted according to the <see cref="ImageTokens"/>.
/// </summary>
public IReadOnlyList<byte> ImageData { get; }
/// <inheritdoc />
public string Operator => Symbol;
private EndInlineImage()
/// <summary>
/// Create a new <see cref="EndInlineImage"/> operation.
/// </summary>
/// <param name="imageTokens">The tokens which were set during the declaration of this image.</param>
/// <param name="imageData">The raw byte data of this image.</param>
/// <exception cref="ArgumentNullException">Either argument is <see langword="null"/>.</exception>
public EndInlineImage(IReadOnlyList<IToken> imageTokens, IReadOnlyList<byte> imageData)
{
    if (imageTokens == null)
    {
        throw new ArgumentNullException(nameof(imageTokens));
    }

    // Take a snapshot of the tokens: the content stream parser passes its working list of
    // preceding tokens and clears it immediately afterwards, which would otherwise empty ImageTokens.
    ImageTokens = new List<IToken>(imageTokens);
    ImageData = imageData ?? throw new ArgumentNullException(nameof(imageData));
}
/// <inheritdoc />

View File

@@ -75,6 +75,8 @@
public void Seek(long position)
{
isAtEnd = false;
if (position == 0)
{
stream.Seek(0, SeekOrigin.Begin);

View File

@@ -3,6 +3,7 @@
using System.Collections.Generic;
using Graphics;
using Graphics.Operations;
using Graphics.Operations.InlineImages;
using IO;
using Tokenization.Scanner;
using Tokens;
@@ -27,7 +28,13 @@
{
var token = scanner.CurrentToken;
if (token is OperatorToken op)
if (token is InlineImageDataToken inlineImageData)
{
graphicsStateOperations.Add(BeginInlineImageData.Value);
graphicsStateOperations.Add(new EndInlineImage(precedingTokens, inlineImageData.Data));
precedingTokens.Clear();
}
else if (token is OperatorToken op)
{
var operation = operationFactory.Create(op, precedingTokens);

View File

@@ -20,11 +20,12 @@
private readonly ScannerScope scope;
private readonly IInputBytes inputBytes;
private readonly List<byte> currentBuffer = new List<byte>();
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
internal long CurrentTokenStart { get; private set; }
public IToken CurrentToken { get; private set; }
public bool TryReadToken<T>(out T token) where T : class, IToken
{
token = default(T);
@@ -51,6 +52,7 @@
public long CurrentPosition => inputBytes.CurrentOffset;
private bool hasBytePreRead;
private bool isInInlineImage;
internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None)
{
@@ -60,8 +62,6 @@
public bool MoveNext()
{
currentBuffer.Clear();
var endAngleBracesRead = 0;
bool isSkippingSymbol = false;
@@ -89,7 +89,6 @@
continue;
}
// If we failed to read the symbol for whatever reason we pass over it.
if (isSkippingSymbol && c != '>')
{
@@ -161,6 +160,23 @@
continue;
}
if (token is OperatorToken op)
{
if (op.Data == "BI")
{
isInInlineImage = true;
}
else if (isInInlineImage && op.Data == "ID")
{
// Special case handling for inline images.
var imageData = ReadInlineImageData();
isInInlineImage = false;
CurrentToken = new InlineImageDataToken(imageData);
hasBytePreRead = false;
return true;
}
}
CurrentToken = token;
/*
@@ -190,6 +206,35 @@
customTokenizers.RemoveAll(x => ReferenceEquals(x.tokenizer, tokenizer));
}
/// <summary>
/// Reads the raw bytes of an inline image, starting after the white-space which follows the ID operator
/// and stopping at the first occurrence of the bytes "EI".
/// </summary>
/// <returns>The bytes read before the terminating "EI", which is itself excluded.</returns>
/// <exception cref="PdfDocumentFormatException">
/// The ID operator is not followed by a white-space character, or the input ends before "EI" is found.
/// </exception>
private IReadOnlyList<byte> ReadInlineImageData()
{
    // The ID operator should be followed by a single white-space character, and the next character is interpreted
    // as the first byte of image data. Any PDF white-space character is accepted (null, tab, line feed,
    // form feed, carriage return, space), not only the space character.
    var separator = inputBytes.CurrentByte;
    var isWhitespace = separator == ' ' || separator == '\n' || separator == '\r'
                       || separator == '\t' || separator == '\f' || separator == 0;

    if (!isWhitespace)
    {
        throw new PdfDocumentFormatException($"No whitespace character following the image data (ID) operator. Position: {inputBytes.CurrentOffset}.");
    }

    // Remembered only for the error message when no terminator is found.
    var startsAt = inputBytes.CurrentOffset - 2;

    var imageData = new List<byte>();
    byte prevByte = 0;

    while (inputBytes.MoveNext())
    {
        if (inputBytes.CurrentByte == 'I' && prevByte == 'E')
        {
            // The 'E' of the terminator was already appended on the previous iteration; drop it.
            imageData.RemoveAt(imageData.Count - 1);
            return imageData;
        }

        imageData.Add(inputBytes.CurrentByte);
        prevByte = inputBytes.CurrentByte;
    }

    throw new PdfDocumentFormatException($"No end of inline image data (EI) was found for image data at position {startsAt}.");
}
private static bool IsEmpty(byte b)
{
return b == ' ' || b == '\r' || b == '\n' || b == 0;

View File

@@ -0,0 +1,22 @@
namespace UglyToad.PdfPig.Tokens
{
using System.Collections.Generic;
/// <summary>
/// Inline image data is used to embed images in PDF content streams. The content is wrapped by ID and EI tags in a BI operation.
/// </summary>
public class InlineImageDataToken : IDataToken<IReadOnlyList<byte>>
{
    /// <inheritdoc />
    public IReadOnlyList<byte> Data { get; }

    /// <summary>
    /// Create a new <see cref="InlineImageDataToken"/>.
    /// </summary>
    /// <param name="data">The raw bytes of the inline image.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="data"/> is <see langword="null"/>.</exception>
    public InlineImageDataToken(IReadOnlyList<byte> data)
    {
        // Fail at construction rather than later, when the data is handed on to the EndInlineImage operation.
        Data = data ?? throw new System.ArgumentNullException(nameof(data));
    }
}
}

View File

@@ -6,15 +6,15 @@
<DebugType>full</DebugType>
<Authors>UglyToad</Authors>
<Title>PdfPig</Title>
<Description>Reads text content from PDF documents and supports document creation.</Description>
<Description>Reads text content from PDF documents and supports document creation. Apache 2.0 licensed.</Description>
<PackageLicenseUrl>https://raw.githubusercontent.com/UglyToad/PdfPig/master/LICENSE</PackageLicenseUrl>
<PackageProjectUrl>https://github.com/UglyToad/PdfPig</PackageProjectUrl>
<PackageTags>PDF;Reader;Document;Adobe;PDFBox;PdfPig;pdf-extract</PackageTags>
<RepositoryUrl>https://github.com/UglyToad/PdfPig</RepositoryUrl>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<Version>0.0.6</Version>
<AssemblyVersion>0.0.6.0</AssemblyVersion>
<FileVersion>0.0.6.0</FileVersion>
<Version>0.0.7</Version>
<AssemblyVersion>0.0.7.0</AssemblyVersion>
<FileVersion>0.0.7.0</FileVersion>
<PackageIconUrl>https://raw.githubusercontent.com/UglyToad/PdfPig/master/documentation/pdfpig.png</PackageIconUrl>
<Product>PdfPig</Product>
<PublishRepositoryUrl>true</PublishRepositoryUrl>