#11 early access to the raw xobjects for images.

Temporary 'safe' but untested implementation of seac for Type 1 charstrings.
Make Structure public.
Set the version of the package and project to 0.0.3 (it had accidentally been increased to 0.0.5).
This commit is contained in:
Eliot Jones
2018-11-26 19:46:41 +00:00
parent 48fa4a4f15
commit 997979cc92
18 changed files with 199 additions and 46 deletions

View File

@@ -17,7 +17,7 @@
// Add the full path back on, we removed it so we could see it in the test explorer.
documentName = Path.Combine(DocumentFolder.Value, documentName);
using (var document = PdfDocument.Open(documentName, new ParsingOptions{ UseLenientParsing = false}))
using (var document = PdfDocument.Open(documentName, new ParsingOptions { UseLenientParsing = false }))
{
for (var i = 0; i < document.NumberOfPages; i++)
{
@@ -32,11 +32,11 @@
{
documentName = Path.Combine(DocumentFolder.Value, documentName);
using (var document = PdfDocument.Open(documentName, new ParsingOptions{ UseLenientParsing = false }))
using (var document = PdfDocument.Open(documentName, new ParsingOptions { UseLenientParsing = false }))
{
Assert.NotNull(document.Structure.Catalog);
Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count > 0 , "Cross reference table was empty.");
Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count > 0, "Cross reference table was empty.");
foreach (var objectOffset in document.Structure.CrossReferenceTable.ObjectOffsets)
{
var token = document.Structure.GetObject(objectOffset.Key);
@@ -46,6 +46,31 @@
}
}
/// <summary>
/// Opens the named integration document with lenient parsing disabled and checks that
/// the raw images of every page can be enumerated and report a width and height above zero.
/// </summary>
/// <param name="documentName">The short file name of the document inside the document folder.</param>
[Theory]
[MemberData(nameof(GetAllDocuments))]
public void CanAccessImagesOnEveryPage(string documentName)
{
    // The member data only supplies the file name, so rebuild the full path here.
    var fullPath = Path.Combine(DocumentFolder.Value, documentName);
    var options = new ParsingOptions { UseLenientParsing = false };

    using (var document = PdfDocument.Open(fullPath, options))
    {
        for (var index = 0; index < document.NumberOfPages; index++)
        {
            // GetPage is called with the loop index shifted up by one.
            var pageNumber = index + 1;
            var rawImages = document.GetPage(pageNumber).ExperimentalAccess.GetRawImages();

            Assert.NotNull(rawImages);

            foreach (var rawImage in rawImages)
            {
                Assert.True(rawImage.Width > 0, $"Image had width of zero on page {pageNumber}.");
                Assert.True(rawImage.Height > 0, $"Image had height of zero on page {pageNumber}.");
            }
        }
    }
}
public static IEnumerable<object[]> GetAllDocuments
{
get
@@ -53,7 +78,7 @@
var files = Directory.GetFiles(DocumentFolder.Value, "*.pdf");
// Return the shortname so we can see it in the test explorer.
return files.Select(x => new object[] {Path.GetFileName(x)});
return files.Select(x => new object[] { Path.GetFileName(x) });
}
}
}

View File

@@ -1,5 +1,6 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using System.Linq;
using Xunit;
public class OldGutnishTests
@@ -40,9 +41,9 @@
{
var page = document.GetPage(1);
page.Content.GetImages();
var images = page.ExperimentalAccess.GetRawImages().ToList();
Assert.Single(images);
}
}
}
}

View File

@@ -56,7 +56,8 @@
"UglyToad.PdfPig.Tokens.ObjectToken",
"UglyToad.PdfPig.Tokens.StreamToken",
"UglyToad.PdfPig.Tokens.StringToken",
"UglyToad.PdfPig.Util.IWordExtractor"
"UglyToad.PdfPig.Util.IWordExtractor",
"UglyToad.PdfPig.XObjects.XObjectImage"
};
foreach (var publicTypeName in publicTypeNames)

View File

@@ -4,6 +4,8 @@
using System.Collections.Generic;
using System.Linq;
using Util;
using Util.JetBrains.Annotations;
using XObjects;
/// <summary>
/// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>.
@@ -46,6 +48,12 @@
/// </summary>
public PageSize Size { get; }
/// <summary>
/// Access to members whose future locations within the API will change without warning.
/// </summary>
[NotNull]
public Experimental ExperimentalAccess { get; }
internal Page(int number, MediaBox mediaBox, CropBox cropBox, PageContent content)
{
if (number <= 0)
@@ -63,6 +71,7 @@
Height = mediaBox.Bounds.Height;
Size = mediaBox.Bounds.GetPageSize();
ExperimentalAccess = new Experimental(this);
}
private static string GetText(PageContent content)
@@ -90,5 +99,28 @@
{
return (wordExtractor ?? DefaultWordExtractor.Instance).GetWords(Letters);
}
/// <summary>
/// Groups members which are useful today but which will change in future releases.
/// </summary>
public class Experimental
{
    private readonly Page owner;

    internal Experimental(Page page)
    {
        owner = page;
    }

    /// <summary>
    /// Enumerates the images referenced in this page's content.
    /// Each result is an <see cref="XObjectImage"/>, which carries the raw data
    /// from the PDF's content rather than an image.
    /// </summary>
    /// <returns>The images produced by the page content for this page.</returns>
    public IEnumerable<XObjectImage> GetRawImages() => owner.Content.GetImages();
}
}
}

View File

@@ -4,7 +4,7 @@
using Graphics;
using Graphics.Operations;
using Tokenization.Scanner;
using XObject;
using XObjects;
/// <summary>
///
@@ -38,11 +38,11 @@
this.isLenientParsing = isLenientParsing;
}
public void GetImages()
public IEnumerable<XObjectImage> GetImages()
{
foreach (var contentRecord in xObjects[XObjectType.Image])
{
xObjectFactory.CreateImage(contentRecord, pdfScanner, isLenientParsing);
yield return xObjectFactory.CreateImage(contentRecord, pdfScanner, isLenientParsing);
}
}
}

View File

@@ -1,7 +1,5 @@
namespace UglyToad.PdfPig.Fonts.Type1.CharStrings.Commands.StartFinishOutline
{
using System;
/// <summary>
/// Standard encoding accented character.
/// Makes an accented character from two other characters in the font program.
@@ -29,8 +27,8 @@
var baseCharacter = context.GetCharacter((int)baseCharacterCode);
var accentCharacter = context.GetCharacter((int) accentCharacterCode);
// TODO
throw new NotImplementedException("Not done yet...");
// TODO: full seac implementation.
context.SetPath(baseCharacter);
context.Stack.Clear();
}

View File

@@ -1,10 +1,13 @@
namespace UglyToad.PdfPig.Fonts.Type1.CharStrings.Commands
{
using System;
using System.Collections.Generic;
using Geometry;
using Util.JetBrains.Annotations;
internal class Type1BuildCharContext
{
private readonly Func<int, CharacterPath> characterByIndexFactory;
public IReadOnlyDictionary<int, Type1CharStrings.CommandSequence> Subroutines { get; }
public decimal WidthX { get; set; }
@@ -17,7 +20,8 @@
public bool IsFlexing { get; set; }
public CharacterPath Path { get; } = new CharacterPath();
[NotNull]
public CharacterPath Path { get; private set; } = new CharacterPath();
public PdfPoint CurrentPosition { get; set; }
@@ -27,9 +31,11 @@
public IReadOnlyList<PdfPoint> FlexPoints { get; }
public Type1BuildCharContext(IReadOnlyDictionary<int, Type1CharStrings.CommandSequence> subroutines)
public Type1BuildCharContext(IReadOnlyDictionary<int, Type1CharStrings.CommandSequence> subroutines,
Func<int, CharacterPath> characterByIndexFactory)
{
Subroutines = subroutines;
this.characterByIndexFactory = characterByIndexFactory ?? throw new ArgumentNullException(nameof(characterByIndexFactory));
Subroutines = subroutines ?? throw new ArgumentNullException(nameof(subroutines));
}
public void AddFlexPoint(PdfPoint point)
@@ -39,7 +45,12 @@
public CharacterPath GetCharacter(int characterCode)
{
return null;
return characterByIndexFactory(characterCode);
}
/// <summary>
/// Replaces the current <see cref="Path"/> with the supplied path.
/// </summary>
/// <param name="path">The path to use from now on; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="path"/> is null.</exception>
public void SetPath(CharacterPath path)
{
    if (path == null)
    {
        throw new ArgumentNullException(nameof(path));
    }

    Path = path;
}
public void ClearFlexPoints()

View File

@@ -46,12 +46,15 @@
}
var charStringResults = new Dictionary<string, Type1CharStrings.CommandSequence>(charStrings.Count);
var charStringIndexToName = new Dictionary<int, string>();
foreach (var charString in charStrings)
for (var i = 0; i < charStrings.Count; i++)
{
var charString = charStrings[i];
var commandSequence = ParseSingle(charString.Bytes);
charStringResults[charString.Name] = new Type1CharStrings.CommandSequence(commandSequence);
charStringIndexToName[i] = charString.Name;
}
var subroutineResults = new Dictionary<int, Type1CharStrings.CommandSequence>(subroutines.Count);
@@ -63,7 +66,7 @@
subroutineResults[subroutine.Index] = new Type1CharStrings.CommandSequence(commandSequence);
}
return new Type1CharStrings(charStringResults, subroutineResults);
return new Type1CharStrings(charStringResults, charStringIndexToName, subroutineResults);
}
private static IReadOnlyList<Union<decimal, LazyType1Command>> ParseSingle(IReadOnlyList<byte> charStringBytes)

View File

@@ -8,6 +8,7 @@
internal class Type1CharStrings
{
private readonly IReadOnlyDictionary<int, string> charStringIndexToName;
private readonly object locker = new object();
private readonly Dictionary<string, CharacterPath> glyphs = new Dictionary<string, CharacterPath>();
@@ -15,8 +16,10 @@
public IReadOnlyDictionary<int, CommandSequence> Subroutines { get; }
public Type1CharStrings(IReadOnlyDictionary<string, CommandSequence> charStrings, IReadOnlyDictionary<int, CommandSequence> subroutines)
public Type1CharStrings(IReadOnlyDictionary<string, CommandSequence> charStrings, IReadOnlyDictionary<int, string> charStringIndexToName,
IReadOnlyDictionary<int, CommandSequence> subroutines)
{
this.charStringIndexToName = charStringIndexToName ?? throw new ArgumentNullException(nameof(charStringIndexToName));
CharStrings = charStrings ?? throw new ArgumentNullException(nameof(charStrings));
Subroutines = subroutines ?? throw new ArgumentNullException(nameof(subroutines));
}
@@ -46,7 +49,29 @@
private CharacterPath Run(CommandSequence sequence)
{
var context = new Type1BuildCharContext(Subroutines);
var context = new Type1BuildCharContext(Subroutines, i =>
{
if (!charStringIndexToName.TryGetValue(i, out var name))
{
throw new InvalidOperationException($"Tried to retrieve Type 1 charstring by index {i} which did not exist.");
}
if (glyphs.TryGetValue(name, out var result))
{
return result;
}
if (!CharStrings.TryGetValue(name, out var charstring))
{
throw new InvalidOperationException($"Tried to retrieve Type 1 charstring by index {i} which mapped to name {name} but was not found in the charstrings.");
}
var path = Run(charstring);
glyphs[name] = path;
return path;
});
foreach (var command in sequence.Commands)
{
command.Match(x => context.Stack.Push(x),

View File

@@ -12,7 +12,7 @@
using Tokenization.Scanner;
using Tokens;
using Util;
using XObject;
using XObjects;
internal class ContentStreamProcessor : IOperationContext
{

View File

@@ -4,7 +4,7 @@
using PdfPig.Core;
using Tokens;
using Util.JetBrains.Annotations;
using XObject;
using XObjects;
internal class XObjectContentRecord
{

View File

@@ -12,7 +12,7 @@
using Tokenization.Scanner;
using Tokens;
using Util;
using XObject;
using XObjects;
internal class PageFactory : IPageFactory
{

View File

@@ -22,7 +22,7 @@
using Tokenization.Scanner;
using Tokens;
using Util;
using XObject;
using XObjects;
internal static class PdfDocumentFactory
{

View File

@@ -45,7 +45,7 @@
/// Access to the underlying raw structure of the document.
/// </summary>
[NotNull]
internal Structure Structure { get; }
public Structure Structure { get; }
/// <summary>
/// The version number of the PDF specification which this file conforms to, for example 1.4.

View File

@@ -12,9 +12,9 @@
<PackageTags>PDF;Reader;Document;Adobe;PDFBox;PdfPig</PackageTags>
<RepositoryUrl>https://github.com/UglyToad/PdfPig</RepositoryUrl>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<Version>0.0.5</Version>
<AssemblyVersion>0.0.1.3</AssemblyVersion>
<FileVersion>0.0.1.3</FileVersion>
<Version>0.0.3</Version>
<AssemblyVersion>0.0.3.0</AssemblyVersion>
<FileVersion>0.0.3.0</FileVersion>
<PackageIconUrl>https://raw.githubusercontent.com/UglyToad/PdfPig/master/documentation/pdfpig.png</PackageIconUrl>
<Product>PdfPig</Product>
<PublishRepositoryUrl>true</PublishRepositoryUrl>

View File

@@ -1,4 +1,4 @@
namespace UglyToad.PdfPig.XObject
namespace UglyToad.PdfPig.XObjects
{
using System;
using Graphics;
@@ -7,7 +7,7 @@
internal class XObjectFactory
{
public void CreateImage(XObjectContentRecord xObject, IPdfTokenScanner pdfScanner, bool isLenientParsing)
public XObjectImage CreateImage(XObjectContentRecord xObject, IPdfTokenScanner pdfScanner, bool isLenientParsing)
{
if (xObject == null)
{
@@ -25,23 +25,13 @@
var isJpxDecode = xObject.Stream.StreamDictionary.TryGet(NameToken.Filter, out var token)
&& token is NameToken filterName
&& filterName.Equals(NameToken.JpxDecode);
if (isJpxDecode)
{
return;
}
var isImageMask = xObject.Stream.StreamDictionary.TryGet(NameToken.ImageMask, out var maskToken)
&& maskToken is BooleanToken maskBoolean
&& maskBoolean.Data;
if (isImageMask)
{
return;
}
var bitsPerComponents = xObject.Stream.StreamDictionary.Get<NumericToken>(NameToken.BitsPerComponent, pdfScanner).Int;
return new XObjectImage(width, height, isJpxDecode, isImageMask, xObject.Stream.StreamDictionary,
xObject.Stream.Data);
}
}
}

View File

@@ -0,0 +1,67 @@
namespace UglyToad.PdfPig.XObjects
{
using System;
using System.Collections.Generic;
using Tokens;
using Util.JetBrains.Annotations;
/// <summary>
/// The raw stream from a PDF document which represents an image XObject.
/// </summary>
public class XObjectImage
{
    /// <summary>
    /// The width of the image, measured in samples.
    /// </summary>
    public int Width { get; }

    /// <summary>
    /// The height of the image, measured in samples.
    /// </summary>
    public int Height { get; }

    /// <summary>
    /// Whether the image data is encoded with the JPX filter, which uses the JPEG2000 compression method.
    /// A JPEG2000 data stream allows different versions of the image to be decoded,
    /// allowing for thumbnails to be extracted.
    /// </summary>
    public bool IsJpxEncoded { get; }

    /// <summary>
    /// Whether this image should be treated as an image mask.
    /// </summary>
    public bool IsImageMask { get; }

    /// <summary>
    /// The full dictionary for this Image XObject.
    /// </summary>
    [NotNull]
    public DictionaryToken ImageDictionary { get; }

    /// <summary>
    /// The encoded bytes of this image. They must be decoded via any filters
    /// defined in the <see cref="ImageDictionary"/> before they are consumed.
    /// </summary>
    [NotNull]
    public IReadOnlyList<byte> Bytes { get; }

    /// <summary>
    /// Creates a new <see cref="XObjectImage"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException">
    /// Thrown when <paramref name="imageDictionary"/> or <paramref name="bytes"/> is null.
    /// </exception>
    internal XObjectImage(int width, int height, bool isJpxEncoded, bool isImageMask, DictionaryToken imageDictionary, IReadOnlyList<byte> bytes)
    {
        // Validate the reference arguments up front, dictionary first then bytes.
        if (imageDictionary == null)
        {
            throw new ArgumentNullException(nameof(imageDictionary));
        }

        if (bytes == null)
        {
            throw new ArgumentNullException(nameof(bytes));
        }

        ImageDictionary = imageDictionary;
        Bytes = bytes;
        IsImageMask = isImageMask;
        IsJpxEncoded = isJpxEncoded;
        Height = height;
        Width = width;
    }

    /// <inheritdoc />
    public override string ToString() => ImageDictionary.ToString();
}
}

View File

@@ -1,4 +1,4 @@
namespace UglyToad.PdfPig.XObject
namespace UglyToad.PdfPig.XObjects
{
internal enum XObjectType
{