#11 early access to the raw xobjects for images.

temporary 'safe' untested implementation of seac for type 1 charstrings.
make structure public
bump version of package and project to 0.0.3 (it had accidentally increased to 0.0.5)
This commit is contained in:
Eliot Jones
2018-11-26 19:46:41 +00:00
parent 48fa4a4f15
commit 997979cc92
18 changed files with 199 additions and 46 deletions

View File

@@ -17,7 +17,7 @@
// Add the full path back on, we removed it so we could see it in the test explorer. // Add the full path back on, we removed it so we could see it in the test explorer.
documentName = Path.Combine(DocumentFolder.Value, documentName); documentName = Path.Combine(DocumentFolder.Value, documentName);
using (var document = PdfDocument.Open(documentName, new ParsingOptions{ UseLenientParsing = false})) using (var document = PdfDocument.Open(documentName, new ParsingOptions { UseLenientParsing = false }))
{ {
for (var i = 0; i < document.NumberOfPages; i++) for (var i = 0; i < document.NumberOfPages; i++)
{ {
@@ -32,11 +32,11 @@
{ {
documentName = Path.Combine(DocumentFolder.Value, documentName); documentName = Path.Combine(DocumentFolder.Value, documentName);
using (var document = PdfDocument.Open(documentName, new ParsingOptions{ UseLenientParsing = false })) using (var document = PdfDocument.Open(documentName, new ParsingOptions { UseLenientParsing = false }))
{ {
Assert.NotNull(document.Structure.Catalog); Assert.NotNull(document.Structure.Catalog);
Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count > 0 , "Cross reference table was empty."); Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count > 0, "Cross reference table was empty.");
foreach (var objectOffset in document.Structure.CrossReferenceTable.ObjectOffsets) foreach (var objectOffset in document.Structure.CrossReferenceTable.ObjectOffsets)
{ {
var token = document.Structure.GetObject(objectOffset.Key); var token = document.Structure.GetObject(objectOffset.Key);
@@ -46,6 +46,31 @@
} }
} }
/// <summary>
/// Opens the named document with strict parsing and checks that the raw images of every page
/// can be retrieved and that each image reports a width and height greater than zero.
/// </summary>
/// <param name="documentName">The file name of the document inside the document folder.</param>
[Theory]
[MemberData(nameof(GetAllDocuments))]
public void CanAccessImagesOnEveryPage(string documentName)
{
    // The test data only carries the short file name, so restore the full path here.
    var fullPath = Path.Combine(DocumentFolder.Value, documentName);
    var options = new ParsingOptions { UseLenientParsing = false };

    using (var document = PdfDocument.Open(fullPath, options))
    {
        // Page numbers passed to GetPage start at one.
        for (var pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
        {
            var rawImages = document.GetPage(pageNumber).ExperimentalAccess.GetRawImages();

            Assert.NotNull(rawImages);

            foreach (var rawImage in rawImages)
            {
                Assert.True(rawImage.Width > 0, $"Image had width of zero on page {pageNumber}.");
                Assert.True(rawImage.Height > 0, $"Image had height of zero on page {pageNumber}.");
            }
        }
    }
}
public static IEnumerable<object[]> GetAllDocuments public static IEnumerable<object[]> GetAllDocuments
{ {
get get
@@ -53,7 +78,7 @@
var files = Directory.GetFiles(DocumentFolder.Value, "*.pdf"); var files = Directory.GetFiles(DocumentFolder.Value, "*.pdf");
// Return the shortname so we can see it in the test explorer. // Return the shortname so we can see it in the test explorer.
return files.Select(x => new object[] {Path.GetFileName(x)}); return files.Select(x => new object[] { Path.GetFileName(x) });
} }
} }
} }

View File

@@ -1,5 +1,6 @@
namespace UglyToad.PdfPig.Tests.Integration namespace UglyToad.PdfPig.Tests.Integration
{ {
using System.Linq;
using Xunit; using Xunit;
public class OldGutnishTests public class OldGutnishTests
@@ -40,9 +41,9 @@
{ {
var page = document.GetPage(1); var page = document.GetPage(1);
page.Content.GetImages(); var images = page.ExperimentalAccess.GetRawImages().ToList();
Assert.Single(images);
} }
} }
} }
} }

View File

@@ -56,7 +56,8 @@
"UglyToad.PdfPig.Tokens.ObjectToken", "UglyToad.PdfPig.Tokens.ObjectToken",
"UglyToad.PdfPig.Tokens.StreamToken", "UglyToad.PdfPig.Tokens.StreamToken",
"UglyToad.PdfPig.Tokens.StringToken", "UglyToad.PdfPig.Tokens.StringToken",
"UglyToad.PdfPig.Util.IWordExtractor" "UglyToad.PdfPig.Util.IWordExtractor",
"UglyToad.PdfPig.XObjects.XObjectImage"
}; };
foreach (var publicTypeName in publicTypeNames) foreach (var publicTypeName in publicTypeNames)

View File

@@ -4,6 +4,8 @@
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq; using System.Linq;
using Util; using Util;
using Util.JetBrains.Annotations;
using XObjects;
/// <summary> /// <summary>
/// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>. /// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>.
@@ -46,6 +48,12 @@
/// </summary> /// </summary>
public PageSize Size { get; } public PageSize Size { get; }
/// <summary>
/// Access to members whose future locations within the API will change without warning.
/// </summary>
[NotNull]
public Experimental ExperimentalAccess { get; }
internal Page(int number, MediaBox mediaBox, CropBox cropBox, PageContent content) internal Page(int number, MediaBox mediaBox, CropBox cropBox, PageContent content)
{ {
if (number <= 0) if (number <= 0)
@@ -63,6 +71,7 @@
Height = mediaBox.Bounds.Height; Height = mediaBox.Bounds.Height;
Size = mediaBox.Bounds.GetPageSize(); Size = mediaBox.Bounds.GetPageSize();
ExperimentalAccess = new Experimental(this);
} }
private static string GetText(PageContent content) private static string GetText(PageContent content)
@@ -90,5 +99,28 @@
{ {
return (wordExtractor ?? DefaultWordExtractor.Instance).GetWords(Letters); return (wordExtractor ?? DefaultWordExtractor.Instance).GetWords(Letters);
} }
/// <summary>
/// Groups useful members whose shape and location are expected to change in future releases.
/// </summary>
public class Experimental
{
    private readonly Page owner;

    /// <summary>
    /// Creates the experimental accessor for the given page.
    /// </summary>
    /// <param name="page">The page whose content the experimental members read from.</param>
    internal Experimental(Page page)
    {
        owner = page;
    }

    /// <summary>
    /// Retrieve any images referenced in this page's content.
    /// The results are <see cref="XObjectImage"/>s, which hold the raw data
    /// from the PDF's content rather than decoded images.
    /// </summary>
    /// <returns>The image XObjects produced by the page content.</returns>
    public IEnumerable<XObjectImage> GetRawImages() => owner.Content.GetImages();
}
} }
} }

View File

@@ -4,7 +4,7 @@
using Graphics; using Graphics;
using Graphics.Operations; using Graphics.Operations;
using Tokenization.Scanner; using Tokenization.Scanner;
using XObject; using XObjects;
/// <summary> /// <summary>
/// ///
@@ -38,11 +38,11 @@
this.isLenientParsing = isLenientParsing; this.isLenientParsing = isLenientParsing;
} }
public void GetImages() public IEnumerable<XObjectImage> GetImages()
{ {
foreach (var contentRecord in xObjects[XObjectType.Image]) foreach (var contentRecord in xObjects[XObjectType.Image])
{ {
xObjectFactory.CreateImage(contentRecord, pdfScanner, isLenientParsing); yield return xObjectFactory.CreateImage(contentRecord, pdfScanner, isLenientParsing);
} }
} }
} }

View File

@@ -1,7 +1,5 @@
namespace UglyToad.PdfPig.Fonts.Type1.CharStrings.Commands.StartFinishOutline namespace UglyToad.PdfPig.Fonts.Type1.CharStrings.Commands.StartFinishOutline
{ {
using System;
/// <summary> /// <summary>
/// Standard encoding accented character. /// Standard encoding accented character.
/// Makes an accented character from two other characters in the font program. /// Makes an accented character from two other characters in the font program.
@@ -29,8 +27,8 @@
var baseCharacter = context.GetCharacter((int)baseCharacterCode); var baseCharacter = context.GetCharacter((int)baseCharacterCode);
var accentCharacter = context.GetCharacter((int) accentCharacterCode); var accentCharacter = context.GetCharacter((int) accentCharacterCode);
// TODO // TODO: full seac implementation.
throw new NotImplementedException("Not done yet..."); context.SetPath(baseCharacter);
context.Stack.Clear(); context.Stack.Clear();
} }

View File

@@ -1,10 +1,13 @@
namespace UglyToad.PdfPig.Fonts.Type1.CharStrings.Commands namespace UglyToad.PdfPig.Fonts.Type1.CharStrings.Commands
{ {
using System;
using System.Collections.Generic; using System.Collections.Generic;
using Geometry; using Geometry;
using Util.JetBrains.Annotations;
internal class Type1BuildCharContext internal class Type1BuildCharContext
{ {
private readonly Func<int, CharacterPath> characterByIndexFactory;
public IReadOnlyDictionary<int, Type1CharStrings.CommandSequence> Subroutines { get; } public IReadOnlyDictionary<int, Type1CharStrings.CommandSequence> Subroutines { get; }
public decimal WidthX { get; set; } public decimal WidthX { get; set; }
@@ -17,7 +20,8 @@
public bool IsFlexing { get; set; } public bool IsFlexing { get; set; }
public CharacterPath Path { get; } = new CharacterPath(); [NotNull]
public CharacterPath Path { get; private set; } = new CharacterPath();
public PdfPoint CurrentPosition { get; set; } public PdfPoint CurrentPosition { get; set; }
@@ -27,9 +31,11 @@
public IReadOnlyList<PdfPoint> FlexPoints { get; } public IReadOnlyList<PdfPoint> FlexPoints { get; }
public Type1BuildCharContext(IReadOnlyDictionary<int, Type1CharStrings.CommandSequence> subroutines) public Type1BuildCharContext(IReadOnlyDictionary<int, Type1CharStrings.CommandSequence> subroutines,
Func<int, CharacterPath> characterByIndexFactory)
{ {
Subroutines = subroutines; this.characterByIndexFactory = characterByIndexFactory ?? throw new ArgumentNullException(nameof(characterByIndexFactory));
Subroutines = subroutines ?? throw new ArgumentNullException(nameof(subroutines));
} }
public void AddFlexPoint(PdfPoint point) public void AddFlexPoint(PdfPoint point)
@@ -39,7 +45,12 @@
public CharacterPath GetCharacter(int characterCode) public CharacterPath GetCharacter(int characterCode)
{ {
return null; return characterByIndexFactory(characterCode);
}
public void SetPath(CharacterPath path)
{
Path = path ?? throw new ArgumentNullException(nameof(path));
} }
public void ClearFlexPoints() public void ClearFlexPoints()

View File

@@ -46,12 +46,15 @@
} }
var charStringResults = new Dictionary<string, Type1CharStrings.CommandSequence>(charStrings.Count); var charStringResults = new Dictionary<string, Type1CharStrings.CommandSequence>(charStrings.Count);
var charStringIndexToName = new Dictionary<int, string>();
foreach (var charString in charStrings) for (var i = 0; i < charStrings.Count; i++)
{ {
var charString = charStrings[i];
var commandSequence = ParseSingle(charString.Bytes); var commandSequence = ParseSingle(charString.Bytes);
charStringResults[charString.Name] = new Type1CharStrings.CommandSequence(commandSequence); charStringResults[charString.Name] = new Type1CharStrings.CommandSequence(commandSequence);
charStringIndexToName[i] = charString.Name;
} }
var subroutineResults = new Dictionary<int, Type1CharStrings.CommandSequence>(subroutines.Count); var subroutineResults = new Dictionary<int, Type1CharStrings.CommandSequence>(subroutines.Count);
@@ -63,7 +66,7 @@
subroutineResults[subroutine.Index] = new Type1CharStrings.CommandSequence(commandSequence); subroutineResults[subroutine.Index] = new Type1CharStrings.CommandSequence(commandSequence);
} }
return new Type1CharStrings(charStringResults, subroutineResults); return new Type1CharStrings(charStringResults, charStringIndexToName, subroutineResults);
} }
private static IReadOnlyList<Union<decimal, LazyType1Command>> ParseSingle(IReadOnlyList<byte> charStringBytes) private static IReadOnlyList<Union<decimal, LazyType1Command>> ParseSingle(IReadOnlyList<byte> charStringBytes)

View File

@@ -8,6 +8,7 @@
internal class Type1CharStrings internal class Type1CharStrings
{ {
private readonly IReadOnlyDictionary<int, string> charStringIndexToName;
private readonly object locker = new object(); private readonly object locker = new object();
private readonly Dictionary<string, CharacterPath> glyphs = new Dictionary<string, CharacterPath>(); private readonly Dictionary<string, CharacterPath> glyphs = new Dictionary<string, CharacterPath>();
@@ -15,8 +16,10 @@
public IReadOnlyDictionary<int, CommandSequence> Subroutines { get; } public IReadOnlyDictionary<int, CommandSequence> Subroutines { get; }
public Type1CharStrings(IReadOnlyDictionary<string, CommandSequence> charStrings, IReadOnlyDictionary<int, CommandSequence> subroutines) public Type1CharStrings(IReadOnlyDictionary<string, CommandSequence> charStrings, IReadOnlyDictionary<int, string> charStringIndexToName,
IReadOnlyDictionary<int, CommandSequence> subroutines)
{ {
this.charStringIndexToName = charStringIndexToName ?? throw new ArgumentNullException(nameof(charStringIndexToName));
CharStrings = charStrings ?? throw new ArgumentNullException(nameof(charStrings)); CharStrings = charStrings ?? throw new ArgumentNullException(nameof(charStrings));
Subroutines = subroutines ?? throw new ArgumentNullException(nameof(subroutines)); Subroutines = subroutines ?? throw new ArgumentNullException(nameof(subroutines));
} }
@@ -46,7 +49,29 @@
private CharacterPath Run(CommandSequence sequence) private CharacterPath Run(CommandSequence sequence)
{ {
var context = new Type1BuildCharContext(Subroutines); var context = new Type1BuildCharContext(Subroutines, i =>
{
if (!charStringIndexToName.TryGetValue(i, out var name))
{
throw new InvalidOperationException($"Tried to retrieve Type 1 charstring by index {i} which did not exist.");
}
if (glyphs.TryGetValue(name, out var result))
{
return result;
}
if (!CharStrings.TryGetValue(name, out var charstring))
{
throw new InvalidOperationException($"Tried to retrieve Type 1 charstring by index {i} which mapped to name {name} but was not found in the charstrings.");
}
var path = Run(charstring);
glyphs[name] = path;
return path;
});
foreach (var command in sequence.Commands) foreach (var command in sequence.Commands)
{ {
command.Match(x => context.Stack.Push(x), command.Match(x => context.Stack.Push(x),

View File

@@ -12,7 +12,7 @@
using Tokenization.Scanner; using Tokenization.Scanner;
using Tokens; using Tokens;
using Util; using Util;
using XObject; using XObjects;
internal class ContentStreamProcessor : IOperationContext internal class ContentStreamProcessor : IOperationContext
{ {

View File

@@ -4,7 +4,7 @@
using PdfPig.Core; using PdfPig.Core;
using Tokens; using Tokens;
using Util.JetBrains.Annotations; using Util.JetBrains.Annotations;
using XObject; using XObjects;
internal class XObjectContentRecord internal class XObjectContentRecord
{ {

View File

@@ -12,7 +12,7 @@
using Tokenization.Scanner; using Tokenization.Scanner;
using Tokens; using Tokens;
using Util; using Util;
using XObject; using XObjects;
internal class PageFactory : IPageFactory internal class PageFactory : IPageFactory
{ {

View File

@@ -22,7 +22,7 @@
using Tokenization.Scanner; using Tokenization.Scanner;
using Tokens; using Tokens;
using Util; using Util;
using XObject; using XObjects;
internal static class PdfDocumentFactory internal static class PdfDocumentFactory
{ {

View File

@@ -45,7 +45,7 @@
/// Access to the underlying raw structure of the document. /// Access to the underlying raw structure of the document.
/// </summary> /// </summary>
[NotNull] [NotNull]
internal Structure Structure { get; } public Structure Structure { get; }
/// <summary> /// <summary>
/// The version number of the PDF specification which this file conforms to, for example 1.4. /// The version number of the PDF specification which this file conforms to, for example 1.4.

View File

@@ -12,9 +12,9 @@
<PackageTags>PDF;Reader;Document;Adobe;PDFBox;PdfPig</PackageTags> <PackageTags>PDF;Reader;Document;Adobe;PDFBox;PdfPig</PackageTags>
<RepositoryUrl>https://github.com/UglyToad/PdfPig</RepositoryUrl> <RepositoryUrl>https://github.com/UglyToad/PdfPig</RepositoryUrl>
<GenerateDocumentationFile>true</GenerateDocumentationFile> <GenerateDocumentationFile>true</GenerateDocumentationFile>
<Version>0.0.5</Version> <Version>0.0.3</Version>
<AssemblyVersion>0.0.1.3</AssemblyVersion> <AssemblyVersion>0.0.3.0</AssemblyVersion>
<FileVersion>0.0.1.3</FileVersion> <FileVersion>0.0.3.0</FileVersion>
<PackageIconUrl>https://raw.githubusercontent.com/UglyToad/PdfPig/master/documentation/pdfpig.png</PackageIconUrl> <PackageIconUrl>https://raw.githubusercontent.com/UglyToad/PdfPig/master/documentation/pdfpig.png</PackageIconUrl>
<Product>PdfPig</Product> <Product>PdfPig</Product>
<PublishRepositoryUrl>true</PublishRepositoryUrl> <PublishRepositoryUrl>true</PublishRepositoryUrl>

View File

@@ -1,4 +1,4 @@
namespace UglyToad.PdfPig.XObject namespace UglyToad.PdfPig.XObjects
{ {
using System; using System;
using Graphics; using Graphics;
@@ -7,7 +7,7 @@
internal class XObjectFactory internal class XObjectFactory
{ {
public void CreateImage(XObjectContentRecord xObject, IPdfTokenScanner pdfScanner, bool isLenientParsing) public XObjectImage CreateImage(XObjectContentRecord xObject, IPdfTokenScanner pdfScanner, bool isLenientParsing)
{ {
if (xObject == null) if (xObject == null)
{ {
@@ -25,23 +25,13 @@
var isJpxDecode = xObject.Stream.StreamDictionary.TryGet(NameToken.Filter, out var token) var isJpxDecode = xObject.Stream.StreamDictionary.TryGet(NameToken.Filter, out var token)
&& token is NameToken filterName && token is NameToken filterName
&& filterName.Equals(NameToken.JpxDecode); && filterName.Equals(NameToken.JpxDecode);
if (isJpxDecode)
{
return;
}
var isImageMask = xObject.Stream.StreamDictionary.TryGet(NameToken.ImageMask, out var maskToken) var isImageMask = xObject.Stream.StreamDictionary.TryGet(NameToken.ImageMask, out var maskToken)
&& maskToken is BooleanToken maskBoolean && maskToken is BooleanToken maskBoolean
&& maskBoolean.Data; && maskBoolean.Data;
if (isImageMask) return new XObjectImage(width, height, isJpxDecode, isImageMask, xObject.Stream.StreamDictionary,
{ xObject.Stream.Data);
return;
}
var bitsPerComponents = xObject.Stream.StreamDictionary.Get<NumericToken>(NameToken.BitsPerComponent, pdfScanner).Int;
} }
} }
} }

View File

@@ -0,0 +1,67 @@
namespace UglyToad.PdfPig.XObjects
{
    using System;
    using System.Collections.Generic;
    using Tokens;
    using Util.JetBrains.Annotations;

    /// <summary>
    /// The raw stream from a PDF document representing an image XObject.
    /// </summary>
    public class XObjectImage
    {
        /// <summary>
        /// The width of the image in samples.
        /// </summary>
        public int Width { get; }

        /// <summary>
        /// The height of the image in samples.
        /// </summary>
        public int Height { get; }

        /// <summary>
        /// Whether the image data is encoded with the JPX filter, which uses the JPEG2000 compression method.
        /// A JPEG2000 data stream allows different versions of the image to be decoded,
        /// allowing for thumbnails to be extracted.
        /// </summary>
        public bool IsJpxEncoded { get; }

        /// <summary>
        /// Whether this image should be treated as an image mask.
        /// </summary>
        public bool IsImageMask { get; }

        /// <summary>
        /// The full dictionary for this Image XObject.
        /// </summary>
        [NotNull]
        public DictionaryToken ImageDictionary { get; }

        /// <summary>
        /// The encoded bytes of this image. They must be decoded via any
        /// filters defined in the <see cref="ImageDictionary"/> prior to consumption.
        /// </summary>
        [NotNull]
        public IReadOnlyList<byte> Bytes { get; }

        /// <summary>
        /// Creates a new <see cref="XObjectImage"/>.
        /// </summary>
        /// <param name="width">The width of the image in samples.</param>
        /// <param name="height">The height of the image in samples.</param>
        /// <param name="isJpxEncoded">Whether the image data uses the JPX filter.</param>
        /// <param name="isImageMask">Whether the image should be treated as an image mask.</param>
        /// <param name="imageDictionary">The full dictionary for the image XObject.</param>
        /// <param name="bytes">The encoded bytes of the image.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="imageDictionary"/> or <paramref name="bytes"/> is <see langword="null"/>.
        /// </exception>
        internal XObjectImage(int width, int height, bool isJpxEncoded, bool isImageMask, DictionaryToken imageDictionary, IReadOnlyList<byte> bytes)
        {
            if (imageDictionary == null)
            {
                throw new ArgumentNullException(nameof(imageDictionary));
            }

            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes));
            }

            Width = width;
            Height = height;
            IsJpxEncoded = isJpxEncoded;
            IsImageMask = isImageMask;
            ImageDictionary = imageDictionary;
            Bytes = bytes;
        }

        /// <inheritdoc />
        public override string ToString() => ImageDictionary.ToString();
    }
}

View File

@@ -1,4 +1,4 @@
namespace UglyToad.PdfPig.XObject namespace UglyToad.PdfPig.XObjects
{ {
internal enum XObjectType internal enum XObjectType
{ {