mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-14 10:55:04 +08:00
#11 early access to the raw xobjects for images.
Temporary 'safe' but untested implementation of seac for Type 1 charstrings. Make Structure public. Bump the version of the package and project to 0.0.3 (it had accidentally been increased to 0.0.5).
This commit is contained in:
@@ -17,7 +17,7 @@
|
||||
// Add the full path back on, we removed it so we could see it in the test explorer.
|
||||
documentName = Path.Combine(DocumentFolder.Value, documentName);
|
||||
|
||||
using (var document = PdfDocument.Open(documentName, new ParsingOptions{ UseLenientParsing = false}))
|
||||
using (var document = PdfDocument.Open(documentName, new ParsingOptions { UseLenientParsing = false }))
|
||||
{
|
||||
for (var i = 0; i < document.NumberOfPages; i++)
|
||||
{
|
||||
@@ -32,11 +32,11 @@
|
||||
{
|
||||
documentName = Path.Combine(DocumentFolder.Value, documentName);
|
||||
|
||||
using (var document = PdfDocument.Open(documentName, new ParsingOptions{ UseLenientParsing = false }))
|
||||
using (var document = PdfDocument.Open(documentName, new ParsingOptions { UseLenientParsing = false }))
|
||||
{
|
||||
Assert.NotNull(document.Structure.Catalog);
|
||||
|
||||
Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count > 0 , "Cross reference table was empty.");
|
||||
Assert.True(document.Structure.CrossReferenceTable.ObjectOffsets.Count > 0, "Cross reference table was empty.");
|
||||
foreach (var objectOffset in document.Structure.CrossReferenceTable.ObjectOffsets)
|
||||
{
|
||||
var token = document.Structure.GetObject(objectOffset.Key);
|
||||
@@ -46,6 +46,31 @@
|
||||
}
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[MemberData(nameof(GetAllDocuments))]
|
||||
public void CanAccessImagesOnEveryPage(string documentName)
|
||||
{
|
||||
documentName = Path.Combine(DocumentFolder.Value, documentName);
|
||||
|
||||
using (var document = PdfDocument.Open(documentName, new ParsingOptions { UseLenientParsing = false }))
|
||||
{
|
||||
for (var i = 0; i < document.NumberOfPages; i++)
|
||||
{
|
||||
var page = document.GetPage(i + 1);
|
||||
|
||||
var images = page.ExperimentalAccess.GetRawImages();
|
||||
|
||||
Assert.NotNull(images);
|
||||
|
||||
foreach (var image in images)
|
||||
{
|
||||
Assert.True(image.Width > 0, $"Image had width of zero on page {i + 1}.");
|
||||
Assert.True(image.Height > 0, $"Image had height of zero on page {i + 1}.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static IEnumerable<object[]> GetAllDocuments
|
||||
{
|
||||
get
|
||||
@@ -53,7 +78,7 @@
|
||||
var files = Directory.GetFiles(DocumentFolder.Value, "*.pdf");
|
||||
|
||||
// Return the shortname so we can see it in the test explorer.
|
||||
return files.Select(x => new object[] {Path.GetFileName(x)});
|
||||
return files.Select(x => new object[] { Path.GetFileName(x) });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,5 +1,6 @@
|
||||
namespace UglyToad.PdfPig.Tests.Integration
|
||||
{
|
||||
using System.Linq;
|
||||
using Xunit;
|
||||
|
||||
public class OldGutnishTests
|
||||
@@ -40,9 +41,9 @@
|
||||
{
|
||||
var page = document.GetPage(1);
|
||||
|
||||
page.Content.GetImages();
|
||||
var images = page.ExperimentalAccess.GetRawImages().ToList();
|
||||
Assert.Single(images);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@@ -56,7 +56,8 @@
|
||||
"UglyToad.PdfPig.Tokens.ObjectToken",
|
||||
"UglyToad.PdfPig.Tokens.StreamToken",
|
||||
"UglyToad.PdfPig.Tokens.StringToken",
|
||||
"UglyToad.PdfPig.Util.IWordExtractor"
|
||||
"UglyToad.PdfPig.Util.IWordExtractor",
|
||||
"UglyToad.PdfPig.XObjects.XObjectImage"
|
||||
};
|
||||
|
||||
foreach (var publicTypeName in publicTypeNames)
|
||||
|
@@ -4,6 +4,8 @@
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Util;
|
||||
using Util.JetBrains.Annotations;
|
||||
using XObjects;
|
||||
|
||||
/// <summary>
|
||||
/// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>.
|
||||
@@ -46,6 +48,12 @@
|
||||
/// </summary>
|
||||
public PageSize Size { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Access to members whose future locations within the API will change without warning.
|
||||
/// </summary>
|
||||
[NotNull]
|
||||
public Experimental ExperimentalAccess { get; }
|
||||
|
||||
internal Page(int number, MediaBox mediaBox, CropBox cropBox, PageContent content)
|
||||
{
|
||||
if (number <= 0)
|
||||
@@ -63,6 +71,7 @@
|
||||
Height = mediaBox.Bounds.Height;
|
||||
|
||||
Size = mediaBox.Bounds.GetPageSize();
|
||||
ExperimentalAccess = new Experimental(this);
|
||||
}
|
||||
|
||||
private static string GetText(PageContent content)
|
||||
@@ -90,5 +99,28 @@
|
||||
{
|
||||
return (wordExtractor ?? DefaultWordExtractor.Instance).GetWords(Letters);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Provides access to useful members which will change in future releases.
|
||||
/// </summary>
|
||||
public class Experimental
|
||||
{
|
||||
private readonly Page page;
|
||||
|
||||
internal Experimental(Page page)
|
||||
{
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Retrieve any images referenced in this page's content.
|
||||
/// These are returned as <see cref="XObjectImage"/>s which are
|
||||
/// raw data from the PDF's content rather than images.
|
||||
/// </summary>
|
||||
public IEnumerable<XObjectImage> GetRawImages()
|
||||
{
|
||||
return page.Content.GetImages();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -4,7 +4,7 @@
|
||||
using Graphics;
|
||||
using Graphics.Operations;
|
||||
using Tokenization.Scanner;
|
||||
using XObject;
|
||||
using XObjects;
|
||||
|
||||
/// <summary>
|
||||
///
|
||||
@@ -38,11 +38,11 @@
|
||||
this.isLenientParsing = isLenientParsing;
|
||||
}
|
||||
|
||||
public void GetImages()
|
||||
public IEnumerable<XObjectImage> GetImages()
|
||||
{
|
||||
foreach (var contentRecord in xObjects[XObjectType.Image])
|
||||
{
|
||||
xObjectFactory.CreateImage(contentRecord, pdfScanner, isLenientParsing);
|
||||
yield return xObjectFactory.CreateImage(contentRecord, pdfScanner, isLenientParsing);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,7 +1,5 @@
|
||||
namespace UglyToad.PdfPig.Fonts.Type1.CharStrings.Commands.StartFinishOutline
|
||||
{
|
||||
using System;
|
||||
|
||||
/// <summary>
|
||||
/// Standard encoding accented character.
|
||||
/// Makes an accented character from two other characters in the font program.
|
||||
@@ -29,8 +27,8 @@
|
||||
var baseCharacter = context.GetCharacter((int)baseCharacterCode);
|
||||
var accentCharacter = context.GetCharacter((int) accentCharacterCode);
|
||||
|
||||
// TODO
|
||||
throw new NotImplementedException("Not done yet...");
|
||||
// TODO: full seac implementation.
|
||||
context.SetPath(baseCharacter);
|
||||
|
||||
context.Stack.Clear();
|
||||
}
|
||||
|
@@ -1,10 +1,13 @@
|
||||
namespace UglyToad.PdfPig.Fonts.Type1.CharStrings.Commands
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Geometry;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
internal class Type1BuildCharContext
|
||||
{
|
||||
private readonly Func<int, CharacterPath> characterByIndexFactory;
|
||||
public IReadOnlyDictionary<int, Type1CharStrings.CommandSequence> Subroutines { get; }
|
||||
|
||||
public decimal WidthX { get; set; }
|
||||
@@ -17,7 +20,8 @@
|
||||
|
||||
public bool IsFlexing { get; set; }
|
||||
|
||||
public CharacterPath Path { get; } = new CharacterPath();
|
||||
[NotNull]
|
||||
public CharacterPath Path { get; private set; } = new CharacterPath();
|
||||
|
||||
public PdfPoint CurrentPosition { get; set; }
|
||||
|
||||
@@ -27,9 +31,11 @@
|
||||
|
||||
public IReadOnlyList<PdfPoint> FlexPoints { get; }
|
||||
|
||||
public Type1BuildCharContext(IReadOnlyDictionary<int, Type1CharStrings.CommandSequence> subroutines)
|
||||
public Type1BuildCharContext(IReadOnlyDictionary<int, Type1CharStrings.CommandSequence> subroutines,
|
||||
Func<int, CharacterPath> characterByIndexFactory)
|
||||
{
|
||||
Subroutines = subroutines;
|
||||
this.characterByIndexFactory = characterByIndexFactory ?? throw new ArgumentNullException(nameof(characterByIndexFactory));
|
||||
Subroutines = subroutines ?? throw new ArgumentNullException(nameof(subroutines));
|
||||
}
|
||||
|
||||
public void AddFlexPoint(PdfPoint point)
|
||||
@@ -39,7 +45,12 @@
|
||||
|
||||
public CharacterPath GetCharacter(int characterCode)
|
||||
{
|
||||
return null;
|
||||
return characterByIndexFactory(characterCode);
|
||||
}
|
||||
|
||||
public void SetPath(CharacterPath path)
|
||||
{
|
||||
Path = path ?? throw new ArgumentNullException(nameof(path));
|
||||
}
|
||||
|
||||
public void ClearFlexPoints()
|
||||
|
@@ -46,12 +46,15 @@
|
||||
}
|
||||
|
||||
var charStringResults = new Dictionary<string, Type1CharStrings.CommandSequence>(charStrings.Count);
|
||||
var charStringIndexToName = new Dictionary<int, string>();
|
||||
|
||||
foreach (var charString in charStrings)
|
||||
for (var i = 0; i < charStrings.Count; i++)
|
||||
{
|
||||
var charString = charStrings[i];
|
||||
var commandSequence = ParseSingle(charString.Bytes);
|
||||
|
||||
charStringResults[charString.Name] = new Type1CharStrings.CommandSequence(commandSequence);
|
||||
charStringIndexToName[i] = charString.Name;
|
||||
}
|
||||
|
||||
var subroutineResults = new Dictionary<int, Type1CharStrings.CommandSequence>(subroutines.Count);
|
||||
@@ -63,7 +66,7 @@
|
||||
subroutineResults[subroutine.Index] = new Type1CharStrings.CommandSequence(commandSequence);
|
||||
}
|
||||
|
||||
return new Type1CharStrings(charStringResults, subroutineResults);
|
||||
return new Type1CharStrings(charStringResults, charStringIndexToName, subroutineResults);
|
||||
}
|
||||
|
||||
private static IReadOnlyList<Union<decimal, LazyType1Command>> ParseSingle(IReadOnlyList<byte> charStringBytes)
|
||||
|
@@ -8,6 +8,7 @@
|
||||
|
||||
internal class Type1CharStrings
|
||||
{
|
||||
private readonly IReadOnlyDictionary<int, string> charStringIndexToName;
|
||||
private readonly object locker = new object();
|
||||
private readonly Dictionary<string, CharacterPath> glyphs = new Dictionary<string, CharacterPath>();
|
||||
|
||||
@@ -15,8 +16,10 @@
|
||||
|
||||
public IReadOnlyDictionary<int, CommandSequence> Subroutines { get; }
|
||||
|
||||
public Type1CharStrings(IReadOnlyDictionary<string, CommandSequence> charStrings, IReadOnlyDictionary<int, CommandSequence> subroutines)
|
||||
public Type1CharStrings(IReadOnlyDictionary<string, CommandSequence> charStrings, IReadOnlyDictionary<int, string> charStringIndexToName,
|
||||
IReadOnlyDictionary<int, CommandSequence> subroutines)
|
||||
{
|
||||
this.charStringIndexToName = charStringIndexToName ?? throw new ArgumentNullException(nameof(charStringIndexToName));
|
||||
CharStrings = charStrings ?? throw new ArgumentNullException(nameof(charStrings));
|
||||
Subroutines = subroutines ?? throw new ArgumentNullException(nameof(subroutines));
|
||||
}
|
||||
@@ -46,7 +49,29 @@
|
||||
|
||||
private CharacterPath Run(CommandSequence sequence)
|
||||
{
|
||||
var context = new Type1BuildCharContext(Subroutines);
|
||||
var context = new Type1BuildCharContext(Subroutines, i =>
|
||||
{
|
||||
if (!charStringIndexToName.TryGetValue(i, out var name))
|
||||
{
|
||||
throw new InvalidOperationException($"Tried to retrieve Type 1 charstring by index {i} which did not exist.");
|
||||
}
|
||||
|
||||
if (glyphs.TryGetValue(name, out var result))
|
||||
{
|
||||
return result;
|
||||
}
|
||||
|
||||
if (!CharStrings.TryGetValue(name, out var charstring))
|
||||
{
|
||||
throw new InvalidOperationException($"Tried to retrieve Type 1 charstring by index {i} which mapped to name {name} but was not found in the charstrings.");
|
||||
}
|
||||
|
||||
var path = Run(charstring);
|
||||
|
||||
glyphs[name] = path;
|
||||
|
||||
return path;
|
||||
});
|
||||
foreach (var command in sequence.Commands)
|
||||
{
|
||||
command.Match(x => context.Stack.Push(x),
|
||||
|
@@ -12,7 +12,7 @@
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
using Util;
|
||||
using XObject;
|
||||
using XObjects;
|
||||
|
||||
internal class ContentStreamProcessor : IOperationContext
|
||||
{
|
||||
|
@@ -4,7 +4,7 @@
|
||||
using PdfPig.Core;
|
||||
using Tokens;
|
||||
using Util.JetBrains.Annotations;
|
||||
using XObject;
|
||||
using XObjects;
|
||||
|
||||
internal class XObjectContentRecord
|
||||
{
|
||||
|
@@ -12,7 +12,7 @@
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
using Util;
|
||||
using XObject;
|
||||
using XObjects;
|
||||
|
||||
internal class PageFactory : IPageFactory
|
||||
{
|
||||
|
@@ -22,7 +22,7 @@
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
using Util;
|
||||
using XObject;
|
||||
using XObjects;
|
||||
|
||||
internal static class PdfDocumentFactory
|
||||
{
|
||||
|
@@ -45,7 +45,7 @@
|
||||
/// Access to the underlying raw structure of the document.
|
||||
/// </summary>
|
||||
[NotNull]
|
||||
internal Structure Structure { get; }
|
||||
public Structure Structure { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The version number of the PDF specification which this file conforms to, for example 1.4.
|
||||
|
@@ -12,9 +12,9 @@
|
||||
<PackageTags>PDF;Reader;Document;Adobe;PDFBox;PdfPig</PackageTags>
|
||||
<RepositoryUrl>https://github.com/UglyToad/PdfPig</RepositoryUrl>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<Version>0.0.5</Version>
|
||||
<AssemblyVersion>0.0.1.3</AssemblyVersion>
|
||||
<FileVersion>0.0.1.3</FileVersion>
|
||||
<Version>0.0.3</Version>
|
||||
<AssemblyVersion>0.0.3.0</AssemblyVersion>
|
||||
<FileVersion>0.0.3.0</FileVersion>
|
||||
<PackageIconUrl>https://raw.githubusercontent.com/UglyToad/PdfPig/master/documentation/pdfpig.png</PackageIconUrl>
|
||||
<Product>PdfPig</Product>
|
||||
<PublishRepositoryUrl>true</PublishRepositoryUrl>
|
||||
|
@@ -1,4 +1,4 @@
|
||||
namespace UglyToad.PdfPig.XObject
|
||||
namespace UglyToad.PdfPig.XObjects
|
||||
{
|
||||
using System;
|
||||
using Graphics;
|
||||
@@ -7,7 +7,7 @@
|
||||
|
||||
internal class XObjectFactory
|
||||
{
|
||||
public void CreateImage(XObjectContentRecord xObject, IPdfTokenScanner pdfScanner, bool isLenientParsing)
|
||||
public XObjectImage CreateImage(XObjectContentRecord xObject, IPdfTokenScanner pdfScanner, bool isLenientParsing)
|
||||
{
|
||||
if (xObject == null)
|
||||
{
|
||||
@@ -25,23 +25,13 @@
|
||||
var isJpxDecode = xObject.Stream.StreamDictionary.TryGet(NameToken.Filter, out var token)
|
||||
&& token is NameToken filterName
|
||||
&& filterName.Equals(NameToken.JpxDecode);
|
||||
|
||||
if (isJpxDecode)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
var isImageMask = xObject.Stream.StreamDictionary.TryGet(NameToken.ImageMask, out var maskToken)
|
||||
&& maskToken is BooleanToken maskBoolean
|
||||
&& maskBoolean.Data;
|
||||
|
||||
if (isImageMask)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
var bitsPerComponents = xObject.Stream.StreamDictionary.Get<NumericToken>(NameToken.BitsPerComponent, pdfScanner).Int;
|
||||
|
||||
return new XObjectImage(width, height, isJpxDecode, isImageMask, xObject.Stream.StreamDictionary,
|
||||
xObject.Stream.Data);
|
||||
}
|
||||
}
|
||||
}
|
67
src/UglyToad.PdfPig/XObjects/XObjectImage.cs
Normal file
67
src/UglyToad.PdfPig/XObjects/XObjectImage.cs
Normal file
@@ -0,0 +1,67 @@
|
||||
namespace UglyToad.PdfPig.XObjects
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Tokens;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
/// <summary>
|
||||
/// The raw stream from a PDF document representing an image XObject.
|
||||
/// </summary>
|
||||
public class XObjectImage
|
||||
{
|
||||
/// <summary>
|
||||
/// The width of the image in samples.
|
||||
/// </summary>
|
||||
public int Width { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The height of the image in samples.
|
||||
/// </summary>
|
||||
public int Height { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The JPX filter encodes data using the JPEG2000 compression method.
|
||||
/// A JPEG2000 data stream allows different versions of the image to be decoded
|
||||
/// allowing for thumbnails to be extracted.
|
||||
/// </summary>
|
||||
public bool IsJpxEncoded { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Whether this image should be treated as an image mask.
|
||||
/// </summary>
|
||||
public bool IsImageMask { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The full dictionary for this Image XObject.
|
||||
/// </summary>
|
||||
[NotNull]
|
||||
public DictionaryToken ImageDictionary { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The encoded bytes of this image, must be decoded via any
|
||||
/// filters defined in the <see cref="ImageDictionary"/> prior to consumption.
|
||||
/// </summary>
|
||||
[NotNull]
|
||||
public IReadOnlyList<byte> Bytes { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new <see cref="XObjectImage"/>.
|
||||
/// </summary>
|
||||
internal XObjectImage(int width, int height, bool isJpxEncoded, bool isImageMask, DictionaryToken imageDictionary, IReadOnlyList<byte> bytes)
|
||||
{
|
||||
Width = width;
|
||||
Height = height;
|
||||
IsJpxEncoded = isJpxEncoded;
|
||||
IsImageMask = isImageMask;
|
||||
ImageDictionary = imageDictionary ?? throw new ArgumentNullException(nameof(imageDictionary));
|
||||
Bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override string ToString()
|
||||
{
|
||||
return ImageDictionary.ToString();
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,4 +1,4 @@
|
||||
namespace UglyToad.PdfPig.XObject
|
||||
namespace UglyToad.PdfPig.XObjects
|
||||
{
|
||||
internal enum XObjectType
|
||||
{
|
Reference in New Issue
Block a user