Add AddPageFactory() methods and GetPage<T>() to PdfDocument. Make public IPageFactory<TPage>, PageFactoryBase<TPage>, IResourceStore, ILookupFilterProvider, IPageContentParser, IPdfTokenScanner, UserSpaceUnit, ResourceColorSpace

This commit is contained in:
BobLd 2023-07-01 13:18:54 +01:00
parent 94cc9be967
commit ae5d3627e0
21 changed files with 731 additions and 246 deletions

View File

@ -0,0 +1,176 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using System.Collections.Generic;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.Filters;
using UglyToad.PdfPig.Geometry;
using UglyToad.PdfPig.Logging;
using UglyToad.PdfPig.Outline;
using UglyToad.PdfPig.Parser;
using UglyToad.PdfPig.Tokenization.Scanner;
using UglyToad.PdfPig.Tokens;
using Xunit;
public class PageFactoryTests
{
/// <summary>
/// Registers <see cref="SimplePageFactory"/> by its <see cref="System.Type"/> and reads page 1 through it.
/// </summary>
[Fact]
public void SimpleFactory1()
{
    var path = IntegrationHelpers.GetDocumentPath("Various Content Types");
    using (var document = PdfDocument.Open(path))
    {
        document.AddPageFactory<SimplePage>(typeof(SimplePageFactory));

        // The same page is requested twice so that a repeated call through the custom factory is covered too.
        for (var attempt = 0; attempt < 2; attempt++)
        {
            var page = document.GetPage<SimplePage>(1);
            Assert.Equal(1, page.Number);
        }
    }
}
/// <summary>
/// Registers an already constructed <see cref="SimplePageFactory"/> instance and reads page 1 through it.
/// </summary>
[Fact]
public void SimpleFactory2()
{
    var path = IntegrationHelpers.GetDocumentPath("Various Content Types");
    using (var document = PdfDocument.Open(path))
    {
        document.AddPageFactory(new SimplePageFactory());

        // The same page is requested twice so that a repeated call through the custom factory is covered too.
        for (var attempt = 0; attempt < 2; attempt++)
        {
            var page = document.GetPage<SimplePage>(1);
            Assert.Equal(1, page.Number);
        }
    }
}
/// <summary>
/// Checks that <see cref="PageInformationFactory"/> reports the same number, rotation, media box and
/// crop box as the default <see cref="Page"/> for page 1.
/// </summary>
[Fact]
public void InformationFactory()
{
    var path = IntegrationHelpers.GetDocumentPath("Various Content Types");
    using (var document = PdfDocument.Open(path))
    {
        document.AddPageFactory<PageInformation>(typeof(PageInformationFactory));

        Page expected = document.GetPage(1);

        // The custom page is requested twice and compared against the default page each time.
        // UserSpaceUnit is not compared: the assertion against the default page was left disabled.
        for (var attempt = 0; attempt < 2; attempt++)
        {
            PageInformation actual = document.GetPage<PageInformation>(1);

            Assert.Equal(expected.Number, actual.Number);
            Assert.Equal(expected.Rotation, actual.Rotation);
            Assert.Equal(expected.MediaBox.Bounds, actual.MediaBox.Bounds);
            Assert.Equal(expected.CropBox.Bounds, actual.CropBox.Bounds);
        }
    }
}
#region SimplePage
/// <summary>
/// Minimal custom page type used by the tests: it only carries the page number,
/// a rotation value and the media box.
/// </summary>
public class SimplePage
{
    /// <summary>
    /// Create a <see cref="SimplePage"/> from the three values it exposes.
    /// </summary>
    public SimplePage(int number, int rotation, MediaBox mediaBox)
    {
        MediaBox = mediaBox;
        Rotation = rotation;
        Number = number;
    }

    /// <summary>
    /// The page number.
    /// </summary>
    public int Number { get; }

    /// <summary>
    /// The rotation value given to the constructor.
    /// </summary>
    public int Rotation { get; }

    /// <summary>
    /// The media box given to the constructor.
    /// </summary>
    public MediaBox MediaBox { get; }
}
/// <summary>
/// Page factory that implements <see cref="IPageFactory{TPage}"/> directly and builds a
/// <see cref="SimplePage"/> from the page number and the inherited page tree members only.
/// </summary>
public class SimplePageFactory : IPageFactory<SimplePage>
{
    /// <summary>
    /// Parameterless constructor, used when the test registers a factory instance.
    /// </summary>
    public SimplePageFactory()
    {
        // nothing to initialise
    }

    /// <summary>
    /// Constructor accepting the five parsing services. None of them is stored or used.
    /// </summary>
    public SimplePageFactory(
        IPdfTokenScanner pdfScanner,
        IResourceStore resourceStore,
        ILookupFilterProvider filterProvider,
        IPageContentParser pageContentParser,
        ILog log)
    {
    }

    /// <summary>
    /// Create the <see cref="SimplePage"/>; the page dictionary, named destinations and parsing options are ignored.
    /// </summary>
    public SimplePage Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, NamedDestinations annotationProvider, IParsingOptions parsingOptions)
        => new SimplePage(number, pageTreeMembers.Rotation, pageTreeMembers.MediaBox);
}
#endregion
#region PageInformation
/// <summary>
/// Custom page type used by the tests. It is a plain property bag filled in by
/// <see cref="PageInformationFactory"/>.
/// </summary>
public class PageInformation
{
/// <summary>
/// The page number.
/// </summary>
public int Number { get; set; }
/// <summary>
/// The page rotation.
/// </summary>
public PageRotationDegrees Rotation { get; set; }
/// <summary>
/// The page media box.
/// </summary>
public MediaBox MediaBox { get; set; }
/// <summary>
/// The page crop box.
/// </summary>
public CropBox CropBox { get; set; }
/// <summary>
/// The user space unit of the page.
/// </summary>
public UserSpaceUnit UserSpaceUnit { get; set; }
}
/// <summary>
/// Page factory built on <see cref="PageFactoryBase{TPage}"/> that returns a <see cref="PageInformation"/>
/// holding only the values the base class hands to <c>ProcessPage</c>; the page content is never parsed.
/// </summary>
public class PageInformationFactory : PageFactoryBase<PageInformation>
{
    /// <summary>
    /// Create a <see cref="PageInformationFactory"/>, forwarding every service to the base class.
    /// </summary>
    public PageInformationFactory(
        IPdfTokenScanner pdfScanner,
        IResourceStore resourceStore,
        ILookupFilterProvider filterProvider,
        IPageContentParser pageContentParser,
        ILog log)
        : base(pdfScanner, resourceStore, filterProvider, pageContentParser, log)
    {
    }

    /// <summary>
    /// Page with content: the content bytes are ignored and the call is forwarded to the overload without content.
    /// </summary>
    protected override PageInformation ProcessPage(
        int pageNumber,
        DictionaryToken dictionary,
        NamedDestinations namedDestinations,
        IReadOnlyList<byte> contentBytes,
        CropBox cropBox,
        UserSpaceUnit userSpaceUnit,
        PageRotationDegrees rotation,
        MediaBox mediaBox,
        IParsingOptions parsingOptions)
        => ProcessPage(pageNumber, dictionary, namedDestinations, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);

    /// <summary>
    /// Copy the page number, rotation, boxes and user space unit into a new <see cref="PageInformation"/>.
    /// </summary>
    protected override PageInformation ProcessPage(
        int pageNumber,
        DictionaryToken dictionary,
        NamedDestinations namedDestinations,
        CropBox cropBox,
        UserSpaceUnit userSpaceUnit,
        PageRotationDegrees rotation,
        MediaBox mediaBox,
        IParsingOptions parsingOptions)
    {
        var information = new PageInformation();
        information.Number = pageNumber;
        information.Rotation = rotation;
        information.MediaBox = mediaBox;
        information.CropBox = cropBox;
        information.UserSpaceUnit = userSpaceUnit;
        return information;
    }
}
#endregion
}
}

View File

@ -81,17 +81,21 @@
"UglyToad.PdfPig.Content.Hyperlink",
"UglyToad.PdfPig.Content.InlineImage",
"UglyToad.PdfPig.Content.IPdfImage",
"UglyToad.PdfPig.Content.IResourceStore",
"UglyToad.PdfPig.Content.Letter",
"UglyToad.PdfPig.Content.MarkedContentElement",
"UglyToad.PdfPig.Content.MediaBox",
"UglyToad.PdfPig.Content.OptionalContentGroupElement",
"UglyToad.PdfPig.Content.Page",
"UglyToad.PdfPig.Content.PageFactoryBase`1",
"UglyToad.PdfPig.Content.PageRotationDegrees",
"UglyToad.PdfPig.Content.PageSize",
"UglyToad.PdfPig.Content.PageTreeNode",
"UglyToad.PdfPig.Content.PageTreeMembers",
"UglyToad.PdfPig.Content.Word",
"UglyToad.PdfPig.Content.TextOrientation",
"UglyToad.PdfPig.Content.XmpMetadata",
"UglyToad.PdfPig.Content.IPageFactory`1",
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
@ -99,6 +103,7 @@
"UglyToad.PdfPig.Filters.DefaultFilterProvider",
"UglyToad.PdfPig.Filters.IFilter",
"UglyToad.PdfPig.Filters.IFilterProvider",
"UglyToad.PdfPig.Filters.ILookupFilterProvider",
"UglyToad.PdfPig.Functions.FunctionTypes",
"UglyToad.PdfPig.Functions.PdfFunction",
"UglyToad.PdfPig.PdfFonts.CharacterBoundingBox",
@ -109,9 +114,11 @@
"UglyToad.PdfPig.PdfFonts.FontStretch",
"UglyToad.PdfPig.PdfFonts.IFont",
"UglyToad.PdfPig.Geometry.GeometryExtensions",
"UglyToad.PdfPig.Geometry.UserSpaceUnit",
"UglyToad.PdfPig.Graphics.Colors.CMYKColor",
"UglyToad.PdfPig.Graphics.Colors.ColorSpace",
"UglyToad.PdfPig.Graphics.PdfPath",
"UglyToad.PdfPig.Graphics.Colors.ResourceColorSpace",
"UglyToad.PdfPig.Graphics.Colors.ColorSpaceExtensions",
"UglyToad.PdfPig.Graphics.Colors.ColorSpaceFamily",
"UglyToad.PdfPig.Graphics.Colors.GrayColor",
@ -230,6 +237,7 @@
"UglyToad.PdfPig.Graphics.TextMatrices",
"UglyToad.PdfPig.Graphics.XObjectContentRecord",
"UglyToad.PdfPig.Images.ColorSpaceDetailsByteConverter",
"UglyToad.PdfPig.IParsingOptions",
"UglyToad.PdfPig.Logging.ILog",
"UglyToad.PdfPig.Outline.Bookmarks",
"UglyToad.PdfPig.Outline.BookmarkNode",
@ -237,15 +245,18 @@
"UglyToad.PdfPig.Outline.EmbeddedBookmarkNode",
"UglyToad.PdfPig.Outline.ExternalBookmarkNode",
"UglyToad.PdfPig.Outline.UriBookmarkNode",
"UglyToad.PdfPig.Outline.NamedDestinations",
"UglyToad.PdfPig.Outline.Destinations.ExplicitDestination",
"UglyToad.PdfPig.Outline.Destinations.ExplicitDestinationCoordinates",
"UglyToad.PdfPig.Outline.Destinations.ExplicitDestinationType",
"UglyToad.PdfPig.ParsingOptions",
"UglyToad.PdfPig.Parser.IPageContentParser",
"UglyToad.PdfPig.PdfDocument",
"UglyToad.PdfPig.PdfExtensions",
"UglyToad.PdfPig.Rendering.IPageImageRenderer",
"UglyToad.PdfPig.Rendering.PdfRendererImageFormat",
"UglyToad.PdfPig.Structure",
"UglyToad.PdfPig.Tokenization.Scanner.IPdfTokenScanner",
"UglyToad.PdfPig.Util.Adler32Checksum",
"UglyToad.PdfPig.Util.IWordExtractor",
"UglyToad.PdfPig.Util.DefaultWordExtractor",

View File

@ -3,12 +3,19 @@
using Outline;
using Tokens;
internal interface IPageFactory
/// <summary>
/// Page factory interface.
/// </summary>
/// <typeparam name="TPage">The type of page the page factory creates.</typeparam>
public interface IPageFactory<TPage>
{
Page Create(int number,
/// <summary>
/// Create the page.
/// </summary>
TPage Create(int number,
DictionaryToken dictionary,
PageTreeMembers pageTreeMembers,
NamedDestinations annotationProvider,
InternalParsingOptions parsingOptions);
IParsingOptions parsingOptions);
}
}

View File

@ -5,9 +5,15 @@
using System.Collections.Generic;
using Tokens;
internal interface IResourceStore
/// <summary>
/// Resource store.
/// </summary>
public interface IResourceStore
{
void LoadResourceDictionary(DictionaryToken resourceDictionary, InternalParsingOptions parsingOptions);
/// <summary>
/// Load the resource dictionary.
/// </summary>
void LoadResourceDictionary(DictionaryToken resourceDictionary, IParsingOptions parsingOptions);
/// <summary>
/// Remove any named resources and associated state for the last resource dictionary loaded.
@ -15,22 +21,49 @@
/// </summary>
void UnloadResourceDictionary();
/// <summary>
/// Get the font corresponding to the name.
/// </summary>
IFont GetFont(NameToken name);
/// <summary>
/// Try get the XObject corresponding to the name.
/// </summary>
bool TryGetXObject(NameToken name, out StreamToken stream);
/// <summary>
/// Get the extended graphics state dictionary corresponding to the name.
/// </summary>
DictionaryToken GetExtendedGraphicsStateDictionary(NameToken name);
/// <summary>
/// Get the font from the <see cref="IndirectReferenceToken"/>.
/// </summary>
IFont GetFontDirectly(IndirectReferenceToken fontReferenceToken);
/// <summary>
/// Get the named color space by its name.
/// </summary>
bool TryGetNamedColorSpace(NameToken name, out ResourceColorSpace namedColorSpace);
/// <summary>
/// Get the color space details corresponding to the name.
/// </summary>
ColorSpaceDetails GetColorSpaceDetails(NameToken name, DictionaryToken dictionary);
/// <summary>
/// Get the marked content properties dictionary corresponding to the name.
/// </summary>
DictionaryToken GetMarkedContentPropertiesDictionary(NameToken name);
/// <summary>
/// Get all <see cref="PatternColor"/> as a dictionary. Keys are the <see cref="PatternColor"/> names.
/// </summary>
IReadOnlyDictionary<NameToken, PatternColor> GetPatterns();
/// <summary>
/// Get the shading corresponding to the name.
/// </summary>
Shading GetShading(NameToken name);
}
}

View File

@ -0,0 +1,264 @@
namespace UglyToad.PdfPig.Content
{
using Core;
using System;
using System.Collections.Generic;
using UglyToad.PdfPig.Filters;
using UglyToad.PdfPig.Geometry;
using UglyToad.PdfPig.Logging;
using UglyToad.PdfPig.Outline;
using UglyToad.PdfPig.Parser;
using UglyToad.PdfPig.Parser.Parts;
using UglyToad.PdfPig.Tokenization.Scanner;
using UglyToad.PdfPig.Tokens;
using UglyToad.PdfPig.Util;
/// <summary>
/// Page factory abstract class.
/// </summary>
/// <typeparam name="TPage">The type of page the page factory creates.</typeparam>
public abstract class PageFactoryBase<TPage> : IPageFactory<TPage>
{
/// <summary>
/// The Pdf token scanner.
/// </summary>
/// <remarks>Set once by the constructor. Passed to the token lookups and stream decoding performed by this class.</remarks>
public readonly IPdfTokenScanner pdfScanner;
/// <summary>
/// The resource store.
/// </summary>
/// <remarks>Set once by the constructor. The page's resource dictionaries are loaded into it before the page is processed and unloaded afterwards.</remarks>
public readonly IResourceStore resourceStore;
/// <summary>
/// The filter provider.
/// </summary>
/// <remarks>Set once by the constructor. Passed to <c>Decode</c> when the page content streams are decoded.</remarks>
public readonly ILookupFilterProvider filterProvider;
/// <summary>
/// The page content parser.
/// </summary>
/// <remarks>Set once by the constructor. This base class only stores it; it is available to derived factories.</remarks>
public readonly IPageContentParser pageContentParser;
/// <summary>
/// The <see cref="ILog"/> used to record messages raised by the parsing process.
/// </summary>
/// <remarks>Set once by the constructor. Used by this class to report malformed or missing MediaBox and CropBox entries.</remarks>
public readonly ILog log;
/// <summary>
/// Create a <see cref="PageFactoryBase{TPage}"/>, storing each service in the field of the same name.
/// </summary>
/// <param name="pdfScanner">The Pdf token scanner.</param>
/// <param name="resourceStore">The resource store.</param>
/// <param name="filterProvider">The filter provider.</param>
/// <param name="pageContentParser">The page content parser.</param>
/// <param name="log">The log used to record messages raised by the parsing process.</param>
protected PageFactoryBase(
    IPdfTokenScanner pdfScanner,
    IResourceStore resourceStore,
    ILookupFilterProvider filterProvider,
    IPageContentParser pageContentParser,
    ILog log)
{
    // No validation is performed: the values are stored exactly as given.
    this.pdfScanner = pdfScanner;
    this.resourceStore = resourceStore;
    this.filterProvider = filterProvider;
    this.pageContentParser = pageContentParser;
    this.log = log;
}
/// <inheritdoc/>
/// <remarks>
/// Reads the media box, crop box, rotation and user space unit of the page, loads the inherited and
/// page-level resource dictionaries into the resource store, decodes the content stream(s) if any and
/// hands everything to the matching <c>ProcessPage</c> overload.
/// <para>
/// Every resource dictionary loaded here is unloaded again before the method exits, including when
/// decoding or <c>ProcessPage</c> throws, so a failing page does not leave its resources in the store.
/// </para>
/// </remarks>
/// <exception cref="ArgumentNullException">The page <paramref name="dictionary"/> is <see langword="null"/>.</exception>
/// <exception cref="PdfDocumentFormatException">The contents array holds an item which is not an indirect reference.</exception>
/// <exception cref="InvalidOperationException">A content stream could not be found.</exception>
public TPage Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers,
    NamedDestinations namedDestinations, IParsingOptions parsingOptions)
{
    if (dictionary == null)
    {
        throw new ArgumentNullException(nameof(dictionary));
    }

    var type = dictionary.GetNameOrDefault(NameToken.Type);
    if (type != null && !type.Equals(NameToken.Page))
    {
        parsingOptions.Logger.Error($"Page {number} had its type specified as {type} rather than 'Page'.");
    }

    MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers);
    CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox);

    // The inherited rotation is the starting point; a Rotate entry on the page itself takes precedence.
    var rotation = new PageRotationDegrees(pageTreeMembers.Rotation);
    // TODO - check if NameToken.Rotate is already looked for in Pages.cs, we don't need to look again
    if (dictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken))
    {
        rotation = new PageRotationDegrees(rotateToken.Int);
    }

    // Number of resource dictionaries successfully loaded so far; exactly that many are unloaded in the finally block.
    var stackDepth = 0;
    try
    {
        while (pageTreeMembers.ParentResources.Count > 0)
        {
            var resource = pageTreeMembers.ParentResources.Dequeue();
            resourceStore.LoadResourceDictionary(resource, parsingOptions);
            stackDepth++;
        }

        if (dictionary.TryGet(NameToken.Resources, pdfScanner, out DictionaryToken resources))
        {
            resourceStore.LoadResourceDictionary(resources, parsingOptions);
            stackDepth++;
        }

        UserSpaceUnit userSpaceUnit = GetUserSpaceUnits(dictionary);

        TPage page;
        if (!dictionary.TryGet(NameToken.Contents, out var contents))
        {
            // No Contents entry: the page has no content stream.
            page = ProcessPage(number, dictionary, namedDestinations, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
        }
        else if (DirectObjectFinder.TryGet<ArrayToken>(contents, pdfScanner, out var array))
        {
            // Contents is an array of streams: decode each one and join them with a newline byte.
            var bytes = new List<byte>();
            for (var i = 0; i < array.Data.Count; i++)
            {
                var item = array.Data[i];
                if (!(item is IndirectReferenceToken obj))
                {
                    throw new PdfDocumentFormatException($"The contents contained something which was not an indirect reference: {item}.");
                }

                var contentStream = DirectObjectFinder.Get<StreamToken>(obj, pdfScanner);
                if (contentStream == null)
                {
                    throw new InvalidOperationException($"Could not find the contents for object {obj}.");
                }

                bytes.AddRange(contentStream.Decode(filterProvider, pdfScanner));
                if (i < array.Data.Count - 1)
                {
                    bytes.Add((byte)'\n');
                }
            }

            page = ProcessPage(number, dictionary, namedDestinations, bytes, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
        }
        else
        {
            // Contents is a single stream.
            var contentStream = DirectObjectFinder.Get<StreamToken>(contents, pdfScanner);
            if (contentStream == null)
            {
                throw new InvalidOperationException("Failed to parse the content for the page: " + number);
            }

            var bytes = contentStream.Decode(filterProvider, pdfScanner);
            page = ProcessPage(number, dictionary, namedDestinations, bytes, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
        }

        return page;
    }
    finally
    {
        // The resource store is shared between pages: always restore it, even when processing failed.
        for (var i = 0; i < stackDepth; i++)
        {
            resourceStore.UnloadResourceDictionary();
        }
    }
}
/// <summary>
/// Process a page that has content.
/// </summary>
/// <remarks>
/// Called by <c>Create</c> when the page dictionary has a Contents entry, while the resource
/// dictionaries of the page are loaded in the resource store.
/// </remarks>
/// <param name="pageNumber">The page number given to <c>Create</c>.</param>
/// <param name="dictionary">The page dictionary.</param>
/// <param name="namedDestinations">The named destinations given to <c>Create</c>.</param>
/// <param name="contentBytes">The decoded content of the page. When the Contents entry is an array,
/// the decoded streams are joined with a newline byte between them.</param>
/// <param name="cropBox">The crop box computed for the page.</param>
/// <param name="userSpaceUnit">The user space unit computed for the page.</param>
/// <param name="rotation">The page rotation: the inherited value, replaced by the page's own Rotate entry when present.</param>
/// <param name="mediaBox">The media box computed for the page.</param>
/// <param name="parsingOptions">The parsing options given to <c>Create</c>.</param>
/// <returns>The page created from the content and page properties.</returns>
protected abstract TPage ProcessPage(
int pageNumber,
DictionaryToken dictionary,
NamedDestinations namedDestinations,
IReadOnlyList<byte> contentBytes,
CropBox cropBox,
UserSpaceUnit userSpaceUnit,
PageRotationDegrees rotation,
MediaBox mediaBox,
IParsingOptions parsingOptions);
/// <summary>
/// Process a page with no content.
/// </summary>
/// <remarks>
/// Called by <c>Create</c> when the page dictionary has no Contents entry, while the resource
/// dictionaries of the page are loaded in the resource store.
/// </remarks>
/// <param name="pageNumber">The page number given to <c>Create</c>.</param>
/// <param name="dictionary">The page dictionary.</param>
/// <param name="namedDestinations">The named destinations given to <c>Create</c>.</param>
/// <param name="cropBox">The crop box computed for the page.</param>
/// <param name="userSpaceUnit">The user space unit computed for the page.</param>
/// <param name="rotation">The page rotation: the inherited value, replaced by the page's own Rotate entry when present.</param>
/// <param name="mediaBox">The media box computed for the page.</param>
/// <param name="parsingOptions">The parsing options given to <c>Create</c>.</param>
/// <returns>The page created from the page properties.</returns>
protected abstract TPage ProcessPage(
int pageNumber,
DictionaryToken dictionary,
NamedDestinations namedDestinations,
CropBox cropBox,
UserSpaceUnit userSpaceUnit,
PageRotationDegrees rotation,
MediaBox mediaBox,
IParsingOptions parsingOptions);
/// <summary>
/// Get the user space units.
/// </summary>
/// <param name="dictionary">The page dictionary.</param>
/// <returns>
/// A <see cref="UserSpaceUnit"/> built from the integer value of the UserUnit entry when that entry is
/// directly a <see cref="NumericToken"/>; otherwise <see cref="UserSpaceUnit.Default"/>.
/// </returns>
public static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary)
{
    var hasUserUnit = dictionary.TryGet(NameToken.UserUnit, out var userUnitToken);
    if (hasUserUnit && userUnitToken is NumericToken userUnit)
    {
        return new UserSpaceUnit(userUnit.Int);
    }

    // Entry absent, or present but not a number stored directly in the dictionary.
    return UserSpaceUnit.Default;
}
/// <summary>
/// Get the crop box.
/// </summary>
/// <param name="dictionary">The page dictionary.</param>
/// <param name="pageTreeMembers">The values inherited from the page tree.</param>
/// <param name="mediaBox">The media box of the page, used as the fallback.</param>
/// <returns>
/// The crop box read from the CropBox array of the page dictionary. When the array does not have
/// 4 elements an error is logged and the media box bounds are used. When there is no CropBox array,
/// the page tree value is used, or the media box bounds if there is none.
/// </returns>
public CropBox GetCropBox(DictionaryToken dictionary, PageTreeMembers pageTreeMembers, MediaBox mediaBox)
{
    if (!dictionary.TryGet(NameToken.CropBox, out var cropBoxToken) ||
        !DirectObjectFinder.TryGet(cropBoxToken, pdfScanner, out ArrayToken cropBoxValues))
    {
        // The page does not define its own crop box array.
        return pageTreeMembers.GetCropBox() ?? new CropBox(mediaBox.Bounds);
    }

    if (cropBoxValues.Length == 4)
    {
        return new CropBox(cropBoxValues.ToRectangle(pdfScanner));
    }

    log.Error($"The CropBox was the wrong length in the dictionary: {dictionary}. Array was: {cropBoxValues}. Using MediaBox.");
    return new CropBox(mediaBox.Bounds);
}
/// <summary>
/// Get the media box.
/// </summary>
/// <param name="number">The page number, only used in the log message.</param>
/// <param name="dictionary">The page dictionary.</param>
/// <param name="pageTreeMembers">The values inherited from the page tree.</param>
/// <returns>
/// The media box read from the MediaBox array of the page dictionary. When the array does not have
/// 4 elements an error is logged and <see cref="MediaBox.Letter"/> is returned. When there is no
/// MediaBox array, the page tree value is used, or <see cref="MediaBox.Letter"/> (with an error logged)
/// if there is none.
/// </returns>
public MediaBox GetMediaBox(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers)
{
    MediaBox mediaBox;
    if (dictionary.TryGet(NameToken.MediaBox, out var mediaBoxObject)
        && DirectObjectFinder.TryGet(mediaBoxObject, pdfScanner, out ArrayToken mediaBoxArray))
    {
        if (mediaBoxArray.Length != 4)
        {
            log.Error($"The MediaBox was the wrong length in the dictionary: {dictionary}. Array was: {mediaBoxArray}. Defaulting to US Letter.");
            return MediaBox.Letter;
        }

        mediaBox = new MediaBox(mediaBoxArray.ToRectangle(pdfScanner));
    }
    else
    {
        // The page does not define its own media box: fall back to the inherited one.
        mediaBox = pageTreeMembers.MediaBox;
        if (mediaBox == null)
        {
            log.Error($"The MediaBox was missing for page {number}. Using US Letter.");
            // PDFBox defaults to US Letter.
            mediaBox = MediaBox.Letter;
        }
    }

    return mediaBox;
}
}
}

View File

@ -6,17 +6,26 @@
/// <summary>
/// Contains the values inherited from the Page Tree for this page.
/// </summary>
internal class PageTreeMembers
public class PageTreeMembers
{
public CropBox GetCropBox()
internal CropBox GetCropBox()
{
return null;
}
/// <summary>
/// The page media box.
/// </summary>
public MediaBox MediaBox { get; set; }
/// <summary>
/// The page rotation.
/// </summary>
public int Rotation { get; set; }
/// <summary>
/// The page parent resources.
/// </summary>
public Queue<DictionaryToken> ParentResources { get; } = new Queue<DictionaryToken>();
}
}

View File

@ -3,14 +3,22 @@
using Core;
using Outline;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Runtime.Serialization;
using System.Runtime.Versioning;
using Tokenization.Scanner;
using Tokens;
using UglyToad.PdfPig.Parser;
using Util;
internal class Pages
{
private readonly IPageFactory pageFactory;
private readonly ConcurrentDictionary<Type, object> pageFactoryCache = new ConcurrentDictionary<Type, object>();
private readonly IPageFactory<Page> defaultPageFactory;
private readonly IPdfTokenScanner pdfScanner;
private readonly Dictionary<int, PageTreeNode> pagesByNumber;
public int Count => pagesByNumber.Count;
@ -20,15 +28,29 @@
/// </summary>
public PageTreeNode PageTree { get; }
internal Pages(IPageFactory pageFactory, IPdfTokenScanner pdfScanner, PageTreeNode pageTree, Dictionary<int, PageTreeNode> pagesByNumber)
internal Pages(IPageFactory<Page> pageFactory, IPdfTokenScanner pdfScanner, PageTreeNode pageTree, Dictionary<int, PageTreeNode> pagesByNumber)
{
this.pageFactory = pageFactory ?? throw new ArgumentNullException(nameof(pageFactory));
this.defaultPageFactory = pageFactory ?? throw new ArgumentNullException(nameof(pageFactory));
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
this.pagesByNumber = pagesByNumber;
PageTree = pageTree;
AddPageFactory(this.defaultPageFactory);
}
internal Page GetPage(int pageNumber, NamedDestinations namedDestinations, InternalParsingOptions parsingOptions)
internal Page GetPage(int pageNumber, NamedDestinations namedDestinations, InternalParsingOptions parsingOptions) => GetPage(defaultPageFactory, pageNumber, namedDestinations, parsingOptions);
/// <summary>
/// Get the page with the given number, created by the factory registered for <typeparamref name="TPage"/>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// No <see cref="IPageFactory{TPage}"/> is registered for <typeparamref name="TPage"/>.
/// </exception>
internal TPage GetPage<TPage>(int pageNumber, NamedDestinations namedDestinations, InternalParsingOptions parsingOptions)
{
    var isRegistered = pageFactoryCache.TryGetValue(typeof(TPage), out var cached);
    if (!isRegistered || !(cached is IPageFactory<TPage> factory))
    {
        throw new InvalidOperationException($"Could not find {typeof(IPageFactory<TPage>)} for page type {typeof(TPage)}.");
    }

    return GetPage(factory, pageNumber, namedDestinations, parsingOptions);
}
private TPage GetPage<TPage>(IPageFactory<TPage> pageFactory, int pageNumber, NamedDestinations namedDestinations, InternalParsingOptions parsingOptions)
{
if (pageNumber <= 0 || pageNumber > Count)
{
@ -70,14 +92,37 @@
}
}
var page = pageFactory.Create(
return pageFactory.Create(
pageNumber,
pageNode.NodeDictionary,
pageTreeMembers,
namedDestinations,
parsingOptions);
}
return page;
/// <summary>
/// Register the factory used to create pages of type <typeparamref name="TPage"/>.
/// </summary>
/// <remarks>
/// Only the first factory registered for a page type is kept: a later registration for the same
/// <typeparamref name="TPage"/> is ignored.
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="pageFactory"/> is <see langword="null"/>.</exception>
internal void AddPageFactory<TPage>(IPageFactory<TPage> pageFactory)
{
    // A null entry would be stored silently and only fail later, in GetPage, with a misleading
    // "could not find" error, so reject it at registration time.
    if (pageFactory == null)
    {
        throw new ArgumentNullException(nameof(pageFactory));
    }

    // TODO - throw if already exists
    pageFactoryCache.TryAdd(typeof(TPage), pageFactory);
}
/// <summary>
/// Create a factory of the given <paramref name="type"/> and register it for pages of type <typeparamref name="TPage"/>.
/// </summary>
/// <remarks>
/// The factory is instantiated through <see cref="Activator"/> with the token scanner, resource store,
/// filter provider, page content parser and log of the default page factory, in that order, so
/// <paramref name="type"/> needs a public constructor accepting those five arguments.
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="type"/> is <see langword="null"/>.</exception>
/// <exception cref="ArgumentException"><paramref name="type"/> does not implement <see cref="IPageFactory{TPage}"/>.</exception>
/// <exception cref="InvalidOperationException">The default page factory is not a <see cref="PageFactory"/>, so its services cannot be shared.</exception>
/// <exception cref="MissingMethodException"><paramref name="type"/> has no matching public constructor.</exception>
internal void AddPageFactory<TPage>(Type type)
{
    if (type == null)
    {
        throw new ArgumentNullException(nameof(type));
    }

    if (!typeof(IPageFactory<TPage>).IsAssignableFrom(type))
    {
        throw new ArgumentException($"The type provided does not implement {typeof(IPageFactory<TPage>)}.", nameof(type));
    }

    // The default factory registered by the constructor is the source of the shared parsing services.
    if (!(defaultPageFactory is PageFactory services))
    {
        throw new InvalidOperationException($"The default page factory is not a {typeof(PageFactory)}, cannot create {type}.");
    }

    // TODO - careful here - resourceStore is not thread safe
    var pageFactory = (IPageFactory<TPage>)Activator.CreateInstance(type,
        services.pdfScanner, services.resourceStore,
        services.filterProvider, services.pageContentParser,
        services.log);

    AddPageFactory(pageFactory);
}
internal PageTreeNode GetPageNode(int pageNumber)

View File

@ -10,7 +10,7 @@
using Tokens;
using Util;
internal class PagesFactory
internal static class PagesFactory
{
private class PageCounter
{
@ -21,7 +21,7 @@
}
}
public static Pages Create(IndirectReference pagesReference, DictionaryToken pagesDictionary, IPdfTokenScanner scanner, IPageFactory pageFactory, ILog log, bool isLenientParsing)
public static Pages Create(IndirectReference pagesReference, DictionaryToken pagesDictionary, IPdfTokenScanner scanner, IPageFactory<Page> pageFactory, ILog log, bool isLenientParsing)
{
var pageNumber = new PageCounter();

View File

@ -41,7 +41,7 @@
this.filterProvider = filterProvider;
}
public void LoadResourceDictionary(DictionaryToken resourceDictionary, InternalParsingOptions parsingOptions)
public void LoadResourceDictionary(DictionaryToken resourceDictionary, IParsingOptions parsingOptions)
{
lastLoadedFont = (null, null);
loadedNamedColorSpaceDetails.Clear();
@ -176,7 +176,7 @@
namedColorSpaces.Pop();
}
private void LoadFontDictionary(DictionaryToken fontDictionary, InternalParsingOptions parsingOptions)
private void LoadFontDictionary(DictionaryToken fontDictionary, IParsingOptions parsingOptions)
{
lastLoadedFont = (null, null);

View File

@ -25,8 +25,14 @@
IReadOnlyList<IFilter> GetAllFilters();
}
internal interface ILookupFilterProvider : IFilterProvider
/// <summary>
/// Gets filter implementations (<see cref="IFilter"/>) for decoding PDF data with lookup.
/// </summary>
public interface ILookupFilterProvider : IFilterProvider
{
/// <summary>
/// Get the filters specified in this dictionary.
/// </summary>
IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary, IPdfTokenScanner scanner);
}
}

View File

@ -7,8 +7,11 @@
/// By default user space units correspond to 1/72nd of an inch (a typographic point).
/// The UserUnit entry in a page dictionary can define the space units as a different multiple of 1/72 (1 point).
/// </summary>
internal readonly struct UserSpaceUnit
public readonly struct UserSpaceUnit
{
/// <summary>
/// Default <see cref="UserSpaceUnit"/> with <see cref="PointMultiples"/> set to 1.
/// </summary>
public static readonly UserSpaceUnit Default = new UserSpaceUnit(1);
/// <summary>
@ -29,6 +32,7 @@
PointMultiples = pointMultiples;
}
/// <inheritdoc/>
public override string ToString()
{
return PointMultiples.ToString(CultureInfo.InvariantCulture);

View File

@ -5,18 +5,24 @@
/// <summary>
/// A color space definition from a resource dictionary.
/// </summary>
internal struct ResourceColorSpace
public readonly struct ResourceColorSpace
{
/// <summary>
/// The color space name.
/// </summary>
public NameToken Name { get; }
/// <summary>
/// The color space data.
/// </summary>
public IToken Data { get; }
public ResourceColorSpace(NameToken name, IToken data)
internal ResourceColorSpace(NameToken name, IToken data)
{
Name = name;
Data = data;
}
public ResourceColorSpace(NameToken name) : this(name, null) { }
internal ResourceColorSpace(NameToken name) : this(name, null) { }
}
}

View File

@ -49,7 +49,7 @@
private readonly IPdfTokenScanner pdfScanner;
private readonly IPageContentParser pageContentParser;
private readonly ILookupFilterProvider filterProvider;
private readonly InternalParsingOptions parsingOptions;
private readonly IParsingOptions parsingOptions;
private readonly MarkedContentStack markedContentStack = new MarkedContentStack();
private Stack<CurrentGraphicsState> graphicsStack = new Stack<CurrentGraphicsState>();
@ -92,7 +92,7 @@
IPdfTokenScanner pdfScanner,
IPageContentParser pageContentParser,
ILookupFilterProvider filterProvider,
InternalParsingOptions parsingOptions)
IParsingOptions parsingOptions)
{
this.pageNumber = pageNumber;
this.resourceStore = resourceStore;

View File

@ -0,0 +1,40 @@
namespace UglyToad.PdfPig
{
using System.Collections.Generic;
using UglyToad.PdfPig.Logging;
/// <summary>
/// Parsing options interface.
/// </summary>
/// <remarks>
/// Read-only view of the options in effect while a document is parsed. It is the type handed to
/// page factories and to the resource store.
/// </remarks>
public interface IParsingOptions
{
/// <summary>
/// Should the parser apply clipping to paths?
/// Defaults to <see langword="false"/>.
/// <para>Bezier curves will be transformed into polylines if clipping is set to <see langword="true"/>.</para>
/// </summary>
bool ClipPaths { get; }
/// <summary>
/// Should the parser ignore issues where the document does not conform to the PDF specification?
/// </summary>
bool UseLenientParsing { get; }
/// <summary>
/// All passwords to try when opening this document, will include any values set for <see cref="ParsingOptions.Password"/>.
/// </summary>
/// <remarks>
/// NOTE(review): the property has no setter but returns a mutable <see cref="List{T}"/>, so callers
/// can still change its content - confirm whether a read-only collection type was intended.
/// </remarks>
List<string> Passwords { get; }
/// <summary>
/// Skip extracting content where the font could not be found, will result in some letters being skipped/missed
/// but will prevent the library throwing where the source PDF has some corrupted text. Also skips XObjects like
/// forms and images when missing.
/// </summary>
bool SkipMissingFonts { get; }
/// <summary>
/// The <see cref="ILog"/> used to record messages raised by the parsing process.
/// </summary>
ILog Logger { get; }
}
}

View File

@ -4,11 +4,11 @@
using System.Collections.Generic;
/// <summary>
/// <see cref="ParsingOptions"/> but without being a public API/
/// <see cref="ParsingOptions"/> but without being a public API.
/// </summary>
internal class InternalParsingOptions
internal class InternalParsingOptions : IParsingOptions
{
public IReadOnlyList<string> Passwords { get; }
public List<string> Passwords { get; }
public bool UseLenientParsing { get; }
@ -21,7 +21,7 @@
public ILog Logger { get; }
public InternalParsingOptions(
IReadOnlyList<string> passwords,
List<string> passwords,
bool useLenientParsing,
bool clipPaths,
bool skipMissingFonts,

View File

@ -9,7 +9,7 @@
/// <summary>
/// Named destinations in a PDF document
/// </summary>
internal class NamedDestinations
public class NamedDestinations
{
/// <summary>
/// Dictionary containing explicit destinations, keyed by name

View File

@ -1,13 +1,18 @@
namespace UglyToad.PdfPig.Parser
{
using System.Collections.Generic;
using Core;
using Graphics.Operations;
using Logging;
using System.Collections.Generic;
internal interface IPageContentParser
/// <summary>
/// Page content parser interface.
/// </summary>
public interface IPageContentParser
{
IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes,
ILog log);
/// <summary>
/// Parse the <see cref="IInputBytes"/> into <see cref="IGraphicsStateOperation"/>s.
/// </summary>
IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes, ILog log);
}
}

View File

@ -1,167 +1,39 @@
namespace UglyToad.PdfPig.Parser
{
using System;
using System.Collections.Generic;
using Annotations;
using Content;
using Core;
using Filters;
using Geometry;
using Graphics;
using Graphics.Operations;
using Logging;
using Outline;
using Parts;
using System.Collections.Generic;
using Tokenization.Scanner;
using Tokens;
using Util;
using UglyToad.PdfPig.Core;
internal class PageFactory : IPageFactory
internal class PageFactory : PageFactoryBase<Page>
{
private readonly IPdfTokenScanner pdfScanner;
private readonly IResourceStore resourceStore;
private readonly ILookupFilterProvider filterProvider;
private readonly IPageContentParser pageContentParser;
private readonly ILog log;
public PageFactory(
IPdfTokenScanner pdfScanner,
IResourceStore resourceStore,
ILookupFilterProvider filterProvider,
IPageContentParser pageContentParser,
ILog log)
{
this.resourceStore = resourceStore;
this.filterProvider = filterProvider;
this.pageContentParser = pageContentParser;
this.pdfScanner = pdfScanner;
this.log = log;
}
: base(pdfScanner, resourceStore, filterProvider, pageContentParser, log)
{ }
public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers,
NamedDestinations namedDestinations, InternalParsingOptions parsingOptions)
{
if (dictionary == null)
{
throw new ArgumentNullException(nameof(dictionary));
}
var type = dictionary.GetNameOrDefault(NameToken.Type);
if (type != null && !type.Equals(NameToken.Page))
{
parsingOptions.Logger.Error($"Page {number} had its type specified as {type} rather than 'Page'.");
}
MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers);
CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox);
var rotation = new PageRotationDegrees(pageTreeMembers.Rotation);
if (dictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken))
{
rotation = new PageRotationDegrees(rotateToken.Int);
}
var stackDepth = 0;
while (pageTreeMembers.ParentResources.Count > 0)
{
var resource = pageTreeMembers.ParentResources.Dequeue();
resourceStore.LoadResourceDictionary(resource, parsingOptions);
stackDepth++;
}
if (dictionary.TryGet(NameToken.Resources, pdfScanner, out DictionaryToken resources))
{
resourceStore.LoadResourceDictionary(resources, parsingOptions);
stackDepth++;
}
UserSpaceUnit userSpaceUnit = GetUserSpaceUnits(dictionary);
PageContent content;
if (!dictionary.TryGet(NameToken.Contents, out var contents))
{
content = new PageContent(EmptyArray<IGraphicsStateOperation>.Instance,
EmptyArray<Letter>.Instance,
EmptyArray<PdfPath>.Instance,
EmptyArray<Union<XObjectContentRecord, InlineImage>>.Instance,
EmptyArray<MarkedContentElement>.Instance,
pdfScanner,
filterProvider,
resourceStore);
// ignored for now, is it possible? check the spec...
}
else if (DirectObjectFinder.TryGet<ArrayToken>(contents, pdfScanner, out var array))
{
var bytes = new List<byte>();
for (var i = 0; i < array.Data.Count; i++)
{
var item = array.Data[i];
if (!(item is IndirectReferenceToken obj))
{
throw new PdfDocumentFormatException($"The contents contained something which was not an indirect reference: {item}.");
}
var contentStream = DirectObjectFinder.Get<StreamToken>(obj, pdfScanner);
if (contentStream == null)
{
throw new InvalidOperationException($"Could not find the contents for object {obj}.");
}
bytes.AddRange(contentStream.Decode(filterProvider, pdfScanner));
if (i < array.Data.Count - 1)
{
bytes.Add((byte)'\n');
}
}
content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
}
else
{
var contentStream = DirectObjectFinder.Get<StreamToken>(contents, pdfScanner);
if (contentStream == null)
{
throw new InvalidOperationException("Failed to parse the content for the page: " + number);
}
var bytes = contentStream.Decode(filterProvider, pdfScanner);
content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
}
var initialMatrix = ContentStreamProcessor.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, log);
var annotationProvider = new AnnotationProvider(pdfScanner, dictionary, initialMatrix, namedDestinations, log);
var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, annotationProvider, pdfScanner);
for (var i = 0; i < stackDepth; i++)
{
resourceStore.UnloadResourceDictionary();
}
return page;
}
private PageContent GetContent(
int pageNumber,
protected override Page ProcessPage(int pageNumber,
DictionaryToken dictionary,
NamedDestinations namedDestinations,
IReadOnlyList<byte> contentBytes,
CropBox cropBox,
UserSpaceUnit userSpaceUnit,
PageRotationDegrees rotation,
MediaBox mediaBox,
InternalParsingOptions parsingOptions)
IParsingOptions parsingOptions)
{
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes),
parsingOptions.Logger);
var context = new ContentStreamProcessor(
pageNumber,
resourceStore,
@ -174,82 +46,38 @@
filterProvider,
parsingOptions);
return context.Process(pageNumber, operations);
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes), parsingOptions.Logger);
var content = context.Process(pageNumber, operations);
var initialMatrix = ContentStreamProcessor.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, log);
var annotationProvider = new AnnotationProvider(pdfScanner, dictionary, initialMatrix, namedDestinations, log);
return new Page(pageNumber, dictionary, mediaBox, cropBox, rotation, content, annotationProvider, pdfScanner);
}
/// <summary>
/// Resolves the user space unit for a page from the <see cref="NameToken.UserUnit"/> entry of its dictionary.
/// </summary>
/// <param name="dictionary">The page dictionary to inspect.</param>
/// <returns>
/// A <see cref="UserSpaceUnit"/> built from the entry's <c>Int</c> value when the entry exists and is a
/// <see cref="NumericToken"/>; otherwise <see cref="UserSpaceUnit.Default"/>.
/// </returns>
private static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary)
{
    // No /UserUnit entry at all: fall back to the default unit.
    if (!dictionary.TryGet(NameToken.UserUnit, out var rawUserUnit))
    {
        return UserSpaceUnit.Default;
    }

    // Only a numeric entry is honoured; anything else is treated like a missing entry.
    if (rawUserUnit is NumericToken numericUserUnit)
    {
        return new UserSpaceUnit(numericUserUnit.Int);
    }

    return UserSpaceUnit.Default;
}
private CropBox GetCropBox(
protected override Page ProcessPage(
int pageNumber,
DictionaryToken dictionary,
PageTreeMembers pageTreeMembers,
MediaBox mediaBox)
NamedDestinations namedDestinations,
CropBox cropBox,
UserSpaceUnit userSpaceUnit,
PageRotationDegrees rotation,
MediaBox mediaBox,
IParsingOptions parsingOptions)
{
CropBox cropBox;
if (dictionary.TryGet(NameToken.CropBox, out var cropBoxObject) &&
DirectObjectFinder.TryGet(cropBoxObject, pdfScanner, out ArrayToken cropBoxArray))
{
if (cropBoxArray.Length != 4)
{
log.Error($"The CropBox was the wrong length in the dictionary: {dictionary}. Array was: {cropBoxArray}. Using MediaBox.");
var initialMatrix = ContentStreamProcessor.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, log);
var annotationProvider = new AnnotationProvider(pdfScanner, dictionary, initialMatrix, namedDestinations, log);
cropBox = new CropBox(mediaBox.Bounds);
var content = new PageContent(EmptyArray<IGraphicsStateOperation>.Instance,
EmptyArray<Letter>.Instance,
EmptyArray<PdfPath>.Instance,
EmptyArray<Union<XObjectContentRecord, InlineImage>>.Instance,
EmptyArray<MarkedContentElement>.Instance,
pdfScanner,
filterProvider,
resourceStore);
// ignored for now, is it possible? check the spec...
return cropBox;
}
cropBox = new CropBox(cropBoxArray.ToRectangle(pdfScanner));
}
else
{
cropBox = pageTreeMembers.GetCropBox() ?? new CropBox(mediaBox.Bounds);
}
return cropBox;
}
/// <summary>
/// Determines the media box for a page: the page dictionary's own MediaBox entry is preferred,
/// then the value carried by the page tree members, and finally US Letter.
/// </summary>
/// <param name="number">The page number, used only in the error log message.</param>
/// <param name="dictionary">The page dictionary that may contain a MediaBox entry.</param>
/// <param name="pageTreeMembers">Source of the fallback MediaBox when the dictionary has no usable array entry.</param>
/// <returns>The resolved media box; <see cref="MediaBox.Letter"/> when none could be determined.</returns>
private MediaBox GetMediaBox(
int number,
DictionaryToken dictionary,
PageTreeMembers pageTreeMembers)
{
MediaBox mediaBox;
// Prefer a MediaBox declared on the page dictionary, provided it resolves to an array.
if (dictionary.TryGet(NameToken.MediaBox, out var mediaBoxObject)
&& DirectObjectFinder.TryGet(mediaBoxObject, pdfScanner, out ArrayToken mediaBoxArray))
{
// An array that does not have exactly 4 entries is logged and replaced by US Letter.
if (mediaBoxArray.Length != 4)
{
log.Error($"The MediaBox was the wrong length in the dictionary: {dictionary}. Array was: {mediaBoxArray}. Defaulting to US Letter.");
mediaBox = MediaBox.Letter;
return mediaBox;
}
mediaBox = new MediaBox(mediaBoxArray.ToRectangle(pdfScanner));
}
else
{
// No usable entry on the page itself: use the value from the page tree members.
mediaBox = pageTreeMembers.MediaBox;
if (mediaBox == null)
{
log.Error($"The MediaBox was the wrong missing for page {number}. Using US Letter.");
// PDFBox defaults to US Letter.
mediaBox = MediaBox.Letter;
}
}
return mediaBox;
// NOTE(review): the statement below follows an unconditional return and yields a Page rather than a MediaBox;
// it looks like a line from a different method that was merged in here - confirm and remove.
return new Page(pageNumber, dictionary, mediaBox, cropBox, rotation, content, annotationProvider, pdfScanner);
}
}
}

View File

@ -6,7 +6,7 @@
/// <summary>
/// Configures options used by the parser when reading PDF documents.
/// </summary>
public class ParsingOptions
public class ParsingOptions : IParsingOptions
{
/// <summary>
/// A default <see cref="ParsingOptions"/> with <see cref="UseLenientParsing"/> set to false.

View File

@ -161,6 +161,57 @@
}
}
/// <summary>
/// Get the page with the specified page number (1 indexed), returned as the page type <typeparamref name="TPage"/>.
/// </summary>
/// <typeparam name="TPage">The type of page object to return.</typeparam>
/// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
/// <returns>The page.</returns>
/// <exception cref="ObjectDisposedException">The document has already been disposed.</exception>
/// <exception cref="PdfDocumentEncryptedException">
/// Retrieving the page failed and the document is encrypted; the original error is passed along with the exception.
/// </exception>
public TPage GetPage<TPage>(int pageNumber)
{
    // TODO - update log with log type
    if (isDisposed)
    {
        // ObjectDisposedException(string) interprets its argument as the object name, not the message,
        // so the (objectName, message) overload is used to surface the intended text as the message.
        throw new ObjectDisposedException(GetType().FullName, "Cannot access page after the document is disposed.");
    }

    parsingOptions.Logger.Debug($"Accessing page {pageNumber}.");

    try
    {
        return pages.GetPage<TPage>(pageNumber, namedDestinations, parsingOptions);
    }
    catch (Exception ex) when (IsEncrypted)
    {
        // Only wrap failures for encrypted documents; for other documents the original exception propagates untouched.
        throw new PdfDocumentEncryptedException("Document was encrypted which may have caused error when retrieving page.", encryptionDictionary, ex);
    }
}
/// <summary>
/// Registers a page factory instance for pages of type <typeparamref name="TPage"/> by forwarding it to the
/// document's page collection.
/// </summary>
/// <typeparam name="TPage">The type of page the factory creates.</typeparam>
/// <param name="pageFactory">The factory instance to register.</param>
public void AddPageFactory<TPage>(IPageFactory<TPage> pageFactory) => pages.AddPageFactory(pageFactory);
/// <summary>
/// Registers a page factory for pages of type <typeparamref name="TPage"/>, identified by its
/// <see cref="Type"/>, by forwarding it to the document's page collection.
/// </summary>
/// <typeparam name="TPage">The type of page the factory creates.</typeparam>
/// <param name="type">
/// The factory type to register. NOTE(review): presumably must implement <c>IPageFactory&lt;TPage&gt;</c>;
/// how it is instantiated is decided by the page collection - confirm there.
/// </param>
public void AddPageFactory<TPage>(Type type) => pages.AddPageFactory<TPage>(type);
/// <summary>
/// Gets all pages in this document in order.
/// </summary>

View File

@ -7,7 +7,7 @@
/// <summary>
/// Tokenizes objects from bytes in a PDF file.
/// </summary>
internal interface IPdfTokenScanner : ISeekableTokenScanner, IDisposable
public interface IPdfTokenScanner : ISeekableTokenScanner, IDisposable
{
/// <summary>
/// Tokenize the object with a given object number.