mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-06-28 15:30:17 +08:00
Add AddPageFactory() methods and GetPage<T>() to PdfDocument. Make public IPageFactory<TPage>, PageFactoryBase<TPage>, IResourceStore, ILookupFilterProvider, IPageContentParser, IPdfTokenScanner, UserSpaceUnit, ResourceColorSpace
This commit is contained in:
parent
94cc9be967
commit
ae5d3627e0
176
src/UglyToad.PdfPig.Tests/Integration/PageFactoryTests.cs
Normal file
176
src/UglyToad.PdfPig.Tests/Integration/PageFactoryTests.cs
Normal file
@ -0,0 +1,176 @@
|
||||
namespace UglyToad.PdfPig.Tests.Integration
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using UglyToad.PdfPig.Content;
|
||||
using UglyToad.PdfPig.Filters;
|
||||
using UglyToad.PdfPig.Geometry;
|
||||
using UglyToad.PdfPig.Logging;
|
||||
using UglyToad.PdfPig.Outline;
|
||||
using UglyToad.PdfPig.Parser;
|
||||
using UglyToad.PdfPig.Tokenization.Scanner;
|
||||
using UglyToad.PdfPig.Tokens;
|
||||
using Xunit;
|
||||
|
||||
public class PageFactoryTests
|
||||
{
|
||||
/// <summary>
/// Registers <see cref="SimplePageFactory"/> by its type and checks that page 1 can be
/// requested twice in a row as a <see cref="SimplePage"/> carrying the expected number.
/// </summary>
[Fact]
public void SimpleFactory1()
{
    var path = IntegrationHelpers.GetDocumentPath("Various Content Types");

    using (var pdf = PdfDocument.Open(path))
    {
        pdf.AddPageFactory<SimplePage>(typeof(SimplePageFactory));

        // Ask for the same page twice; both requests must report page number 1.
        for (var attempt = 0; attempt < 2; attempt++)
        {
            var simplePage = pdf.GetPage<SimplePage>(1);
            Assert.Equal(1, simplePage.Number);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Registers an already constructed <see cref="SimplePageFactory"/> instance and checks that
/// page 1 can be requested twice in a row as a <see cref="SimplePage"/> carrying the expected number.
/// </summary>
[Fact]
public void SimpleFactory2()
{
    var path = IntegrationHelpers.GetDocumentPath("Various Content Types");

    using (var pdf = PdfDocument.Open(path))
    {
        pdf.AddPageFactory(new SimplePageFactory());

        // Ask for the same page twice; both requests must report page number 1.
        for (var attempt = 0; attempt < 2; attempt++)
        {
            var simplePage = pdf.GetPage<SimplePage>(1);
            Assert.Equal(1, simplePage.Number);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Registers <see cref="PageInformationFactory"/> by its type and checks that the
/// <see cref="PageInformation"/> produced for page 1 agrees with the regular <see cref="Page"/>
/// on number, rotation, media box bounds and crop box bounds, on two consecutive requests.
/// </summary>
[Fact]
public void InformationFactory()
{
    var path = IntegrationHelpers.GetDocumentPath("Various Content Types");

    using (var pdf = PdfDocument.Open(path))
    {
        pdf.AddPageFactory<PageInformation>(typeof(PageInformationFactory));

        // Reference values come from the default page type.
        Page expected = pdf.GetPage(1);

        for (var attempt = 0; attempt < 2; attempt++)
        {
            PageInformation actual = pdf.GetPage<PageInformation>(1);

            Assert.Equal(expected.Number, actual.Number);
            Assert.Equal(expected.Rotation, actual.Rotation);
            Assert.Equal(expected.MediaBox.Bounds, actual.MediaBox.Bounds);
            Assert.Equal(expected.CropBox.Bounds, actual.CropBox.Bounds);
            // The user space unit is intentionally not compared here.
        }
    }
}
|
||||
|
||||
#region SimplePage
|
||||
/// <summary>
/// Minimal custom page type used by the page factory tests.
/// It only stores the values handed to its constructor.
/// </summary>
public class SimplePage
{
    /// <summary>
    /// Create a <see cref="SimplePage"/> from the given number, rotation and media box.
    /// </summary>
    public SimplePage(int number, int rotation, MediaBox mediaBox)
    {
        MediaBox = mediaBox;
        Rotation = rotation;
        Number = number;
    }

    /// <summary>The page number given at construction.</summary>
    public int Number { get; }

    /// <summary>The rotation value given at construction.</summary>
    public int Rotation { get; }

    /// <summary>The media box given at construction.</summary>
    public MediaBox MediaBox { get; }
}
|
||||
|
||||
/// <summary>
/// Test page factory that builds a <see cref="SimplePage"/> directly from the page number
/// and the values carried by <see cref="PageTreeMembers"/>, without reading any page content.
/// </summary>
public class SimplePageFactory : IPageFactory<SimplePage>
{
    /// <summary>
    /// Parameterless constructor, for registering a ready-made instance.
    /// </summary>
    public SimplePageFactory()
    {
    }

    /// <summary>
    /// Constructor accepting the parsing services; none of them are stored or used.
    /// </summary>
    public SimplePageFactory(
        IPdfTokenScanner pdfScanner,
        IResourceStore resourceStore,
        ILookupFilterProvider filterProvider,
        IPageContentParser pageContentParser,
        ILog log)
    {
        // Nothing to keep: a SimplePage only needs the values passed to Create.
    }

    /// <summary>
    /// Build the <see cref="SimplePage"/> from the page number and the rotation and media box
    /// found on <paramref name="pageTreeMembers"/>. The other arguments are ignored.
    /// </summary>
    public SimplePage Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, NamedDestinations annotationProvider, IParsingOptions parsingOptions)
        => new SimplePage(number, pageTreeMembers.Rotation, pageTreeMembers.MediaBox);
}
|
||||
#endregion
|
||||
|
||||
#region PageInformation
|
||||
/// <summary>
/// Custom page type used by the tests: a plain property bag describing a page,
/// filled in by <see cref="PageInformationFactory"/>.
/// </summary>
public class PageInformation
{
    /// <summary>The page number.</summary>
    public int Number { get; set; }

    /// <summary>The page rotation.</summary>
    public PageRotationDegrees Rotation { get; set; }

    /// <summary>The page media box.</summary>
    public MediaBox MediaBox { get; set; }

    /// <summary>The page crop box.</summary>
    public CropBox CropBox { get; set; }

    /// <summary>The page user space unit.</summary>
    public UserSpaceUnit UserSpaceUnit { get; set; }
}
|
||||
|
||||
/// <summary>
/// Test page factory built on <see cref="PageFactoryBase{TPage}"/> that records the page
/// number, rotation, boxes and user space unit in a <see cref="PageInformation"/>.
/// </summary>
public class PageInformationFactory : PageFactoryBase<PageInformation>
{
    /// <summary>
    /// Create the factory, forwarding every service to the base class.
    /// </summary>
    public PageInformationFactory(
        IPdfTokenScanner pdfScanner,
        IResourceStore resourceStore,
        ILookupFilterProvider filterProvider,
        IPageContentParser pageContentParser,
        ILog log)
        : base(pdfScanner, resourceStore, filterProvider, pageContentParser, log)
    {
    }

    /// <summary>
    /// Pages with content are handled exactly like pages without content:
    /// the content bytes are ignored and the call is forwarded to the other overload.
    /// </summary>
    protected override PageInformation ProcessPage(
        int pageNumber,
        DictionaryToken dictionary,
        NamedDestinations namedDestinations,
        IReadOnlyList<byte> contentBytes,
        CropBox cropBox,
        UserSpaceUnit userSpaceUnit,
        PageRotationDegrees rotation,
        MediaBox mediaBox,
        IParsingOptions parsingOptions)
        => ProcessPage(pageNumber, dictionary, namedDestinations, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);

    /// <summary>
    /// Copy the page number, rotation, media box, crop box and user space unit into a new
    /// <see cref="PageInformation"/>.
    /// </summary>
    protected override PageInformation ProcessPage(
        int pageNumber,
        DictionaryToken dictionary,
        NamedDestinations namedDestinations,
        CropBox cropBox,
        UserSpaceUnit userSpaceUnit,
        PageRotationDegrees rotation,
        MediaBox mediaBox,
        IParsingOptions parsingOptions)
    {
        var info = new PageInformation();
        info.Number = pageNumber;
        info.Rotation = rotation;
        info.MediaBox = mediaBox;
        info.CropBox = cropBox;
        info.UserSpaceUnit = userSpaceUnit;
        return info;
    }
}
|
||||
#endregion
|
||||
}
|
||||
}
|
@ -81,17 +81,21 @@
|
||||
"UglyToad.PdfPig.Content.Hyperlink",
|
||||
"UglyToad.PdfPig.Content.InlineImage",
|
||||
"UglyToad.PdfPig.Content.IPdfImage",
|
||||
"UglyToad.PdfPig.Content.IResourceStore",
|
||||
"UglyToad.PdfPig.Content.Letter",
|
||||
"UglyToad.PdfPig.Content.MarkedContentElement",
|
||||
"UglyToad.PdfPig.Content.MediaBox",
|
||||
"UglyToad.PdfPig.Content.OptionalContentGroupElement",
|
||||
"UglyToad.PdfPig.Content.Page",
|
||||
"UglyToad.PdfPig.Content.PageFactoryBase`1",
|
||||
"UglyToad.PdfPig.Content.PageRotationDegrees",
|
||||
"UglyToad.PdfPig.Content.PageSize",
|
||||
"UglyToad.PdfPig.Content.PageTreeNode",
|
||||
"UglyToad.PdfPig.Content.PageTreeMembers",
|
||||
"UglyToad.PdfPig.Content.Word",
|
||||
"UglyToad.PdfPig.Content.TextOrientation",
|
||||
"UglyToad.PdfPig.Content.XmpMetadata",
|
||||
"UglyToad.PdfPig.Content.IPageFactory`1",
|
||||
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
|
||||
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
|
||||
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
|
||||
@ -99,6 +103,7 @@
|
||||
"UglyToad.PdfPig.Filters.DefaultFilterProvider",
|
||||
"UglyToad.PdfPig.Filters.IFilter",
|
||||
"UglyToad.PdfPig.Filters.IFilterProvider",
|
||||
"UglyToad.PdfPig.Filters.ILookupFilterProvider",
|
||||
"UglyToad.PdfPig.Functions.FunctionTypes",
|
||||
"UglyToad.PdfPig.Functions.PdfFunction",
|
||||
"UglyToad.PdfPig.PdfFonts.CharacterBoundingBox",
|
||||
@ -109,9 +114,11 @@
|
||||
"UglyToad.PdfPig.PdfFonts.FontStretch",
|
||||
"UglyToad.PdfPig.PdfFonts.IFont",
|
||||
"UglyToad.PdfPig.Geometry.GeometryExtensions",
|
||||
"UglyToad.PdfPig.Geometry.UserSpaceUnit",
|
||||
"UglyToad.PdfPig.Graphics.Colors.CMYKColor",
|
||||
"UglyToad.PdfPig.Graphics.Colors.ColorSpace",
|
||||
"UglyToad.PdfPig.Graphics.PdfPath",
|
||||
"UglyToad.PdfPig.Graphics.Colors.ResourceColorSpace",
|
||||
"UglyToad.PdfPig.Graphics.Colors.ColorSpaceExtensions",
|
||||
"UglyToad.PdfPig.Graphics.Colors.ColorSpaceFamily",
|
||||
"UglyToad.PdfPig.Graphics.Colors.GrayColor",
|
||||
@ -230,6 +237,7 @@
|
||||
"UglyToad.PdfPig.Graphics.TextMatrices",
|
||||
"UglyToad.PdfPig.Graphics.XObjectContentRecord",
|
||||
"UglyToad.PdfPig.Images.ColorSpaceDetailsByteConverter",
|
||||
"UglyToad.PdfPig.IParsingOptions",
|
||||
"UglyToad.PdfPig.Logging.ILog",
|
||||
"UglyToad.PdfPig.Outline.Bookmarks",
|
||||
"UglyToad.PdfPig.Outline.BookmarkNode",
|
||||
@ -237,15 +245,18 @@
|
||||
"UglyToad.PdfPig.Outline.EmbeddedBookmarkNode",
|
||||
"UglyToad.PdfPig.Outline.ExternalBookmarkNode",
|
||||
"UglyToad.PdfPig.Outline.UriBookmarkNode",
|
||||
"UglyToad.PdfPig.Outline.NamedDestinations",
|
||||
"UglyToad.PdfPig.Outline.Destinations.ExplicitDestination",
|
||||
"UglyToad.PdfPig.Outline.Destinations.ExplicitDestinationCoordinates",
|
||||
"UglyToad.PdfPig.Outline.Destinations.ExplicitDestinationType",
|
||||
"UglyToad.PdfPig.ParsingOptions",
|
||||
"UglyToad.PdfPig.Parser.IPageContentParser",
|
||||
"UglyToad.PdfPig.PdfDocument",
|
||||
"UglyToad.PdfPig.PdfExtensions",
|
||||
"UglyToad.PdfPig.Rendering.IPageImageRenderer",
|
||||
"UglyToad.PdfPig.Rendering.PdfRendererImageFormat",
|
||||
"UglyToad.PdfPig.Structure",
|
||||
"UglyToad.PdfPig.Tokenization.Scanner.IPdfTokenScanner",
|
||||
"UglyToad.PdfPig.Util.Adler32Checksum",
|
||||
"UglyToad.PdfPig.Util.IWordExtractor",
|
||||
"UglyToad.PdfPig.Util.DefaultWordExtractor",
|
||||
|
@ -3,12 +3,19 @@
|
||||
using Outline;
|
||||
using Tokens;
|
||||
|
||||
internal interface IPageFactory
|
||||
/// <summary>
|
||||
/// Page factory interface.
|
||||
/// </summary>
|
||||
/// <typeparam name="TPage">The type of page the page factory creates.</typeparam>
|
||||
public interface IPageFactory<TPage>
|
||||
{
|
||||
Page Create(int number,
|
||||
/// <summary>
|
||||
/// Create the page.
|
||||
/// </summary>
|
||||
TPage Create(int number,
|
||||
DictionaryToken dictionary,
|
||||
PageTreeMembers pageTreeMembers,
|
||||
NamedDestinations annotationProvider,
|
||||
InternalParsingOptions parsingOptions);
|
||||
IParsingOptions parsingOptions);
|
||||
}
|
||||
}
|
@ -5,9 +5,15 @@
|
||||
using System.Collections.Generic;
|
||||
using Tokens;
|
||||
|
||||
internal interface IResourceStore
|
||||
/// <summary>
|
||||
/// Resource store.
|
||||
/// </summary>
|
||||
public interface IResourceStore
|
||||
{
|
||||
void LoadResourceDictionary(DictionaryToken resourceDictionary, InternalParsingOptions parsingOptions);
|
||||
/// <summary>
|
||||
/// Load the resource dictionary.
|
||||
/// </summary>
|
||||
void LoadResourceDictionary(DictionaryToken resourceDictionary, IParsingOptions parsingOptions);
|
||||
|
||||
/// <summary>
|
||||
/// Remove any named resources and associated state for the last resource dictionary loaded.
|
||||
@ -15,22 +21,49 @@
|
||||
/// </summary>
|
||||
void UnloadResourceDictionary();
|
||||
|
||||
/// <summary>
|
||||
/// Get the font corresponding to the name.
|
||||
/// </summary>
|
||||
IFont GetFont(NameToken name);
|
||||
|
||||
/// <summary>
|
||||
/// Try get the XObject corresponding to the name.
|
||||
/// </summary>
|
||||
bool TryGetXObject(NameToken name, out StreamToken stream);
|
||||
|
||||
/// <summary>
|
||||
/// Get the extended graphics state dictionary corresponding to the name.
|
||||
/// </summary>
|
||||
DictionaryToken GetExtendedGraphicsStateDictionary(NameToken name);
|
||||
|
||||
/// <summary>
|
||||
/// Get the font from the <see cref="IndirectReferenceToken"/>.
|
||||
/// </summary>
|
||||
IFont GetFontDirectly(IndirectReferenceToken fontReferenceToken);
|
||||
|
||||
/// <summary>
|
||||
/// Get the named color space by its name.
|
||||
/// </summary>
|
||||
bool TryGetNamedColorSpace(NameToken name, out ResourceColorSpace namedColorSpace);
|
||||
|
||||
/// <summary>
|
||||
/// Get the color space details corresponding to the name.
|
||||
/// </summary>
|
||||
ColorSpaceDetails GetColorSpaceDetails(NameToken name, DictionaryToken dictionary);
|
||||
|
||||
/// <summary>
|
||||
/// Get the marked content properties dictionary corresponding to the name.
|
||||
/// </summary>
|
||||
DictionaryToken GetMarkedContentPropertiesDictionary(NameToken name);
|
||||
|
||||
/// <summary>
|
||||
/// Get all <see cref="PatternColor"/> as a dictionary. Keys are the <see cref="PatternColor"/> names.
|
||||
/// </summary>
|
||||
IReadOnlyDictionary<NameToken, PatternColor> GetPatterns();
|
||||
|
||||
/// <summary>
|
||||
/// Get the shading corresponding to the name.
|
||||
/// </summary>
|
||||
Shading GetShading(NameToken name);
|
||||
}
|
||||
}
|
264
src/UglyToad.PdfPig/Content/PageFactoryBase.cs
Normal file
264
src/UglyToad.PdfPig/Content/PageFactoryBase.cs
Normal file
@ -0,0 +1,264 @@
|
||||
namespace UglyToad.PdfPig.Content
|
||||
{
|
||||
using Core;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using UglyToad.PdfPig.Filters;
|
||||
using UglyToad.PdfPig.Geometry;
|
||||
using UglyToad.PdfPig.Logging;
|
||||
using UglyToad.PdfPig.Outline;
|
||||
using UglyToad.PdfPig.Parser;
|
||||
using UglyToad.PdfPig.Parser.Parts;
|
||||
using UglyToad.PdfPig.Tokenization.Scanner;
|
||||
using UglyToad.PdfPig.Tokens;
|
||||
using UglyToad.PdfPig.Util;
|
||||
|
||||
/// <summary>
|
||||
/// Page factory abstract class.
|
||||
/// </summary>
|
||||
/// <typeparam name="TPage">The type of page the page factory creates.</typeparam>
|
||||
public abstract class PageFactoryBase<TPage> : IPageFactory<TPage>
|
||||
{
|
||||
/// <summary>
|
||||
/// The Pdf token scanner.
|
||||
/// </summary>
|
||||
public readonly IPdfTokenScanner pdfScanner;
|
||||
|
||||
/// <summary>
|
||||
/// The resource store.
|
||||
/// </summary>
|
||||
public readonly IResourceStore resourceStore;
|
||||
|
||||
/// <summary>
|
||||
/// The filter provider.
|
||||
/// </summary>
|
||||
public readonly ILookupFilterProvider filterProvider;
|
||||
|
||||
/// <summary>
|
||||
/// The page content parser.
|
||||
/// </summary>
|
||||
public readonly IPageContentParser pageContentParser;
|
||||
|
||||
/// <summary>
|
||||
/// The <see cref="ILog"/> used to record messages raised by the parsing process.
|
||||
/// </summary>
|
||||
public readonly ILog log;
|
||||
|
||||
/// <summary>
/// Create a <see cref="PageFactoryBase{TPage}"/>, storing the services used while creating pages.
/// </summary>
/// <param name="pdfScanner">The Pdf token scanner.</param>
/// <param name="resourceStore">The resource store.</param>
/// <param name="filterProvider">The filter provider.</param>
/// <param name="pageContentParser">The page content parser.</param>
/// <param name="log">The <see cref="ILog"/> used to record messages raised by the parsing process.</param>
protected PageFactoryBase(
    IPdfTokenScanner pdfScanner,
    IResourceStore resourceStore,
    ILookupFilterProvider filterProvider,
    IPageContentParser pageContentParser,
    ILog log)
{
    // Assignments follow the parameter order; none of them depends on another.
    this.pdfScanner = pdfScanner;
    this.resourceStore = resourceStore;
    this.filterProvider = filterProvider;
    this.pageContentParser = pageContentParser;
    this.log = log;
}
|
||||
|
||||
/// <inheritdoc/>
public TPage Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers,
    NamedDestinations namedDestinations, IParsingOptions parsingOptions)
{
    // Overview: resolve the media box, crop box, rotation and user space unit, load the inherited and
    // page-level resource dictionaries, decode the content stream(s) and delegate to ProcessPage.
    // Every resource dictionary loaded here is unloaded again before leaving, even when an exception
    // is thrown, so the resource store is not left holding this page's resources.
    if (dictionary == null)
    {
        throw new ArgumentNullException(nameof(dictionary));
    }

    var type = dictionary.GetNameOrDefault(NameToken.Type);

    // A wrong type is only reported; the dictionary is still processed as a page.
    if (type != null && !type.Equals(NameToken.Page))
    {
        parsingOptions.Logger.Error($"Page {number} had its type specified as {type} rather than 'Page'.");
    }

    MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers);
    CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox);

    // Start from the rotation carried by the page tree members; a Rotate entry on the page dictionary overrides it.
    var rotation = new PageRotationDegrees(pageTreeMembers.Rotation);
    // TODO - check if NameToken.Rotate is already looked for in Pages.cs, we don't need to look again
    if (dictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken))
    {
        rotation = new PageRotationDegrees(rotateToken.Int);
    }

    // Number of resource dictionaries this call has pushed onto the resource store.
    var stackDepth = 0;

    try
    {
        // Parent resources are consumed from the queue in order, then the page's own resources are loaded last.
        while (pageTreeMembers.ParentResources.Count > 0)
        {
            var resource = pageTreeMembers.ParentResources.Dequeue();

            resourceStore.LoadResourceDictionary(resource, parsingOptions);
            stackDepth++;
        }

        if (dictionary.TryGet(NameToken.Resources, pdfScanner, out DictionaryToken resources))
        {
            resourceStore.LoadResourceDictionary(resources, parsingOptions);
            stackDepth++;
        }

        UserSpaceUnit userSpaceUnit = GetUserSpaceUnits(dictionary);

        TPage page;

        if (!dictionary.TryGet(NameToken.Contents, out var contents))
        {
            // No Contents entry: use the overload for pages without content.
            page = ProcessPage(number, dictionary, namedDestinations, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
        }
        else if (DirectObjectFinder.TryGet<ArrayToken>(contents, pdfScanner, out var array))
        {
            // Contents is an array of stream references: decode each and concatenate the results.
            var bytes = new List<byte>();

            for (var i = 0; i < array.Data.Count; i++)
            {
                var item = array.Data[i];

                if (!(item is IndirectReferenceToken obj))
                {
                    throw new PdfDocumentFormatException($"The contents contained something which was not an indirect reference: {item}.");
                }

                var contentStream = DirectObjectFinder.Get<StreamToken>(obj, pdfScanner);

                if (contentStream == null)
                {
                    throw new InvalidOperationException($"Could not find the contents for object {obj}.");
                }

                bytes.AddRange(contentStream.Decode(filterProvider, pdfScanner));

                // Separate consecutive streams with a line feed; nothing is appended after the last one.
                if (i < array.Data.Count - 1)
                {
                    bytes.Add((byte)'\n');
                }
            }

            page = ProcessPage(number, dictionary, namedDestinations, bytes, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
        }
        else
        {
            // Contents is a single stream.
            var contentStream = DirectObjectFinder.Get<StreamToken>(contents, pdfScanner);

            if (contentStream == null)
            {
                throw new InvalidOperationException("Failed to parse the content for the page: " + number);
            }

            var bytes = contentStream.Decode(filterProvider, pdfScanner);

            page = ProcessPage(number, dictionary, namedDestinations, bytes, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
        }

        return page;
    }
    finally
    {
        // Undo exactly as many loads as succeeded, whether or not processing completed.
        for (var i = 0; i < stackDepth; i++)
        {
            resourceStore.UnloadResourceDictionary();
        }
    }
}
|
||||
|
||||
/// <summary>
/// Process a page that has content.
/// </summary>
/// <param name="pageNumber">The number of the page being created.</param>
/// <param name="dictionary">The page dictionary.</param>
/// <param name="namedDestinations">The named destinations supplied when the page was requested.</param>
/// <param name="contentBytes">
/// The decoded bytes of the page content. When the page's contents entry is an array of streams,
/// the decoded streams are concatenated with a single line feed between consecutive streams.
/// </param>
/// <param name="cropBox">The crop box resolved for the page.</param>
/// <param name="userSpaceUnit">The user space unit resolved for the page.</param>
/// <param name="rotation">The rotation resolved for the page.</param>
/// <param name="mediaBox">The media box resolved for the page.</param>
/// <param name="parsingOptions">The parsing options supplied when the page was requested.</param>
/// <returns>The page built by the concrete factory.</returns>
protected abstract TPage ProcessPage(
    int pageNumber,
    DictionaryToken dictionary,
    NamedDestinations namedDestinations,
    IReadOnlyList<byte> contentBytes,
    CropBox cropBox,
    UserSpaceUnit userSpaceUnit,
    PageRotationDegrees rotation,
    MediaBox mediaBox,
    IParsingOptions parsingOptions);
|
||||
|
||||
/// <summary>
/// Process a page with no content, i.e. a page whose dictionary has no contents entry.
/// </summary>
/// <param name="pageNumber">The number of the page being created.</param>
/// <param name="dictionary">The page dictionary.</param>
/// <param name="namedDestinations">The named destinations supplied when the page was requested.</param>
/// <param name="cropBox">The crop box resolved for the page.</param>
/// <param name="userSpaceUnit">The user space unit resolved for the page.</param>
/// <param name="rotation">The rotation resolved for the page.</param>
/// <param name="mediaBox">The media box resolved for the page.</param>
/// <param name="parsingOptions">The parsing options supplied when the page was requested.</param>
/// <returns>The page built by the concrete factory.</returns>
protected abstract TPage ProcessPage(int pageNumber, DictionaryToken dictionary, NamedDestinations namedDestinations,
    CropBox cropBox, UserSpaceUnit userSpaceUnit, PageRotationDegrees rotation, MediaBox mediaBox,
    IParsingOptions parsingOptions);
|
||||
|
||||
/// <summary>
/// Get the user space units.
/// </summary>
/// <param name="dictionary">The page dictionary to read the user unit entry from.</param>
/// <returns>
/// A <see cref="UserSpaceUnit"/> built from the integer value of the user unit entry when that entry
/// is present and is a numeric token; otherwise <see cref="UserSpaceUnit.Default"/>.
/// </returns>
public static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary)
{
    if (dictionary.TryGet(NameToken.UserUnit, out var rawUserUnit))
    {
        // Only a directly stored numeric token is honoured; anything else falls through to the default.
        if (rawUserUnit is NumericToken numericUserUnit)
        {
            return new UserSpaceUnit(numericUserUnit.Int);
        }
    }

    return UserSpaceUnit.Default;
}
|
||||
|
||||
/// <summary>
/// Get the crop box.
/// </summary>
/// <param name="dictionary">The page dictionary.</param>
/// <param name="pageTreeMembers">The values inherited from the page tree.</param>
/// <param name="mediaBox">The media box, used as the fallback.</param>
/// <returns>
/// The crop box from the page dictionary when it is an array of four items; a crop box with the
/// media box bounds when the array has the wrong length; otherwise the crop box from
/// <paramref name="pageTreeMembers"/>, or the media box bounds if that is <see langword="null"/>.
/// </returns>
public CropBox GetCropBox(DictionaryToken dictionary, PageTreeMembers pageTreeMembers, MediaBox mediaBox)
{
    // No usable crop box array on the page itself: fall back to the page tree, then to the media box.
    if (!dictionary.TryGet(NameToken.CropBox, out var cropBoxObject) ||
        !DirectObjectFinder.TryGet(cropBoxObject, pdfScanner, out ArrayToken cropBoxArray))
    {
        return pageTreeMembers.GetCropBox() ?? new CropBox(mediaBox.Bounds);
    }

    if (cropBoxArray.Length == 4)
    {
        return new CropBox(cropBoxArray.ToRectangle(pdfScanner));
    }

    log.Error($"The CropBox was the wrong length in the dictionary: {dictionary}. Array was: {cropBoxArray}. Using MediaBox.");

    return new CropBox(mediaBox.Bounds);
}
|
||||
|
||||
/// <summary>
/// Get the media box.
/// </summary>
/// <param name="number">The page number, used in log messages.</param>
/// <param name="dictionary">The page dictionary.</param>
/// <param name="pageTreeMembers">The values inherited from the page tree.</param>
/// <returns>
/// The media box from the page dictionary when it is an array of four items; otherwise the media box
/// from <paramref name="pageTreeMembers"/>. US Letter is returned, and an error logged, when the array
/// has the wrong length or when no media box is available at all.
/// </returns>
public MediaBox GetMediaBox(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers)
{
    MediaBox mediaBox;

    if (dictionary.TryGet(NameToken.MediaBox, out var mediaBoxObject)
        && DirectObjectFinder.TryGet(mediaBoxObject, pdfScanner, out ArrayToken mediaBoxArray))
    {
        if (mediaBoxArray.Length != 4)
        {
            log.Error($"The MediaBox was the wrong length in the dictionary: {dictionary}. Array was: {mediaBoxArray}. Defaulting to US Letter.");

            return MediaBox.Letter;
        }

        mediaBox = new MediaBox(mediaBoxArray.ToRectangle(pdfScanner));
    }
    else
    {
        // Nothing usable on the page itself: use the value inherited from the page tree.
        mediaBox = pageTreeMembers.MediaBox;

        if (mediaBox == null)
        {
            log.Error($"The MediaBox was missing for page {number}. Using US Letter.");

            // PDFBox defaults to US Letter.
            mediaBox = MediaBox.Letter;
        }
    }

    return mediaBox;
}
|
||||
}
|
||||
}
|
@ -6,17 +6,26 @@
|
||||
/// <summary>
|
||||
/// Contains the values inherited from the Page Tree for this page.
|
||||
/// </summary>
|
||||
internal class PageTreeMembers
|
||||
public class PageTreeMembers
|
||||
{
|
||||
public CropBox GetCropBox()
|
||||
internal CropBox GetCropBox()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The page media box.
|
||||
/// </summary>
|
||||
public MediaBox MediaBox { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// The page rotation.
|
||||
/// </summary>
|
||||
public int Rotation { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// The page parent resources.
|
||||
/// </summary>
|
||||
public Queue<DictionaryToken> ParentResources { get; } = new Queue<DictionaryToken>();
|
||||
}
|
||||
}
|
@ -3,14 +3,22 @@
|
||||
using Core;
|
||||
using Outline;
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Data;
|
||||
using System.Linq;
|
||||
using System.Runtime.Serialization;
|
||||
using System.Runtime.Versioning;
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
using UglyToad.PdfPig.Parser;
|
||||
using Util;
|
||||
|
||||
internal class Pages
|
||||
{
|
||||
private readonly IPageFactory pageFactory;
|
||||
private readonly ConcurrentDictionary<Type, object> pageFactoryCache = new ConcurrentDictionary<Type, object>();
|
||||
|
||||
private readonly IPageFactory<Page> defaultPageFactory;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
private readonly Dictionary<int, PageTreeNode> pagesByNumber;
|
||||
public int Count => pagesByNumber.Count;
|
||||
@ -20,21 +28,35 @@
|
||||
/// </summary>
|
||||
public PageTreeNode PageTree { get; }
|
||||
|
||||
internal Pages(IPageFactory pageFactory, IPdfTokenScanner pdfScanner, PageTreeNode pageTree, Dictionary<int, PageTreeNode> pagesByNumber)
|
||||
internal Pages(IPageFactory<Page> pageFactory, IPdfTokenScanner pdfScanner, PageTreeNode pageTree, Dictionary<int, PageTreeNode> pagesByNumber)
|
||||
{
|
||||
this.pageFactory = pageFactory ?? throw new ArgumentNullException(nameof(pageFactory));
|
||||
this.defaultPageFactory = pageFactory ?? throw new ArgumentNullException(nameof(pageFactory));
|
||||
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
|
||||
this.pagesByNumber = pagesByNumber;
|
||||
PageTree = pageTree;
|
||||
|
||||
AddPageFactory(this.defaultPageFactory);
|
||||
}
|
||||
|
||||
internal Page GetPage(int pageNumber, NamedDestinations namedDestinations, InternalParsingOptions parsingOptions)
|
||||
internal Page GetPage(int pageNumber, NamedDestinations namedDestinations, InternalParsingOptions parsingOptions) => GetPage(defaultPageFactory, pageNumber, namedDestinations, parsingOptions);
|
||||
|
||||
/// <summary>
/// Get the page with the given number as a <typeparamref name="TPage"/>, using the page factory
/// registered for that page type.
/// </summary>
/// <exception cref="InvalidOperationException">
/// No <see cref="IPageFactory{TPage}"/> is registered for <typeparamref name="TPage"/>.
/// </exception>
internal TPage GetPage<TPage>(int pageNumber, NamedDestinations namedDestinations, InternalParsingOptions parsingOptions)
{
    // The cache stores factories as object, so the entry must also be of the expected generic type.
    if (!pageFactoryCache.TryGetValue(typeof(TPage), out var registered) ||
        !(registered is IPageFactory<TPage> typedFactory))
    {
        throw new InvalidOperationException($"Could not find {typeof(IPageFactory<TPage>)} for page type {typeof(TPage)}.");
    }

    return GetPage(typedFactory, pageNumber, namedDestinations, parsingOptions);
}
|
||||
|
||||
private TPage GetPage<TPage>(IPageFactory<TPage> pageFactory, int pageNumber, NamedDestinations namedDestinations, InternalParsingOptions parsingOptions)
|
||||
{
|
||||
if (pageNumber <= 0 || pageNumber > Count)
|
||||
{
|
||||
parsingOptions.Logger.Error($"Page {pageNumber} requested but is out of range.");
|
||||
|
||||
throw new ArgumentOutOfRangeException(nameof(pageNumber),
|
||||
throw new ArgumentOutOfRangeException(nameof(pageNumber),
|
||||
$"Page number {pageNumber} invalid, must be between 1 and {Count}.");
|
||||
}
|
||||
|
||||
@ -49,7 +71,7 @@
|
||||
}
|
||||
|
||||
var pageTreeMembers = new PageTreeMembers();
|
||||
|
||||
|
||||
while (pageStack.Count > 0)
|
||||
{
|
||||
currentNode = pageStack.Pop();
|
||||
@ -58,7 +80,7 @@
|
||||
{
|
||||
pageTreeMembers.ParentResources.Enqueue(resourcesDictionary);
|
||||
}
|
||||
|
||||
|
||||
if (currentNode.NodeDictionary.TryGet(NameToken.MediaBox, pdfScanner, out ArrayToken mediaBox))
|
||||
{
|
||||
pageTreeMembers.MediaBox = new MediaBox(mediaBox.ToRectangle(pdfScanner));
|
||||
@ -70,14 +92,37 @@
|
||||
}
|
||||
}
|
||||
|
||||
var page = pageFactory.Create(
|
||||
return pageFactory.Create(
|
||||
pageNumber,
|
||||
pageNode.NodeDictionary,
|
||||
pageTreeMembers,
|
||||
namedDestinations,
|
||||
parsingOptions);
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
/// <summary>
/// Register <paramref name="pageFactory"/> as the factory for pages of type <typeparamref name="TPage"/>.
/// If a factory is already registered for that type, the existing one is kept and this call has no effect.
/// </summary>
internal void AddPageFactory<TPage>(IPageFactory<TPage> pageFactory)
{
    var pageType = typeof(TPage);

    // TODO - throw if already exists
    pageFactoryCache.TryAdd(pageType, pageFactory);
}
|
||||
|
||||
/// <summary>
/// Create an instance of <paramref name="type"/> and register it as the factory for pages of type
/// <typeparamref name="TPage"/>. The instance is constructed with the scanner, resource store,
/// filter provider, page content parser and log of the factory registered for <see cref="Page"/>.
/// </summary>
/// <exception cref="ArgumentException">
/// <paramref name="type"/> does not implement <see cref="IPageFactory{TPage}"/>.
/// </exception>
internal void AddPageFactory<TPage>(Type type)
{
    var requiredInterface = typeof(IPageFactory<TPage>);

    if (!requiredInterface.IsAssignableFrom(type))
    {
        throw new ArgumentException($"The type provided does not implement {typeof(IPageFactory<TPage>)}.");
    }

    // The services handed to the new factory are borrowed from the factory registered for Page.
    var serviceSource = (PageFactory)pageFactoryCache[typeof(Page)];

    // TODO - careful here - resourceStore is not thread safe
    object[] constructorArguments =
    {
        serviceSource.pdfScanner,
        serviceSource.resourceStore,
        serviceSource.filterProvider,
        serviceSource.pageContentParser,
        serviceSource.log
    };

    var createdFactory = (IPageFactory<TPage>)Activator.CreateInstance(type, constructorArguments);

    AddPageFactory(createdFactory);
}
|
||||
|
||||
internal PageTreeNode GetPageNode(int pageNumber)
|
||||
|
@ -10,7 +10,7 @@
|
||||
using Tokens;
|
||||
using Util;
|
||||
|
||||
internal class PagesFactory
|
||||
internal static class PagesFactory
|
||||
{
|
||||
private class PageCounter
|
||||
{
|
||||
@ -21,7 +21,7 @@
|
||||
}
|
||||
}
|
||||
|
||||
public static Pages Create(IndirectReference pagesReference, DictionaryToken pagesDictionary, IPdfTokenScanner scanner, IPageFactory pageFactory, ILog log, bool isLenientParsing)
|
||||
public static Pages Create(IndirectReference pagesReference, DictionaryToken pagesDictionary, IPdfTokenScanner scanner, IPageFactory<Page> pageFactory, ILog log, bool isLenientParsing)
|
||||
{
|
||||
var pageNumber = new PageCounter();
|
||||
|
||||
|
@ -41,7 +41,7 @@
|
||||
this.filterProvider = filterProvider;
|
||||
}
|
||||
|
||||
public void LoadResourceDictionary(DictionaryToken resourceDictionary, InternalParsingOptions parsingOptions)
|
||||
public void LoadResourceDictionary(DictionaryToken resourceDictionary, IParsingOptions parsingOptions)
|
||||
{
|
||||
lastLoadedFont = (null, null);
|
||||
loadedNamedColorSpaceDetails.Clear();
|
||||
@ -176,7 +176,7 @@
|
||||
namedColorSpaces.Pop();
|
||||
}
|
||||
|
||||
private void LoadFontDictionary(DictionaryToken fontDictionary, InternalParsingOptions parsingOptions)
|
||||
private void LoadFontDictionary(DictionaryToken fontDictionary, IParsingOptions parsingOptions)
|
||||
{
|
||||
lastLoadedFont = (null, null);
|
||||
|
||||
|
@ -25,8 +25,14 @@
|
||||
IReadOnlyList<IFilter> GetAllFilters();
|
||||
}
|
||||
|
||||
internal interface ILookupFilterProvider : IFilterProvider
|
||||
/// <summary>
|
||||
/// Gets filter implementations (<see cref="IFilter"/>) for decoding PDF data with lookup.
|
||||
/// </summary>
|
||||
public interface ILookupFilterProvider : IFilterProvider
|
||||
{
|
||||
/// <summary>
|
||||
/// Get the filters specified in this dictionary.
|
||||
/// </summary>
|
||||
IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary, IPdfTokenScanner scanner);
|
||||
}
|
||||
}
|
@ -7,8 +7,11 @@
|
||||
/// By default user space units correspond to 1/72nd of an inch (a typographic point).
|
||||
/// The UserUnit entry in a page dictionary can define the space units as a different multiple of 1/72 (1 point).
|
||||
/// </summary>
|
||||
internal readonly struct UserSpaceUnit
|
||||
public readonly struct UserSpaceUnit
|
||||
{
|
||||
/// <summary>
|
||||
/// Default <see cref="UserSpaceUnit"/> with <see cref="PointMultiples"/> set to 1.
|
||||
/// </summary>
|
||||
public static readonly UserSpaceUnit Default = new UserSpaceUnit(1);
|
||||
|
||||
/// <summary>
|
||||
@ -29,6 +32,7 @@
|
||||
PointMultiples = pointMultiples;
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public override string ToString()
|
||||
{
|
||||
return PointMultiples.ToString(CultureInfo.InvariantCulture);
|
||||
|
@ -5,18 +5,24 @@
|
||||
/// <summary>
|
||||
/// A color space definition from a resource dictionary.
|
||||
/// </summary>
|
||||
internal struct ResourceColorSpace
|
||||
public readonly struct ResourceColorSpace
|
||||
{
|
||||
/// <summary>
|
||||
/// The color space name.
|
||||
/// </summary>
|
||||
public NameToken Name { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The color space data.
|
||||
/// </summary>
|
||||
public IToken Data { get; }
|
||||
|
||||
public ResourceColorSpace(NameToken name, IToken data)
|
||||
internal ResourceColorSpace(NameToken name, IToken data)
|
||||
{
|
||||
Name = name;
|
||||
Data = data;
|
||||
}
|
||||
|
||||
public ResourceColorSpace(NameToken name) : this(name, null) { }
|
||||
internal ResourceColorSpace(NameToken name) : this(name, null) { }
|
||||
}
|
||||
}
|
||||
|
@ -49,7 +49,7 @@
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
private readonly IPageContentParser pageContentParser;
|
||||
private readonly ILookupFilterProvider filterProvider;
|
||||
private readonly InternalParsingOptions parsingOptions;
|
||||
private readonly IParsingOptions parsingOptions;
|
||||
private readonly MarkedContentStack markedContentStack = new MarkedContentStack();
|
||||
|
||||
private Stack<CurrentGraphicsState> graphicsStack = new Stack<CurrentGraphicsState>();
|
||||
@ -92,7 +92,7 @@
|
||||
IPdfTokenScanner pdfScanner,
|
||||
IPageContentParser pageContentParser,
|
||||
ILookupFilterProvider filterProvider,
|
||||
InternalParsingOptions parsingOptions)
|
||||
IParsingOptions parsingOptions)
|
||||
{
|
||||
this.pageNumber = pageNumber;
|
||||
this.resourceStore = resourceStore;
|
||||
|
40
src/UglyToad.PdfPig/IParsingOptions.cs
Normal file
40
src/UglyToad.PdfPig/IParsingOptions.cs
Normal file
@ -0,0 +1,40 @@
|
||||
namespace UglyToad.PdfPig
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using UglyToad.PdfPig.Logging;
|
||||
|
||||
/// <summary>
/// Read-only view of the options that control how a document is parsed.
/// </summary>
public interface IParsingOptions
{
    /// <summary>
    /// Whether the parser should apply clipping to paths.
    /// Defaults to <see langword="false"/>.
    /// <para>When set to <see langword="true"/>, bezier curves will be transformed into polylines.</para>
    /// </summary>
    bool ClipPaths { get; }

    /// <summary>
    /// Whether the parser should ignore issues where the document does not conform to the PDF specification.
    /// </summary>
    bool UseLenientParsing { get; }

    /// <summary>
    /// All passwords to try when opening this document, including any value set for <see cref="ParsingOptions.Password"/>.
    /// </summary>
    List<string> Passwords { get; }

    /// <summary>
    /// Whether to skip extracting content where the font could not be found. Some letters will be
    /// skipped/missed, but the library will not throw where the source PDF has some corrupted text.
    /// Also skips XObjects like forms and images when missing.
    /// </summary>
    bool SkipMissingFonts { get; }

    /// <summary>
    /// The <see cref="ILog"/> used to record messages raised by the parsing process.
    /// </summary>
    ILog Logger { get; }
}
|
||||
}
|
@ -4,11 +4,11 @@
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
|
||||
/// <see cref="ParsingOptions"/> but without being a public API/
|
||||
/// <see cref="ParsingOptions"/> but without being a public API.
|
||||
/// </summary>
|
||||
internal class InternalParsingOptions
|
||||
internal class InternalParsingOptions : IParsingOptions
|
||||
{
|
||||
public IReadOnlyList<string> Passwords { get; }
|
||||
public List<string> Passwords { get; }
|
||||
|
||||
public bool UseLenientParsing { get; }
|
||||
|
||||
@ -21,7 +21,7 @@
|
||||
public ILog Logger { get; }
|
||||
|
||||
public InternalParsingOptions(
|
||||
IReadOnlyList<string> passwords,
|
||||
List<string> passwords,
|
||||
bool useLenientParsing,
|
||||
bool clipPaths,
|
||||
bool skipMissingFonts,
|
||||
|
@ -9,7 +9,7 @@
|
||||
/// <summary>
|
||||
/// Named destinations in a PDF document
|
||||
/// </summary>
|
||||
internal class NamedDestinations
|
||||
public class NamedDestinations
|
||||
{
|
||||
/// <summary>
|
||||
/// Dictionary containing explicit destinations, keyed by name
|
||||
|
@ -1,13 +1,18 @@
|
||||
namespace UglyToad.PdfPig.Parser
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using Core;
|
||||
using Graphics.Operations;
|
||||
using Logging;
|
||||
using System.Collections.Generic;
|
||||
|
||||
internal interface IPageContentParser
|
||||
/// <summary>
|
||||
/// Page content parser interface.
|
||||
/// </summary>
|
||||
public interface IPageContentParser
|
||||
{
|
||||
IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes,
|
||||
ILog log);
|
||||
/// <summary>
|
||||
/// Parse the <see cref="IInputBytes"/> into <see cref="IGraphicsStateOperation"/>s.
|
||||
/// </summary>
|
||||
IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes, ILog log);
|
||||
}
|
||||
}
|
@ -1,167 +1,39 @@
|
||||
namespace UglyToad.PdfPig.Parser
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Annotations;
|
||||
using Content;
|
||||
using Core;
|
||||
using Filters;
|
||||
using Geometry;
|
||||
using Graphics;
|
||||
using Graphics.Operations;
|
||||
using Logging;
|
||||
using Outline;
|
||||
using Parts;
|
||||
using System.Collections.Generic;
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
using Util;
|
||||
using UglyToad.PdfPig.Core;
|
||||
|
||||
internal class PageFactory : IPageFactory
|
||||
internal class PageFactory : PageFactoryBase<Page>
|
||||
{
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
private readonly IResourceStore resourceStore;
|
||||
private readonly ILookupFilterProvider filterProvider;
|
||||
private readonly IPageContentParser pageContentParser;
|
||||
private readonly ILog log;
|
||||
|
||||
public PageFactory(
|
||||
IPdfTokenScanner pdfScanner,
|
||||
IResourceStore resourceStore,
|
||||
ILookupFilterProvider filterProvider,
|
||||
IPageContentParser pageContentParser,
|
||||
ILog log)
|
||||
{
|
||||
this.resourceStore = resourceStore;
|
||||
this.filterProvider = filterProvider;
|
||||
this.pageContentParser = pageContentParser;
|
||||
this.pdfScanner = pdfScanner;
|
||||
this.log = log;
|
||||
}
|
||||
: base(pdfScanner, resourceStore, filterProvider, pageContentParser, log)
|
||||
{ }
|
||||
|
||||
public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers,
|
||||
NamedDestinations namedDestinations, InternalParsingOptions parsingOptions)
|
||||
{
|
||||
if (dictionary == null)
|
||||
{
|
||||
throw new ArgumentNullException(nameof(dictionary));
|
||||
}
|
||||
|
||||
var type = dictionary.GetNameOrDefault(NameToken.Type);
|
||||
|
||||
if (type != null && !type.Equals(NameToken.Page))
|
||||
{
|
||||
parsingOptions.Logger.Error($"Page {number} had its type specified as {type} rather than 'Page'.");
|
||||
}
|
||||
|
||||
MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers);
|
||||
CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox);
|
||||
|
||||
var rotation = new PageRotationDegrees(pageTreeMembers.Rotation);
|
||||
if (dictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken))
|
||||
{
|
||||
rotation = new PageRotationDegrees(rotateToken.Int);
|
||||
}
|
||||
|
||||
var stackDepth = 0;
|
||||
|
||||
while (pageTreeMembers.ParentResources.Count > 0)
|
||||
{
|
||||
var resource = pageTreeMembers.ParentResources.Dequeue();
|
||||
|
||||
resourceStore.LoadResourceDictionary(resource, parsingOptions);
|
||||
stackDepth++;
|
||||
}
|
||||
|
||||
if (dictionary.TryGet(NameToken.Resources, pdfScanner, out DictionaryToken resources))
|
||||
{
|
||||
resourceStore.LoadResourceDictionary(resources, parsingOptions);
|
||||
stackDepth++;
|
||||
}
|
||||
|
||||
UserSpaceUnit userSpaceUnit = GetUserSpaceUnits(dictionary);
|
||||
|
||||
PageContent content;
|
||||
|
||||
if (!dictionary.TryGet(NameToken.Contents, out var contents))
|
||||
{
|
||||
content = new PageContent(EmptyArray<IGraphicsStateOperation>.Instance,
|
||||
EmptyArray<Letter>.Instance,
|
||||
EmptyArray<PdfPath>.Instance,
|
||||
EmptyArray<Union<XObjectContentRecord, InlineImage>>.Instance,
|
||||
EmptyArray<MarkedContentElement>.Instance,
|
||||
pdfScanner,
|
||||
filterProvider,
|
||||
resourceStore);
|
||||
// ignored for now, is it possible? check the spec...
|
||||
}
|
||||
else if (DirectObjectFinder.TryGet<ArrayToken>(contents, pdfScanner, out var array))
|
||||
{
|
||||
var bytes = new List<byte>();
|
||||
|
||||
for (var i = 0; i < array.Data.Count; i++)
|
||||
{
|
||||
var item = array.Data[i];
|
||||
|
||||
if (!(item is IndirectReferenceToken obj))
|
||||
{
|
||||
throw new PdfDocumentFormatException($"The contents contained something which was not an indirect reference: {item}.");
|
||||
}
|
||||
|
||||
var contentStream = DirectObjectFinder.Get<StreamToken>(obj, pdfScanner);
|
||||
|
||||
if (contentStream == null)
|
||||
{
|
||||
throw new InvalidOperationException($"Could not find the contents for object {obj}.");
|
||||
}
|
||||
|
||||
bytes.AddRange(contentStream.Decode(filterProvider, pdfScanner));
|
||||
|
||||
if (i < array.Data.Count - 1)
|
||||
{
|
||||
bytes.Add((byte)'\n');
|
||||
}
|
||||
}
|
||||
|
||||
content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
|
||||
}
|
||||
else
|
||||
{
|
||||
var contentStream = DirectObjectFinder.Get<StreamToken>(contents, pdfScanner);
|
||||
|
||||
if (contentStream == null)
|
||||
{
|
||||
throw new InvalidOperationException("Failed to parse the content for the page: " + number);
|
||||
}
|
||||
|
||||
var bytes = contentStream.Decode(filterProvider, pdfScanner);
|
||||
|
||||
content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
|
||||
}
|
||||
|
||||
var initialMatrix = ContentStreamProcessor.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, log);
|
||||
var annotationProvider = new AnnotationProvider(pdfScanner, dictionary, initialMatrix, namedDestinations, log);
|
||||
var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, annotationProvider, pdfScanner);
|
||||
|
||||
for (var i = 0; i < stackDepth; i++)
|
||||
{
|
||||
resourceStore.UnloadResourceDictionary();
|
||||
}
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
private PageContent GetContent(
|
||||
int pageNumber,
|
||||
protected override Page ProcessPage(int pageNumber,
|
||||
DictionaryToken dictionary,
|
||||
NamedDestinations namedDestinations,
|
||||
IReadOnlyList<byte> contentBytes,
|
||||
CropBox cropBox,
|
||||
UserSpaceUnit userSpaceUnit,
|
||||
PageRotationDegrees rotation,
|
||||
MediaBox mediaBox,
|
||||
InternalParsingOptions parsingOptions)
|
||||
IParsingOptions parsingOptions)
|
||||
{
|
||||
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes),
|
||||
parsingOptions.Logger);
|
||||
|
||||
var context = new ContentStreamProcessor(
|
||||
pageNumber,
|
||||
resourceStore,
|
||||
@ -174,82 +46,38 @@
|
||||
filterProvider,
|
||||
parsingOptions);
|
||||
|
||||
return context.Process(pageNumber, operations);
|
||||
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes), parsingOptions.Logger);
|
||||
var content = context.Process(pageNumber, operations);
|
||||
|
||||
var initialMatrix = ContentStreamProcessor.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, log);
|
||||
var annotationProvider = new AnnotationProvider(pdfScanner, dictionary, initialMatrix, namedDestinations, log);
|
||||
return new Page(pageNumber, dictionary, mediaBox, cropBox, rotation, content, annotationProvider, pdfScanner);
|
||||
}
|
||||
|
||||
private static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary)
|
||||
{
|
||||
var spaceUnits = UserSpaceUnit.Default;
|
||||
if (dictionary.TryGet(NameToken.UserUnit, out var userUnitBase) && userUnitBase is NumericToken userUnitNumber)
|
||||
{
|
||||
spaceUnits = new UserSpaceUnit(userUnitNumber.Int);
|
||||
}
|
||||
|
||||
return spaceUnits;
|
||||
}
|
||||
|
||||
private CropBox GetCropBox(
|
||||
protected override Page ProcessPage(
|
||||
int pageNumber,
|
||||
DictionaryToken dictionary,
|
||||
PageTreeMembers pageTreeMembers,
|
||||
MediaBox mediaBox)
|
||||
NamedDestinations namedDestinations,
|
||||
CropBox cropBox,
|
||||
UserSpaceUnit userSpaceUnit,
|
||||
PageRotationDegrees rotation,
|
||||
MediaBox mediaBox,
|
||||
IParsingOptions parsingOptions)
|
||||
{
|
||||
CropBox cropBox;
|
||||
if (dictionary.TryGet(NameToken.CropBox, out var cropBoxObject) &&
|
||||
DirectObjectFinder.TryGet(cropBoxObject, pdfScanner, out ArrayToken cropBoxArray))
|
||||
{
|
||||
if (cropBoxArray.Length != 4)
|
||||
{
|
||||
log.Error($"The CropBox was the wrong length in the dictionary: {dictionary}. Array was: {cropBoxArray}. Using MediaBox.");
|
||||
|
||||
cropBox = new CropBox(mediaBox.Bounds);
|
||||
var initialMatrix = ContentStreamProcessor.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, log);
|
||||
var annotationProvider = new AnnotationProvider(pdfScanner, dictionary, initialMatrix, namedDestinations, log);
|
||||
|
||||
return cropBox;
|
||||
}
|
||||
var content = new PageContent(EmptyArray<IGraphicsStateOperation>.Instance,
|
||||
EmptyArray<Letter>.Instance,
|
||||
EmptyArray<PdfPath>.Instance,
|
||||
EmptyArray<Union<XObjectContentRecord, InlineImage>>.Instance,
|
||||
EmptyArray<MarkedContentElement>.Instance,
|
||||
pdfScanner,
|
||||
filterProvider,
|
||||
resourceStore);
|
||||
// ignored for now, is it possible? check the spec...
|
||||
|
||||
cropBox = new CropBox(cropBoxArray.ToRectangle(pdfScanner));
|
||||
}
|
||||
else
|
||||
{
|
||||
cropBox = pageTreeMembers.GetCropBox() ?? new CropBox(mediaBox.Bounds);
|
||||
}
|
||||
|
||||
return cropBox;
|
||||
}
|
||||
|
||||
private MediaBox GetMediaBox(
|
||||
int number,
|
||||
DictionaryToken dictionary,
|
||||
PageTreeMembers pageTreeMembers)
|
||||
{
|
||||
MediaBox mediaBox;
|
||||
if (dictionary.TryGet(NameToken.MediaBox, out var mediaBoxObject)
|
||||
&& DirectObjectFinder.TryGet(mediaBoxObject, pdfScanner, out ArrayToken mediaBoxArray))
|
||||
{
|
||||
if (mediaBoxArray.Length != 4)
|
||||
{
|
||||
log.Error($"The MediaBox was the wrong length in the dictionary: {dictionary}. Array was: {mediaBoxArray}. Defaulting to US Letter.");
|
||||
|
||||
mediaBox = MediaBox.Letter;
|
||||
|
||||
return mediaBox;
|
||||
}
|
||||
|
||||
mediaBox = new MediaBox(mediaBoxArray.ToRectangle(pdfScanner));
|
||||
}
|
||||
else
|
||||
{
|
||||
mediaBox = pageTreeMembers.MediaBox;
|
||||
|
||||
if (mediaBox == null)
|
||||
{
|
||||
log.Error($"The MediaBox was the wrong missing for page {number}. Using US Letter.");
|
||||
|
||||
// PDFBox defaults to US Letter.
|
||||
mediaBox = MediaBox.Letter;
|
||||
}
|
||||
}
|
||||
|
||||
return mediaBox;
|
||||
return new Page(pageNumber, dictionary, mediaBox, cropBox, rotation, content, annotationProvider, pdfScanner);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -6,7 +6,7 @@
|
||||
/// <summary>
|
||||
/// Configures options used by the parser when reading PDF documents.
|
||||
/// </summary>
|
||||
public class ParsingOptions
|
||||
public class ParsingOptions : IParsingOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// A default <see cref="ParsingOptions"/> with <see cref="UseLenientParsing"/> set to false.
|
||||
|
@ -161,6 +161,57 @@
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the page with the specified page number (1 indexed) as an instance of <typeparamref name="TPage"/>.
|
||||
/// </summary>
|
||||
/// <typeparam name="TPage">The type of page object to return.</typeparam>
|
||||
/// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
|
||||
/// <returns>The page.</returns>
/// <exception cref="ObjectDisposedException">Thrown if the document has already been disposed.</exception>
/// <exception cref="PdfDocumentEncryptedException">Thrown if retrieving the page fails and the document is encrypted; the original exception is passed to its constructor.</exception>
|
||||
public TPage GetPage<TPage>(int pageNumber)
|
||||
{
|
||||
// TODO - update log with log type
|
||||
// Refuse to hand out pages once the document has been disposed.
if (isDisposed)
|
||||
{
|
||||
// NOTE(review): the single-string ObjectDisposedException constructor treats its argument as the
// disposed object's name, not as the message; the (objectName, message) overload would surface this text as the message.
throw new ObjectDisposedException("Cannot access page after the document is disposed.");
|
||||
}
|
||||
|
||||
parsingOptions.Logger.Debug($"Accessing page {pageNumber}.");
|
||||
|
||||
try
|
||||
{
|
||||
return pages.GetPage<TPage>(pageNumber, namedDestinations, parsingOptions);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// For an encrypted document any failure is reported as PdfDocumentEncryptedException, carrying the original exception.
if (IsEncrypted)
|
||||
{
|
||||
throw new PdfDocumentEncryptedException("Document was encrypted which may have caused error when retrieving page.", encryptionDictionary, ex);
|
||||
}
|
||||
|
||||
// Not encrypted: rethrow the original exception unchanged.
throw;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Adds a page factory for pages of type <typeparamref name="TPage"/> by passing it to <c>pages.AddPageFactory</c>.
|
||||
/// </summary>
|
||||
/// <typeparam name="TPage">The type of page the factory is declared for.</typeparam>
|
||||
/// <param name="pageFactory">The page factory instance to add.</param>
|
||||
public void AddPageFactory<TPage>(IPageFactory<TPage> pageFactory)
|
||||
{
|
||||
pages.AddPageFactory(pageFactory);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Adds a page factory for pages of type <typeparamref name="TPage"/>, identified by its <see cref="Type"/>, by passing it to <c>pages.AddPageFactory</c>.
|
||||
/// </summary>
|
||||
/// <typeparam name="TPage">The type of page the factory is registered for.</typeparam>
|
||||
/// <param name="type">The type of the page factory to add. NOTE(review): presumably it must implement <c>IPageFactory&lt;TPage&gt;</c> and is instantiated by <c>pages.AddPageFactory</c> - confirm against that method.</param>
|
||||
public void AddPageFactory<TPage>(Type type)
|
||||
{
|
||||
pages.AddPageFactory<TPage>(type);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets all pages in this document in order.
|
||||
/// </summary>
|
||||
|
@ -7,7 +7,7 @@
|
||||
/// <summary>
|
||||
/// Tokenizes objects from bytes in a PDF file.
|
||||
/// </summary>
|
||||
internal interface IPdfTokenScanner : ISeekableTokenScanner, IDisposable
|
||||
public interface IPdfTokenScanner : ISeekableTokenScanner, IDisposable
|
||||
{
|
||||
/// <summary>
|
||||
/// Tokenize the object with a given object number.
|
||||
|
Loading…
Reference in New Issue
Block a user