Add GetPage<TPage>, AddPageFactory<TPage> and AddPageFactory<TPage, TPageFactory> methods

This commit is contained in:
BobLd 2023-11-02 08:12:46 +00:00
parent 3fbf8aaa6c
commit 3a96af3dcd
4 changed files with 692 additions and 6 deletions

View File

@ -0,0 +1,280 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using Content;
using Outline.Destinations;
using PdfPig.Core;
using PdfPig.Filters;
using PdfPig.Geometry;
using PdfPig.Graphics.Operations;
using PdfPig.Parser;
using PdfPig.Tokenization.Scanner;
using PdfPig.Tokens;
using System;
using System.Collections.Generic;
using Xunit;
public class PageFactoryTests
{
/// <summary>
/// Registers <see cref="SimplePageFactory"/> by type and checks that every page produced by it
/// agrees with the default page on number, rotation and media box.
/// </summary>
[Fact]
public void SimpleFactory1()
{
    var file = IntegrationHelpers.GetDocumentPath("ICML03-081");

    using (var document = PdfDocument.Open(file))
    {
        document.AddPageFactory<SimplePage, SimplePageFactory>();

        // Page numbers are 1-indexed and inclusive: the bound must be '<=' so the last page is
        // verified too (with '<' a single-page document would not be checked at all).
        for (int p = 1; p <= document.NumberOfPages; p++)
        {
            var page = document.GetPage(p);
            var pageInfo = document.GetPage<SimplePage>(p);

            Assert.Equal(page.Number, pageInfo.Number);
            Assert.Equal(page.Rotation.Value, pageInfo.Rotation);
            Assert.Equal(page.MediaBox.Bounds, pageInfo.MediaBox.Bounds);
        }
    }
}
/// <summary>
/// Registers a <see cref="SimplePageFactory"/> instance and requests the first page twice,
/// comparing it with the default page each time.
/// </summary>
[Fact]
public void SimpleFactory2()
{
    var path = IntegrationHelpers.GetDocumentPath("cat-genetics");

    using (var pdf = PdfDocument.Open(path))
    {
        pdf.AddPageFactory(new SimplePageFactory());

        var expected = pdf.GetPage(1);

        // Two passes: the second one repeats the exact same request against the same document.
        for (var attempt = 0; attempt < 2; attempt++)
        {
            var actual = pdf.GetPage<SimplePage>(1);

            Assert.Equal(expected.Number, actual.Number);
            Assert.Equal(expected.Rotation.Value, actual.Rotation);
            Assert.Equal(expected.MediaBox.Bounds, actual.MediaBox.Bounds);
        }
    }
}
/// <summary>
/// Registers <see cref="PageInformationFactory"/> by type and checks, twice per page, that the
/// produced <see cref="PageInformation"/> agrees with the default page on number, rotation and size.
/// </summary>
[Fact]
public void InformationFactory()
{
    var file = IntegrationHelpers.GetDocumentPath("Gamebook");

    using (var document = PdfDocument.Open(file))
    {
        document.AddPageFactory<PageInformation, PageInformationFactory>();

        // Page numbers are 1-indexed and inclusive: the bound must be '<=' so the last page is
        // verified too (with '<' a single-page document would not be checked at all).
        for (int p = 1; p <= document.NumberOfPages; p++)
        {
            var page = document.GetPage(p);

            var pageInfo = document.GetPage<PageInformation>(p);
            Assert.Equal(page.Number, pageInfo.Number);
            Assert.Equal(page.Rotation, pageInfo.Rotation);
            Assert.Equal(page.Width, pageInfo.Width);
            Assert.Equal(page.Height, pageInfo.Height);

            // Run again
            pageInfo = document.GetPage<PageInformation>(p);
            Assert.Equal(page.Number, pageInfo.Number);
            Assert.Equal(page.Rotation, pageInfo.Rotation);
            Assert.Equal(page.Width, pageInfo.Width);
            Assert.Equal(page.Height, pageInfo.Height);
        }
    }
}
/// <summary>
/// Registers two different page factories on the same document and checks that each page type
/// can be requested independently and agrees with the default page.
/// </summary>
[Fact]
public void SimpleAndInformationFactory()
{
    var file = IntegrationHelpers.GetDocumentPath("DeviceN_CS_test");

    using (var document = PdfDocument.Open(file))
    {
        document.AddPageFactory<PageInformation, PageInformationFactory>();
        document.AddPageFactory<SimplePage, SimplePageFactory>();

        // Page numbers are 1-indexed and inclusive: the bound must be '<=' so the last page is
        // verified too (with '<' a single-page document would not be checked at all).
        for (int p = 1; p <= document.NumberOfPages; p++)
        {
            var page = document.GetPage(p);

            var pageInfo = document.GetPage<PageInformation>(p);
            Assert.Equal(page.Number, pageInfo.Number);
            Assert.Equal(page.Rotation, pageInfo.Rotation);
            Assert.Equal(page.Width, pageInfo.Width);
            Assert.Equal(page.Height, pageInfo.Height);

            var simplePage = document.GetPage<SimplePage>(p);
            Assert.Equal(page.Number, simplePage.Number);
            Assert.Equal(page.Rotation.Value, simplePage.Rotation);
            Assert.Equal(page.MediaBox.Bounds, simplePage.MediaBox.Bounds);
        }
    }
}
/// <summary>
/// Requesting a custom page type without registering a factory for it must throw
/// an <see cref="InvalidOperationException"/>.
/// </summary>
[Fact]
public void NoPageFactory()
{
    var path = IntegrationHelpers.GetDocumentPath("cat-genetics");

    using (var pdf = PdfDocument.Open(path))
    {
        // Nothing was registered for SimplePage on this document.
        InvalidOperationException error =
            Assert.Throws<InvalidOperationException>(() => pdf.GetPage<SimplePage>(1));

        Assert.StartsWith("Could not find page factory of type", error.Message);
    }
}
/// <summary>
/// Registering by type a factory that lacks the expected constructor must throw
/// an <see cref="InvalidOperationException"/>.
/// </summary>
[Fact]
public void WrongSignatureFactory()
{
    var path = IntegrationHelpers.GetDocumentPath("Gamebook");

    using (var pdf = PdfDocument.Open(path))
    {
        // WrongConstructorFactory only exposes a four-parameter constructor.
        InvalidOperationException error = Assert.Throws<InvalidOperationException>(
            () => pdf.AddPageFactory<PageInformation, WrongConstructorFactory>());

        Assert.StartsWith("Could not find valid constructor for page factory of type ", error.Message);
    }
}
#region Wrong
/// <summary>
/// Page factory whose only constructor takes four arguments (no <see cref="IPdfTokenScanner"/>),
/// used to exercise the failure path of registering a factory by type.
/// </summary>
public class WrongConstructorFactory : BasePageFactory<PageInformation>
{
    /// <summary>
    /// Passes <c>null</c> as the first base-constructor argument and forwards the remaining ones unchanged.
    /// </summary>
    public WrongConstructorFactory(IResourceStore resourceStore,
        ILookupFilterProvider filterProvider,
        IPageContentParser pageContentParser,
        ParsingOptions parsingOptions)
        : base(null, resourceStore, filterProvider, pageContentParser, parsingOptions)
    {
    }

    /// <summary>
    /// Always throws.
    /// </summary>
    protected override PageInformation ProcessPage(
        int pageNumber,
        DictionaryToken dictionary,
        NamedDestinations namedDestinations,
        MediaBox mediaBox,
        CropBox cropBox,
        UserSpaceUnit userSpaceUnit,
        PageRotationDegrees rotation,
        TransformationMatrix initialMatrix,
        IReadOnlyList<IGraphicsStateOperation> operations)
        => throw new Exception();
}
#endregion
#region SimplePage
/// <summary>
/// Minimal custom page type holding a page number, a rotation and a media box.
/// </summary>
public sealed class SimplePage
{
    /// <summary>
    /// Stores the supplied values as-is.
    /// </summary>
    public SimplePage(int number, int rotation, MediaBox mediaBox)
    {
        MediaBox = mediaBox;
        Rotation = rotation;
        Number = number;
    }

    /// <summary>The page number given at construction.</summary>
    public int Number { get; }

    /// <summary>The rotation given at construction.</summary>
    public int Rotation { get; }

    /// <summary>The media box given at construction.</summary>
    public MediaBox MediaBox { get; }
}
/// <summary>
/// Page factory that builds a <see cref="SimplePage"/> from the page number and the page tree members only.
/// </summary>
public sealed class SimplePageFactory : IPageFactory<SimplePage>
{
    /// <summary>
    /// Parameterless constructor, for registering an already-built instance.
    /// </summary>
    public SimplePageFactory()
    {
        // Nothing to initialise.
    }

    /// <summary>
    /// Five-parameter constructor, for registering the factory by type. The arguments are not used.
    /// </summary>
    public SimplePageFactory(IPdfTokenScanner pdfScanner,
        IResourceStore resourceStore,
        ILookupFilterProvider filterProvider,
        IPageContentParser pageContentParser,
        ParsingOptions parsingOptions)
    {
        // Nothing to initialise.
    }

    /// <summary>
    /// Creates the page from <paramref name="number"/> and the rotation and media box of
    /// <paramref name="pageTreeMembers"/>; the other arguments are ignored.
    /// </summary>
    public SimplePage Create(
        int number,
        DictionaryToken dictionary,
        PageTreeMembers pageTreeMembers,
        NamedDestinations namedDestinations)
    {
        var rotation = pageTreeMembers.Rotation;
        var mediaBox = pageTreeMembers.MediaBox;

        return new SimplePage(number, rotation, mediaBox);
    }
}
#endregion
#region PageInformation
/// <summary>
/// Immutable custom page type holding a page number, rotation, width, height and user space unit.
/// </summary>
public readonly struct PageInformation
{
    /// <summary>
    /// Stores the supplied values as-is.
    /// </summary>
    public PageInformation(
        int number,
        PageRotationDegrees rotation,
        double width,
        double height,
        UserSpaceUnit userSpaceUnit)
    {
        UserSpaceUnit = userSpaceUnit;
        Height = height;
        Width = width;
        Rotation = rotation;
        Number = number;
    }

    /// <summary>The page number given at construction.</summary>
    public int Number { get; }

    /// <summary>The rotation given at construction.</summary>
    public PageRotationDegrees Rotation { get; }

    /// <summary>The width given at construction.</summary>
    public double Width { get; }

    /// <summary>The height given at construction.</summary>
    public double Height { get; }

    /// <summary>The user space unit given at construction.</summary>
    public UserSpaceUnit UserSpaceUnit { get; }
}
/// <summary>
/// Page factory producing a <see cref="PageInformation"/> from the page boxes, rotation and user space unit.
/// </summary>
public sealed class PageInformationFactory : BasePageFactory<PageInformation>
{
    /// <summary>
    /// Forwards all five arguments to the base factory.
    /// </summary>
    public PageInformationFactory(IPdfTokenScanner pdfScanner,
        IResourceStore resourceStore,
        ILookupFilterProvider filterProvider,
        IPageContentParser pageContentParser,
        ParsingOptions parsingOptions)
        : base(pdfScanner, resourceStore, filterProvider, pageContentParser, parsingOptions)
    {
    }

    /// <summary>
    /// Builds the page information using the intersection of the media box and crop box as the
    /// view box, or the crop box alone when there is no intersection.
    /// </summary>
    protected override PageInformation ProcessPage(
        int pageNumber,
        DictionaryToken dictionary,
        NamedDestinations namedDestinations,
        MediaBox mediaBox,
        CropBox cropBox,
        UserSpaceUnit userSpaceUnit,
        PageRotationDegrees rotation,
        TransformationMatrix initialMatrix,
        IReadOnlyList<IGraphicsStateOperation> operations)
    {
        // Same logic as in Page class:
        // when the crop box lies outside the media box, the crop box is used instead of the intersection.
        var overlap = mediaBox.Bounds.Intersect(cropBox.Bounds);
        var viewBox = overlap ?? cropBox.Bounds;

        return new PageInformation(pageNumber, rotation, viewBox.Width, viewBox.Height, userSpaceUnit);
    }
}
#endregion
}
}

View File

@ -0,0 +1,257 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using Content;
using Outline.Destinations;
using PdfFonts;
using PdfPig.Core;
using PdfPig.Filters;
using PdfPig.Geometry;
using PdfPig.Graphics;
using PdfPig.Graphics.Colors;
using PdfPig.Graphics.Operations;
using PdfPig.Parser;
using PdfPig.Tokenization.Scanner;
using PdfPig.Tokens;
using System.Collections.Generic;
using System.Linq;
using Xunit;
public class StreamProcessorTests
{
/// <summary>
/// Checks that the text collected by the text-only page factory equals the concatenated
/// letter values of the default page.
/// </summary>
[Fact]
public void TextOnly()
{
    var path = IntegrationHelpers.GetDocumentPath("cat-genetics");

    using (var pdf = PdfDocument.Open(path))
    {
        pdf.AddPageFactory<TextOnlyPage, TextOnlyPageInformationFactory>();

        var defaultPage = pdf.GetPage(1);
        var customPage = pdf.GetPage<TextOnlyPage>(1);

        var letterValues = defaultPage.Letters.Select(letter => letter.Value);
        string expected = string.Join(string.Empty, letterValues);

        Assert.Equal(expected, customPage.Text);
    }
}
#region AdvancedPage
/// <summary>
/// Immutable custom page type holding a page number and the page text.
/// </summary>
public readonly struct TextOnlyPage
{
    /// <summary>
    /// Stores the supplied values as-is.
    /// </summary>
    public TextOnlyPage(int number, string text)
    {
        Text = text;
        Number = number;
    }

    /// <summary>The page number given at construction.</summary>
    public int Number { get; }

    /// <summary>The text given at construction.</summary>
    public string Text { get; }
}
/// <summary>
/// Immutable result of text-only stream processing: the list of collected letter strings.
/// </summary>
public readonly struct TextOnlyPageContent
{
    /// <summary>
    /// Stores the supplied list reference as-is (no copy is made).
    /// </summary>
    public TextOnlyPageContent(IReadOnlyList<string> letters) => Letters = letters;

    /// <summary>The list given at construction.</summary>
    public IReadOnlyList<string> Letters { get; }
}
/// <summary>
/// Page factory producing a <see cref="TextOnlyPage"/> by running the page operations through
/// a <see cref="TextOnlyStreamProcessor"/>.
/// </summary>
public class TextOnlyPageInformationFactory : BasePageFactory<TextOnlyPage>
{
    /// <summary>
    /// Forwards all five arguments to the base factory.
    /// </summary>
    public TextOnlyPageInformationFactory(IPdfTokenScanner pdfScanner,
        IResourceStore resourceStore,
        ILookupFilterProvider filterProvider,
        IPageContentParser pageContentParser,
        ParsingOptions parsingOptions)
        : base(pdfScanner, resourceStore, filterProvider, pageContentParser, parsingOptions)
    {
    }

    /// <summary>
    /// Returns a page with empty text when there are no operations; otherwise processes the
    /// operations and concatenates the collected letters into the page text.
    /// </summary>
    protected override TextOnlyPage ProcessPage(
        int pageNumber,
        DictionaryToken dictionary,
        NamedDestinations namedDestinations,
        MediaBox mediaBox,
        CropBox cropBox,
        UserSpaceUnit userSpaceUnit,
        PageRotationDegrees rotation,
        TransformationMatrix initialMatrix,
        IReadOnlyList<IGraphicsStateOperation> operations)
    {
        var text = string.Empty;

        if (operations.Count != 0)
        {
            var processor = new TextOnlyStreamProcessor(
                pageNumber,
                ResourceStore,
                PdfScanner,
                PageContentParser,
                FilterProvider,
                cropBox,
                userSpaceUnit,
                rotation,
                initialMatrix,
                ParsingOptions);

            TextOnlyPageContent processed = processor.Process(pageNumber, operations);
            text = string.Concat(processed.Letters);
        }

        return new TextOnlyPage(pageNumber, text);
    }
}
/// <summary>
/// Stream processor that records only the unicode string of each rendered glyph.
/// Every path, image, shading, clipping and marked-content callback is intentionally empty.
/// </summary>
public sealed class TextOnlyStreamProcessor : BaseStreamProcessor<TextOnlyPageContent>
{
    // Unicode strings in the order RenderGlyph received them.
    private readonly List<string> _glyphValues = new List<string>();

    /// <summary>
    /// Forwards all arguments to the base stream processor.
    /// </summary>
    public TextOnlyStreamProcessor(
        int pageNumber,
        IResourceStore resourceStore,
        IPdfTokenScanner pdfScanner,
        IPageContentParser pageContentParser,
        ILookupFilterProvider filterProvider,
        CropBox cropBox,
        UserSpaceUnit userSpaceUnit,
        PageRotationDegrees rotation,
        TransformationMatrix initialMatrix,
        ParsingOptions parsingOptions)
        : base(
            pageNumber,
            resourceStore,
            pdfScanner,
            pageContentParser,
            filterProvider,
            cropBox,
            userSpaceUnit,
            rotation,
            initialMatrix,
            parsingOptions)
    {
    }

    /// <summary>
    /// Clones the states, runs the operations and returns the letters collected so far.
    /// </summary>
    public override TextOnlyPageContent Process(
        int pageNumberCurrent,
        IReadOnlyList<IGraphicsStateOperation> operations)
    {
        CloneAllStates();
        ProcessOperations(operations);

        return new TextOnlyPageContent(_glyphValues);
    }

    /// <summary>
    /// Appends <paramref name="unicode"/> to the collected letters; all other arguments are ignored.
    /// </summary>
    public override void RenderGlyph(
        IFont font,
        IColor strokingColor,
        IColor nonStrokingColor,
        TextRenderingMode textRenderingMode,
        double fontSize,
        double pointSize,
        int code,
        string unicode,
        long currentOffset,
        TransformationMatrix renderingMatrix,
        TransformationMatrix textMatrix,
        TransformationMatrix transformationMatrix,
        CharacterBoundingBox characterBoundingBox)
        => _glyphValues.Add(unicode);

    // ---- Images: ignored ----

    protected override void RenderXObjectImage(XObjectContentRecord xObjectContentRecord)
    {
        // Intentionally empty.
    }

    protected override void RenderInlineImage(InlineImage inlineImage)
    {
        // Intentionally empty.
    }

    // ---- Path construction: ignored ----

    public override void BeginSubpath()
    {
        // Intentionally empty.
    }

    /// <summary>
    /// Returns a default-constructed point; no path state is tracked.
    /// </summary>
    public override PdfPoint? CloseSubpath() => new PdfPoint();

    public override void MoveTo(double x, double y)
    {
        // Intentionally empty.
    }

    public override void LineTo(double x, double y)
    {
        // Intentionally empty.
    }

    public override void BezierCurveTo(double x1, double y1, double x2, double y2, double x3, double y3)
    {
        // Intentionally empty.
    }

    public override void BezierCurveTo(double x2, double y2, double x3, double y3)
    {
        // Intentionally empty.
    }

    public override void Rectangle(double x, double y, double width, double height)
    {
        // Intentionally empty.
    }

    public override void ClosePath()
    {
        // Intentionally empty.
    }

    public override void EndPath()
    {
        // Intentionally empty.
    }

    // ---- Path painting and clipping: ignored ----

    public override void StrokePath(bool close)
    {
        // Intentionally empty.
    }

    public override void FillPath(FillingRule fillingRule, bool close)
    {
        // Intentionally empty.
    }

    public override void FillStrokePath(FillingRule fillingRule, bool close)
    {
        // Intentionally empty.
    }

    public override void ModifyClippingIntersect(FillingRule clippingRule)
    {
        // Intentionally empty.
    }

    public override void PaintShading(NameToken shadingName)
    {
        // Intentionally empty.
    }

    // ---- Marked content: ignored ----

    public override void BeginMarkedContent(
        NameToken name,
        NameToken propertyDictionaryName,
        DictionaryToken properties)
    {
        // Intentionally empty.
    }

    public override void EndMarkedContent()
    {
        // Intentionally empty.
    }
}
#endregion
}
}

View File

@ -3,14 +3,17 @@
using System;
using System.Collections.Generic;
using Core;
using Filters;
using Outline.Destinations;
using Parser;
using Tokenization.Scanner;
using Tokens;
using Util;
internal class Pages
internal sealed class Pages : IDisposable
{
private readonly IPageFactory<Page> pageFactory;
private readonly Dictionary<Type, object> pageFactoryCache;
private readonly PageFactory defaultPageFactory;
private readonly IPdfTokenScanner pdfScanner;
private readonly Dictionary<int, PageTreeNode> pagesByNumber;
@ -21,15 +24,38 @@
/// </summary>
public PageTreeNode PageTree { get; }
internal Pages(IPageFactory<Page> pageFactory, IPdfTokenScanner pdfScanner, PageTreeNode pageTree, Dictionary<int, PageTreeNode> pagesByNumber)
/// <summary>
/// Creates the page collection, keeps the supplied factory as the default page factory and
/// registers it in the page factory cache.
/// </summary>
/// <exception cref="ArgumentNullException">
/// <paramref name="pageFactory"/> or <paramref name="pdfScanner"/> is null.
/// </exception>
internal Pages(IPageFactory<Page> pageFactory,
IPdfTokenScanner pdfScanner,
PageTreeNode pageTree,
Dictionary<int, PageTreeNode> pagesByNumber)
{
// NOTE(review): this assignment targets the 'pageFactory' field, which this change appears to
// replace with 'defaultPageFactory' - confirm whether this line is leftover diff context.
this.pageFactory = pageFactory ?? throw new ArgumentNullException(nameof(pageFactory));
pageFactoryCache = new Dictionary<Type, object>();
// The explicit cast requires the supplied factory to be a PageFactory instance; a null
// argument survives the cast and is rejected by the null-coalescing throw.
defaultPageFactory = (PageFactory)pageFactory ?? throw new ArgumentNullException(nameof(pageFactory));
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
this.pagesByNumber = pagesByNumber;
PageTree = pageTree;
// Register the default factory so it is looked up through the same cache as custom factories.
AddPageFactory(defaultPageFactory);
}
internal Page GetPage(int pageNumber, NamedDestinations namedDestinations, ParsingOptions parsingOptions)
/// <summary>
/// Gets the page with the given number using the default page factory.
/// </summary>
internal Page GetPage(int pageNumber, NamedDestinations namedDestinations, ParsingOptions parsingOptions)
{
    return GetPage(defaultPageFactory, pageNumber, namedDestinations, parsingOptions);
}
/// <summary>
/// Gets the page with the given number using the factory registered for <typeparamref name="TPage"/>.
/// </summary>
/// <typeparam name="TPage">The page type whose registered factory should create the page.</typeparam>
/// <exception cref="InvalidOperationException">
/// No factory implementing <see cref="IPageFactory{TPage}"/> is registered for <typeparamref name="TPage"/>.
/// </exception>
internal TPage GetPage<TPage>(int pageNumber, NamedDestinations namedDestinations, ParsingOptions parsingOptions)
{
    // Guard first: bail out unless the cache holds a factory of the right generic type.
    if (!pageFactoryCache.TryGetValue(typeof(TPage), out var cached)
        || cached is not IPageFactory<TPage> factory)
    {
        throw new InvalidOperationException($"Could not find page factory of type '{typeof(IPageFactory<TPage>)}' for page type {typeof(TPage)}.");
    }

    return GetPage(factory, pageNumber, namedDestinations, parsingOptions);
}
private TPage GetPage<TPage>(IPageFactory<TPage> pageFactory,
int pageNumber,
NamedDestinations namedDestinations,
ParsingOptions parsingOptions)
{
if (pageNumber <= 0 || pageNumber > Count)
{
@ -80,6 +106,53 @@
return page;
}
/// <summary>
/// Registers <paramref name="pageFactory"/> as the factory for pages of type <typeparamref name="TPage"/>.
/// </summary>
/// <typeparam name="TPage">The page type the factory creates; used as the cache key.</typeparam>
/// <param name="pageFactory">The factory instance to register.</param>
/// <exception cref="ArgumentNullException"><paramref name="pageFactory"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// A factory is already registered for <typeparamref name="TPage"/>.
/// </exception>
internal void AddPageFactory<TPage>(IPageFactory<TPage> pageFactory)
{
    // A null entry would occupy the slot for TPage: later lookups would report a missing factory
    // while later registrations would be rejected as duplicates. Reject it up front instead.
    if (pageFactory == null)
    {
        throw new ArgumentNullException(nameof(pageFactory));
    }

    Type type = typeof(TPage);
    if (pageFactoryCache.ContainsKey(type))
    {
        throw new InvalidOperationException($"Could not add page factory for page type '{type}' as it was already added.");
    }

    pageFactoryCache.Add(type, pageFactory);
}
/// <summary>
/// Creates a <typeparamref name="TPageFactory"/> through reflection, passing it the services of the
/// default page factory, and registers it for pages of type <typeparamref name="TPage"/>.
/// </summary>
/// <typeparam name="TPage">The page type the factory creates.</typeparam>
/// <typeparam name="TPageFactory">
/// The factory type. It must expose a constructor taking, in order, <see cref="IPdfTokenScanner"/>,
/// <see cref="IResourceStore"/>, <see cref="ILookupFilterProvider"/>, <see cref="IPageContentParser"/>
/// and <see cref="ParsingOptions"/>.
/// </typeparam>
/// <exception cref="InvalidOperationException">
/// No matching constructor was found, or the created instance is not an <see cref="IPageFactory{TPage}"/>.
/// </exception>
internal void AddPageFactory<TPage, TPageFactory>() where TPageFactory : IPageFactory<TPage>
{
    Type[] expectedSignature =
    {
        typeof(IPdfTokenScanner),
        typeof(IResourceStore),
        typeof(ILookupFilterProvider),
        typeof(IPageContentParser),
        typeof(ParsingOptions)
    };

    var constructor = typeof(TPageFactory).GetConstructor(expectedSignature);
    if (constructor == null)
    {
        throw new InvalidOperationException($"Could not find valid constructor for page factory of type '{typeof(TPageFactory)}'. " +
            "The page factory should have a constructor with the following parameters: " +
            $"{typeof(IPdfTokenScanner)}, {typeof(IResourceStore)}, {typeof(ILookupFilterProvider)}, {typeof(IPageContentParser)}, {typeof(ParsingOptions)}.");
    }

    // The new factory shares the services already held by the default page factory.
    object[] arguments =
    {
        defaultPageFactory.PdfScanner,
        defaultPageFactory.ResourceStore,
        defaultPageFactory.FilterProvider,
        defaultPageFactory.PageContentParser,
        defaultPageFactory.ParsingOptions
    };

    if (constructor.Invoke(arguments) is IPageFactory<TPage> created)
    {
        AddPageFactory(created);
        return;
    }

    throw new InvalidOperationException(
        $"Something wrong happened while creating page factory of type '{typeof(TPageFactory)}' for page type '{typeof(TPage)}'.");
}
internal PageTreeNode GetPageNode(int pageNumber)
{
if (!pagesByNumber.TryGetValue(pageNumber, out var node))
@ -102,5 +175,19 @@
return null;
}
/// <summary>
/// Empties the page factory cache and disposes every cached factory that implements <see cref="IDisposable"/>.
/// </summary>
public void Dispose()
{
    // Take a snapshot before mutating the cache: removing entries while enumerating the
    // dictionary's Keys throws InvalidOperationException on .NET Framework and .NET Core before 3.0.
    var factories = new List<object>(pageFactoryCache.Values);
    pageFactoryCache.Clear();

    foreach (var factory in factories)
    {
        if (factory is IDisposable disposable)
        {
            disposable.Dispose();
        }
    }
}
}
}

View File

@ -112,7 +112,7 @@
/// <param name="options">Optional parameters controlling parsing.</param>
/// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options);
/// <summary>
/// Opens a file and creates a <see cref="PdfDocument"/> for reading from the provided file path.
/// </summary>
@ -133,6 +133,26 @@
/// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
public static PdfDocument Open(Stream stream, ParsingOptions options = null) => PdfDocumentFactory.Open(stream, options);
/// <summary>
/// Registers the page factory used to create pages of type <typeparamref name="TPage"/>
/// when calling <see cref="GetPage{TPage}(int)"/>.
/// </summary>
/// <typeparam name="TPage">The type of page created by the factory.</typeparam>
/// <param name="pageFactory">The page factory instance to register.</param>
/// <exception cref="InvalidOperationException">
/// A page factory was already added for <typeparamref name="TPage"/>.
/// </exception>
public void AddPageFactory<TPage>(IPageFactory<TPage> pageFactory) => pages.AddPageFactory(pageFactory);
/// <summary>
/// Creates a page factory of type <typeparamref name="TPageFactory"/> and registers it to create
/// pages of type <typeparamref name="TPage"/> when calling <see cref="GetPage{TPage}(int)"/>.
/// </summary>
/// <typeparam name="TPage">The type of page created by the factory.</typeparam>
/// <typeparam name="TPageFactory">
/// The type of the page factory. It must have a constructor with the following parameters, in order:
/// <c>IPdfTokenScanner</c>, <c>IResourceStore</c>, <c>ILookupFilterProvider</c>,
/// <c>IPageContentParser</c>, <c>ParsingOptions</c>.
/// </typeparam>
/// <exception cref="InvalidOperationException">
/// No valid constructor was found on <typeparamref name="TPageFactory"/>, or a page factory was
/// already added for <typeparamref name="TPage"/>.
/// </exception>
public void AddPageFactory<TPage, TPageFactory>() where TPageFactory : IPageFactory<TPage>
    => pages.AddPageFactory<TPage, TPageFactory>();
/// <summary>
/// Get the page with the specified page number (1 indexed).
/// </summary>
@ -162,6 +182,36 @@
}
}
/// <summary>
/// Get the page with the specified page number (1 indexed), using the page factory registered
/// for <typeparamref name="TPage"/>.
/// </summary>
/// <typeparam name="TPage">The type of page to return; a page factory must have been added for it.</typeparam>
/// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
/// <returns>The page.</returns>
/// <exception cref="ObjectDisposedException">The document was already disposed.</exception>
/// <exception cref="PdfDocumentEncryptedException">
/// Retrieving the page failed and the document is encrypted; the original error is the inner exception.
/// </exception>
public TPage GetPage<TPage>(int pageNumber)
{
    if (isDisposed)
    {
        // The single-argument constructor treats its argument as the object name, not the message,
        // so use the (objectName, message) overload to get a readable error.
        throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access page after the document is disposed.");
    }

    parsingOptions.Logger.Debug($"Accessing page {pageNumber}.");

    try
    {
        return pages.GetPage<TPage>(pageNumber, namedDestinations, parsingOptions);
    }
    catch (Exception ex)
    {
        // Encryption is a likely cause of page errors, so surface it while keeping the original exception.
        if (IsEncrypted)
        {
            throw new PdfDocumentEncryptedException("Document was encrypted which may have caused error when retrieving page.", encryptionDictionary, ex);
        }

        throw;
    }
}
/// <summary>
/// Gets all pages in this document in order.
/// </summary>
@ -173,6 +223,17 @@
}
}
/// <summary>
/// Gets all pages in this document in order, using the page factory registered for <typeparamref name="TPage"/>.
/// Pages are created lazily, one per enumeration step.
/// </summary>
/// <typeparam name="TPage">The type of page to return.</typeparam>
/// <returns>The pages, from page 1 to the last page.</returns>
public IEnumerable<TPage> GetPages<TPage>()
{
    var pageNumber = 1;

    while (pageNumber <= NumberOfPages)
    {
        yield return GetPage<TPage>(pageNumber);
        pageNumber++;
    }
}
/// <summary>
/// Get the document level metadata if present.
/// The metadata is XML in the (Extensible Metadata Platform) XMP format.
@ -247,6 +308,7 @@
Advanced.Dispose();
pdfScanner.Dispose();
inputBytes.Dispose();
pages.Dispose();
}
catch (Exception ex)
{