diff --git a/src/UglyToad.PdfPig.Tests/Integration/PageFactoryTests.cs b/src/UglyToad.PdfPig.Tests/Integration/PageFactoryTests.cs new file mode 100644 index 00000000..eae5b54f --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/PageFactoryTests.cs @@ -0,0 +1,280 @@ +namespace UglyToad.PdfPig.Tests.Integration +{ + using Content; + using Outline.Destinations; + using PdfPig.Core; + using PdfPig.Filters; + using PdfPig.Geometry; + using PdfPig.Graphics.Operations; + using PdfPig.Parser; + using PdfPig.Tokenization.Scanner; + using PdfPig.Tokens; + using System; + using System.Collections.Generic; + using Xunit; + + public class PageFactoryTests + { + [Fact] + public void SimpleFactory1() + { + var file = IntegrationHelpers.GetDocumentPath("ICML03-081"); + + using (var document = PdfDocument.Open(file)) + { + document.AddPageFactory(); + + for (int p = 1; p <= document.NumberOfPages; p++) // Page numbers are 1-indexed, so the last page is NumberOfPages. + { + var page = document.GetPage(p); + var pageInfo = document.GetPage(p); + + Assert.Equal(page.Number, pageInfo.Number); + Assert.Equal(page.Rotation.Value, pageInfo.Rotation); + Assert.Equal(page.MediaBox.Bounds, pageInfo.MediaBox.Bounds); + } + } + } + + [Fact] + public void SimpleFactory2() + { + var file = IntegrationHelpers.GetDocumentPath("cat-genetics"); + + using (var document = PdfDocument.Open(file)) + { + document.AddPageFactory(new SimplePageFactory()); + + var page = document.GetPage(1); + var pageInfo = document.GetPage(1); + + Assert.Equal(page.Number, pageInfo.Number); + Assert.Equal(page.Rotation.Value, pageInfo.Rotation); + Assert.Equal(page.MediaBox.Bounds, pageInfo.MediaBox.Bounds); + + // Run again + pageInfo = document.GetPage(1); + Assert.Equal(page.Number, pageInfo.Number); + Assert.Equal(page.Rotation.Value, pageInfo.Rotation); + Assert.Equal(page.MediaBox.Bounds, pageInfo.MediaBox.Bounds); + } + } + + [Fact] + public void InformationFactory() + { + var file = IntegrationHelpers.GetDocumentPath("Gamebook"); + + using (var document = 
PdfDocument.Open(file)) + { + document.AddPageFactory(); + + for (int p = 1; p <= document.NumberOfPages; p++) // Page numbers are 1-indexed, so the last page is NumberOfPages. + { + var page = document.GetPage(p); + + var pageInfo = document.GetPage(p); + Assert.Equal(page.Number, pageInfo.Number); + Assert.Equal(page.Rotation, pageInfo.Rotation); + Assert.Equal(page.Width, pageInfo.Width); + Assert.Equal(page.Height, pageInfo.Height); + + // Run again + pageInfo = document.GetPage(p); + Assert.Equal(page.Number, pageInfo.Number); + Assert.Equal(page.Rotation, pageInfo.Rotation); + Assert.Equal(page.Width, pageInfo.Width); + Assert.Equal(page.Height, pageInfo.Height); + } + } + } + + [Fact] + public void SimpleAndInformationFactory() + { + var file = IntegrationHelpers.GetDocumentPath("DeviceN_CS_test"); + + using (var document = PdfDocument.Open(file)) + { + document.AddPageFactory(); + document.AddPageFactory(); + + for (int p = 1; p <= document.NumberOfPages; p++) // Inclusive upper bound: otherwise the last page is never checked. + { + var page = document.GetPage(p); + + var pageInfo = document.GetPage(p); + Assert.Equal(page.Number, pageInfo.Number); + Assert.Equal(page.Rotation, pageInfo.Rotation); + Assert.Equal(page.Width, pageInfo.Width); + Assert.Equal(page.Height, pageInfo.Height); + + var simplePage = document.GetPage(p); + Assert.Equal(page.Number, simplePage.Number); + Assert.Equal(page.Rotation.Value, simplePage.Rotation); + Assert.Equal(page.MediaBox.Bounds, simplePage.MediaBox.Bounds); + } + } + } + + [Fact] + public void NoPageFactory() + { + var file = IntegrationHelpers.GetDocumentPath("cat-genetics"); + + using (var document = PdfDocument.Open(file)) + { + var exception = Assert.Throws(() => document.GetPage(1)); + Assert.StartsWith("Could not find page factory of type", exception.Message); + } + } + + [Fact] + public void WrongSignatureFactory() + { + var file = IntegrationHelpers.GetDocumentPath("Gamebook"); + + using (var document = PdfDocument.Open(file)) + { + var exception = Assert.Throws(() => + document.AddPageFactory()); + Assert.StartsWith("Could not find valid 
constructor for page factory of type ", exception.Message); + } + } + + #region Wrong + + public class WrongConstructorFactory : BasePageFactory + { + public WrongConstructorFactory( + IResourceStore resourceStore, + ILookupFilterProvider filterProvider, + IPageContentParser pageContentParser, + ParsingOptions parsingOptions) + : base(null, resourceStore, filterProvider, pageContentParser, parsingOptions) + { + } + + protected override PageInformation ProcessPage(int pageNumber, + DictionaryToken dictionary, + NamedDestinations namedDestinations, + MediaBox mediaBox, + CropBox cropBox, + UserSpaceUnit userSpaceUnit, + PageRotationDegrees rotation, + TransformationMatrix initialMatrix, + IReadOnlyList operations) + { + throw new Exception(); + } + } + + #endregion + + #region SimplePage + + public sealed class SimplePage + { + public int Number { get; } + + public int Rotation { get; } + + public MediaBox MediaBox { get; } + + public SimplePage(int number, int rotation, MediaBox mediaBox) + { + Number = number; + Rotation = rotation; + MediaBox = mediaBox; + } + } + + public sealed class SimplePageFactory : IPageFactory + { + public SimplePageFactory() + { + // do nothing + } + + public SimplePageFactory( + IPdfTokenScanner pdfScanner, + IResourceStore resourceStore, + ILookupFilterProvider filterProvider, + IPageContentParser pageContentParser, + ParsingOptions parsingOptions) + { + // do nothing + } + + public SimplePage Create(int number, + DictionaryToken dictionary, + PageTreeMembers pageTreeMembers, + NamedDestinations namedDestinations) + { + return new SimplePage(number, pageTreeMembers.Rotation, pageTreeMembers.MediaBox); + } + } + + #endregion + + #region PageInformation + + public readonly struct PageInformation + { + public int Number { get; } + + public PageRotationDegrees Rotation { get; } + + public double Width { get; } + + public double Height { get; } + + public UserSpaceUnit UserSpaceUnit { get; } + + public PageInformation(int number, + 
PageRotationDegrees rotation, + double width, + double height, + UserSpaceUnit userSpaceUnit) + { + Number = number; + Rotation = rotation; + Width = width; + Height = height; + UserSpaceUnit = userSpaceUnit; + } + } + + public sealed class PageInformationFactory : BasePageFactory + { + public PageInformationFactory( + IPdfTokenScanner pdfScanner, + IResourceStore resourceStore, + ILookupFilterProvider filterProvider, + IPageContentParser pageContentParser, + ParsingOptions parsingOptions) + : base(pdfScanner, resourceStore, filterProvider, pageContentParser, parsingOptions) + { + } + + protected override PageInformation ProcessPage(int pageNumber, + DictionaryToken dictionary, + NamedDestinations namedDestinations, + MediaBox mediaBox, + CropBox cropBox, + UserSpaceUnit userSpaceUnit, + PageRotationDegrees rotation, + TransformationMatrix initialMatrix, + IReadOnlyList operations) + { + // Same logic as in Page class: + // Special case where cropbox is outside mediabox: use cropbox instead of intersection + var viewBox = mediaBox.Bounds.Intersect(cropBox.Bounds) ?? 
cropBox.Bounds; + + return new PageInformation(pageNumber, rotation, viewBox.Width, viewBox.Height, userSpaceUnit); + } + } + + #endregion + } +} diff --git a/src/UglyToad.PdfPig.Tests/Integration/StreamProcessorTests.cs b/src/UglyToad.PdfPig.Tests/Integration/StreamProcessorTests.cs new file mode 100644 index 00000000..1d5d43c3 --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/StreamProcessorTests.cs @@ -0,0 +1,257 @@ +namespace UglyToad.PdfPig.Tests.Integration +{ + using Content; + using Outline.Destinations; + using PdfFonts; + using PdfPig.Core; + using PdfPig.Filters; + using PdfPig.Geometry; + using PdfPig.Graphics; + using PdfPig.Graphics.Colors; + using PdfPig.Graphics.Operations; + using PdfPig.Parser; + using PdfPig.Tokenization.Scanner; + using PdfPig.Tokens; + using System.Collections.Generic; + using System.Linq; + using Xunit; + + public class StreamProcessorTests + { + [Fact] + public void TextOnly() + { + var file = IntegrationHelpers.GetDocumentPath("cat-genetics"); + + using (var document = PdfDocument.Open(file)) + { + document.AddPageFactory(); + + var page = document.GetPage(1); + var textOnlyPage = document.GetPage(1); + + string expected = string.Concat(page.Letters.Select(l => l.Value)); + Assert.Equal(expected, textOnlyPage.Text); + } + } + + #region AdvancedPage + + public readonly struct TextOnlyPage + { + public int Number { get; } + + public string Text { get; } + + public TextOnlyPage(int number, string text) + { + Number = number; + Text = text; + } + } + + public readonly struct TextOnlyPageContent + { + public IReadOnlyList Letters { get; } + + public TextOnlyPageContent(IReadOnlyList letters) + { + Letters = letters; + } + } + + public class TextOnlyPageInformationFactory : BasePageFactory + { + public TextOnlyPageInformationFactory( + IPdfTokenScanner pdfScanner, + IResourceStore resourceStore, + ILookupFilterProvider filterProvider, + IPageContentParser pageContentParser, + ParsingOptions parsingOptions) + : 
base(pdfScanner, resourceStore, filterProvider, pageContentParser, parsingOptions) + { + } + + protected override TextOnlyPage ProcessPage(int pageNumber, + DictionaryToken dictionary, + NamedDestinations namedDestinations, + MediaBox mediaBox, + CropBox cropBox, + UserSpaceUnit userSpaceUnit, + PageRotationDegrees rotation, + TransformationMatrix initialMatrix, + IReadOnlyList operations) + { + if (operations.Count == 0) + { + return new TextOnlyPage(pageNumber, string.Empty); + } + + var context = new TextOnlyStreamProcessor( + pageNumber, + ResourceStore, + PdfScanner, + PageContentParser, + FilterProvider, + cropBox, + userSpaceUnit, + rotation, + initialMatrix, + ParsingOptions); + + TextOnlyPageContent content = context.Process(pageNumber, operations); + + return new TextOnlyPage(pageNumber, string.Concat(content.Letters)); + } + } + + public sealed class TextOnlyStreamProcessor : BaseStreamProcessor + { + private readonly List _letters = new List(); + + public TextOnlyStreamProcessor(int pageNumber, + IResourceStore resourceStore, + IPdfTokenScanner pdfScanner, + IPageContentParser pageContentParser, + ILookupFilterProvider filterProvider, + CropBox cropBox, + UserSpaceUnit userSpaceUnit, + PageRotationDegrees rotation, + TransformationMatrix initialMatrix, + ParsingOptions parsingOptions) + : base(pageNumber, + resourceStore, + pdfScanner, + pageContentParser, + filterProvider, + cropBox, + userSpaceUnit, + rotation, + initialMatrix, + parsingOptions) + { + } + + public override TextOnlyPageContent Process(int pageNumberCurrent, + IReadOnlyList operations) + { + CloneAllStates(); + + ProcessOperations(operations); + + return new TextOnlyPageContent(_letters); + } + + public override void RenderGlyph(IFont font, + IColor strokingColor, + IColor nonStrokingColor, + TextRenderingMode textRenderingMode, + double fontSize, + double pointSize, + int code, + string unicode, + long currentOffset, + TransformationMatrix renderingMatrix, + TransformationMatrix 
textMatrix, + TransformationMatrix transformationMatrix, + CharacterBoundingBox characterBoundingBox) + { + _letters.Add(unicode); + } + + protected override void RenderXObjectImage(XObjectContentRecord xObjectContentRecord) + { + // No op + } + + public override void BeginSubpath() + { + // No op + } + + public override PdfPoint? CloseSubpath() + { + return new PdfPoint(); + } + + public override void StrokePath(bool close) + { + // No op + } + + public override void FillPath(FillingRule fillingRule, bool close) + { + // No op + } + + public override void FillStrokePath(FillingRule fillingRule, bool close) + { + // No op + } + + public override void MoveTo(double x, double y) + { + // No op + } + + public override void BezierCurveTo(double x1, double y1, double x2, double y2, double x3, double y3) + { + // No op + } + + public override void LineTo(double x, double y) + { + // No op + } + + public override void Rectangle(double x, double y, double width, double height) + { + // No op + } + + public override void EndPath() + { + // No op + } + + public override void ClosePath() + { + // No op + } + + public override void BeginMarkedContent(NameToken name, + NameToken propertyDictionaryName, + DictionaryToken properties) + { + // No op + } + + public override void EndMarkedContent() + { + // No op + } + + public override void ModifyClippingIntersect(FillingRule clippingRule) + { + // No op + } + + public override void PaintShading(NameToken shadingName) + { + // No op + } + + protected override void RenderInlineImage(InlineImage inlineImage) + { + // No op + } + + public override void BezierCurveTo(double x2, double y2, double x3, double y3) + { + // No op + } + } + + #endregion + } +} diff --git a/src/UglyToad.PdfPig/Content/Pages.cs b/src/UglyToad.PdfPig/Content/Pages.cs index e07c00b4..85b0efaa 100644 --- a/src/UglyToad.PdfPig/Content/Pages.cs +++ b/src/UglyToad.PdfPig/Content/Pages.cs @@ -3,14 +3,17 @@ using System; using System.Collections.Generic; using Core; + 
using Filters; using Outline.Destinations; + using Parser; using Tokenization.Scanner; using Tokens; using Util; - internal class Pages + internal sealed class Pages : IDisposable { - private readonly IPageFactory pageFactory; + private readonly Dictionary pageFactoryCache; + private readonly PageFactory defaultPageFactory; private readonly IPdfTokenScanner pdfScanner; private readonly Dictionary pagesByNumber; @@ -21,15 +24,38 @@ /// public PageTreeNode PageTree { get; } - internal Pages(IPageFactory pageFactory, IPdfTokenScanner pdfScanner, PageTreeNode pageTree, Dictionary pagesByNumber) + internal Pages(IPageFactory pageFactory, + IPdfTokenScanner pdfScanner, + PageTreeNode pageTree, + Dictionary pagesByNumber) { - this.pageFactory = pageFactory ?? throw new ArgumentNullException(nameof(pageFactory)); + pageFactoryCache = new Dictionary(); + + defaultPageFactory = (PageFactory)pageFactory ?? throw new ArgumentNullException(nameof(pageFactory)); this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner)); this.pagesByNumber = pagesByNumber; PageTree = pageTree; + + AddPageFactory(defaultPageFactory); } - internal Page GetPage(int pageNumber, NamedDestinations namedDestinations, ParsingOptions parsingOptions) + internal Page GetPage(int pageNumber, NamedDestinations namedDestinations, ParsingOptions parsingOptions) => + GetPage(defaultPageFactory, pageNumber, namedDestinations, parsingOptions); + + internal TPage GetPage(int pageNumber, NamedDestinations namedDestinations, ParsingOptions parsingOptions) + { + if (pageFactoryCache.TryGetValue(typeof(TPage), out var f) && f is IPageFactory pageFactory) + { + return GetPage(pageFactory, pageNumber, namedDestinations, parsingOptions); + } + + throw new InvalidOperationException($"Could not find page factory of type '{typeof(IPageFactory)}' for page type {typeof(TPage)}."); + } + + private TPage GetPage(IPageFactory pageFactory, + int pageNumber, + NamedDestinations namedDestinations, + 
ParsingOptions parsingOptions) { if (pageNumber <= 0 || pageNumber > Count) { @@ -80,6 +106,53 @@ return page; } + internal void AddPageFactory(IPageFactory pageFactory) + { + Type type = typeof(TPage); + if (pageFactoryCache.ContainsKey(type)) + { + throw new InvalidOperationException($"Could not add page factory for page type '{type}' as it was already added."); + } + + pageFactoryCache.Add(type, pageFactory); + } + + internal void AddPageFactory() where TPageFactory : IPageFactory + { + var constructor = typeof(TPageFactory).GetConstructor(new[] + { + typeof(IPdfTokenScanner), + typeof(IResourceStore), + typeof(ILookupFilterProvider), + typeof(IPageContentParser), + typeof(ParsingOptions) + }); + + if (constructor == null) + { + throw new InvalidOperationException($"Could not find valid constructor for page factory of type '{typeof(TPageFactory)}'. " + + "The page factory should have a constructor with the following parameters: " + + $"{typeof(IPdfTokenScanner)}, {typeof(IResourceStore)}, {typeof(ILookupFilterProvider)}, {typeof(IPageContentParser)}, {typeof(ParsingOptions)}."); + } + + var instance = constructor.Invoke(new object[] + { + defaultPageFactory.PdfScanner, + defaultPageFactory.ResourceStore, + defaultPageFactory.FilterProvider, + defaultPageFactory.PageContentParser, + defaultPageFactory.ParsingOptions + }); + + if (instance is not IPageFactory pageFactory) + { + throw new InvalidOperationException( + $"Something wrong happened while creating page factory of type '{typeof(TPageFactory)}' for page type '{typeof(TPage)}'."); + } + + AddPageFactory(pageFactory); + } + internal PageTreeNode GetPageNode(int pageNumber) { if (!pagesByNumber.TryGetValue(pageNumber, out var node)) @@ -102,5 +175,19 @@ return null; } + + public void Dispose() + { + // Dispose every factory first and clear afterwards: removing entries while enumerating Keys throws on .NET Framework / pre-.NET Core 3.0. + foreach (var factory in pageFactoryCache.Values) + { + if (factory is IDisposable disposable) + { + disposable.Dispose(); + } + } + + pageFactoryCache.Clear(); + } } } diff 
--git a/src/UglyToad.PdfPig/PdfDocument.cs b/src/UglyToad.PdfPig/PdfDocument.cs index d8a4d36c..27cc86b0 100644 --- a/src/UglyToad.PdfPig/PdfDocument.cs +++ b/src/UglyToad.PdfPig/PdfDocument.cs @@ -112,7 +112,7 @@ /// Optional parameters controlling parsing. /// A providing access to the file contents. public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options); - + /// /// Opens a file and creates a for reading from the provided file path. /// @@ -133,6 +133,26 @@ /// A providing access to the file contents. public static PdfDocument Open(Stream stream, ParsingOptions options = null) => PdfDocumentFactory.Open(stream, options); + /// <summary> + /// Registers the page factory used by <see cref="GetPage{TPage}(int)"/> to build pages of type <typeparamref name="TPage"/>; throws <see cref="InvalidOperationException"/> if a factory for that page type was already added. + /// </summary> + /// <typeparam name="TPage">The type of page produced by the factory.</typeparam> + /// <param name="pageFactory">The page factory instance to register for <typeparamref name="TPage"/>.</param> + public void AddPageFactory<TPage>(IPageFactory<TPage> pageFactory) + { + pages.AddPageFactory(pageFactory); + } + + /// <summary> + /// Creates a page factory of type <typeparamref name="TPageFactory"/> and registers it for pages of type <typeparamref name="TPage"/>; throws <see cref="InvalidOperationException"/> if no suitable constructor exists or a factory for that page type was already added. + /// </summary> + /// <typeparam name="TPage">The type of page produced by the factory.</typeparam> + /// <typeparam name="TPageFactory">The page factory type; it must have a public constructor taking (IPdfTokenScanner, IResourceStore, ILookupFilterProvider, IPageContentParser, ParsingOptions).</typeparam> + public void AddPageFactory<TPage, TPageFactory>() where TPageFactory : IPageFactory<TPage> + { + pages.AddPageFactory<TPage, TPageFactory>(); + } + /// /// Get the page with the specified page number (1 indexed). /// @@ -162,6 +182,36 @@ } } + /// <summary> + /// Get the page with the specified page number (1 indexed), using the specified page factory. + /// </summary> + /// <typeparam name="TPage">The type of page to create; a page factory for this type must have been added beforehand.</typeparam> + /// <param name="pageNumber">The number of the page to return, this starts from 1.</param> + /// <returns>The page.</returns> + public TPage GetPage<TPage>(int pageNumber) + { + if (isDisposed) + { + throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access page after the document is disposed."); + } + + parsingOptions.Logger.Debug($"Accessing page {pageNumber}."); + + try + { + return pages.GetPage<TPage>(pageNumber, namedDestinations, parsingOptions); + } + catch (Exception ex) + { + if (IsEncrypted) + { + throw new PdfDocumentEncryptedException("Document was encrypted which may have caused error when retrieving page.", encryptionDictionary, ex); + } + + throw; + } + } + /// /// Gets all pages in this document in order. /// @@ -173,6 +223,17 @@ } } + /// + /// Gets all pages in this document in order, using the specified page factory. 
+ /// + public IEnumerable GetPages() + { + for (var i = 0; i < NumberOfPages; i++) + { + yield return GetPage(i + 1); + } + } + /// /// Get the document level metadata if present. /// The metadata is XML in the (Extensible Metadata Platform) XMP format. @@ -247,6 +308,7 @@ Advanced.Dispose(); pdfScanner.Dispose(); inputBytes.Dispose(); + pages.Dispose(); } catch (Exception ex) {