diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index efae5960..1e15fd96 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -74,13 +74,14 @@ "UglyToad.PdfPig.Annotations.AppearanceStream", "UglyToad.PdfPig.Annotations.QuadPointsQuadrilateral", "UglyToad.PdfPig.Content.ArtifactMarkedContentElement", + "UglyToad.PdfPig.Content.BasePageFactory`1", "UglyToad.PdfPig.Content.Catalog", "UglyToad.PdfPig.Content.CropBox", "UglyToad.PdfPig.Content.DocumentInformation", "UglyToad.PdfPig.Content.EmbeddedFile", "UglyToad.PdfPig.Content.Hyperlink", "UglyToad.PdfPig.Content.InlineImage", - "UglyToad.PdfPig.Content.IPageFactory", + "UglyToad.PdfPig.Content.IPageFactory`1", "UglyToad.PdfPig.Content.IPdfImage", "UglyToad.PdfPig.Content.IResourceStore", "UglyToad.PdfPig.Content.Letter", diff --git a/src/UglyToad.PdfPig/Content/BasePageFactory.cs b/src/UglyToad.PdfPig/Content/BasePageFactory.cs new file mode 100644 index 00000000..319f435f --- /dev/null +++ b/src/UglyToad.PdfPig/Content/BasePageFactory.cs @@ -0,0 +1,332 @@ +namespace UglyToad.PdfPig.Content +{ + using System; + using System.Collections.Generic; + using Core; + using Filters; + using Geometry; + using Graphics; + using Graphics.Operations; + using Outline.Destinations; + using Parser; + using Parser.Parts; + using Tokenization.Scanner; + using Tokens; + using Util; + + /// <summary> + /// Page factory abstract class. + /// </summary> + /// <typeparam name="TPage">The type of page the page factory creates.</typeparam> + public abstract class BasePageFactory<TPage> : IPageFactory<TPage> + { + /// <summary> + /// The parsing options. + /// </summary> + public readonly ParsingOptions ParsingOptions; + + /// <summary> + /// The Pdf token scanner. + /// </summary> + public readonly IPdfTokenScanner PdfScanner; + + /// <summary> + /// The resource store. + /// </summary> + public readonly IResourceStore ResourceStore; + + /// <summary> + /// The filter provider. 
+ /// </summary> + public readonly ILookupFilterProvider FilterProvider; + + /// <summary> + /// The page content parser. + /// </summary> + public readonly IPageContentParser PageContentParser; + + /// <summary> + /// Create a <see cref="BasePageFactory{TPage}"/>. + /// </summary> + protected BasePageFactory( + IPdfTokenScanner pdfScanner, + IResourceStore resourceStore, + ILookupFilterProvider filterProvider, + IPageContentParser pageContentParser, + ParsingOptions parsingOptions) + { + this.ResourceStore = resourceStore; + this.FilterProvider = filterProvider; + this.PageContentParser = pageContentParser; + this.PdfScanner = pdfScanner; + this.ParsingOptions = parsingOptions; + } + + /// <inheritdoc/> + public TPage Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, + NamedDestinations namedDestinations) + { + if (dictionary == null) + { + throw new ArgumentNullException(nameof(dictionary)); + } + + var type = dictionary.GetNameOrDefault(NameToken.Type); + + if (type != null && !type.Equals(NameToken.Page)) + { + ParsingOptions.Logger.Error($"Page {number} had its type specified as {type} rather than 'Page'."); + } + + var rotation = new PageRotationDegrees(pageTreeMembers.Rotation); + if (dictionary.TryGet(NameToken.Rotate, PdfScanner, out NumericToken rotateToken)) + { + rotation = new PageRotationDegrees(rotateToken.Int); + } + + var stackDepth = 0; + + // Every dictionary loaded below is counted in stackDepth; the finally block unloads them even when page processing throws. + try + { + while (pageTreeMembers.ParentResources.Count > 0) + { + var resource = pageTreeMembers.ParentResources.Dequeue(); + + ResourceStore.LoadResourceDictionary(resource); + stackDepth++; + } + + if (dictionary.TryGet(NameToken.Resources, PdfScanner, out DictionaryToken resources)) + { + ResourceStore.LoadResourceDictionary(resources); + stackDepth++; + } + + UserSpaceUnit userSpaceUnit = GetUserSpaceUnits(dictionary); + + MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers); + CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox); + + var initialMatrix = OperationContextHelper.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, 
ParsingOptions.Logger); + + ApplyTransformNormalise(initialMatrix, ref mediaBox, ref cropBox); + + TPage page; + + if (!dictionary.TryGet(NameToken.Contents, out var contents)) + { + // ignored for now, is it possible? check the spec... + page = ProcessPageInternal(number, dictionary, namedDestinations, mediaBox, cropBox, userSpaceUnit, rotation, initialMatrix, null); + } + else if (DirectObjectFinder.TryGet<ArrayToken>(contents, PdfScanner, out var array)) + { + var bytes = new List<byte>(); + + for (var i = 0; i < array.Data.Count; i++) + { + var item = array.Data[i]; + + if (!(item is IndirectReferenceToken obj)) + { + throw new PdfDocumentFormatException($"The contents contained something which was not an indirect reference: {item}."); + } + + var contentStream = DirectObjectFinder.Get<StreamToken>(obj, PdfScanner); + + if (contentStream == null) + { + throw new InvalidOperationException($"Could not find the contents for object {obj}."); + } + + bytes.AddRange(contentStream.Decode(FilterProvider, PdfScanner)); + + if (i < array.Data.Count - 1) + { + bytes.Add((byte)'\n'); + } + } + + page = ProcessPageInternal(number, dictionary, namedDestinations, mediaBox, cropBox, userSpaceUnit, rotation, initialMatrix, bytes); + } + else + { + var contentStream = DirectObjectFinder.Get<StreamToken>(contents, PdfScanner); + + if (contentStream == null) + { + throw new InvalidOperationException("Failed to parse the content for the page: " + number); + } + + var bytes = contentStream.Decode(FilterProvider, PdfScanner); + + page = ProcessPageInternal(number, dictionary, namedDestinations, mediaBox, cropBox, userSpaceUnit, rotation, initialMatrix, bytes); + } + + return page; + } + finally + { + for (var i = 0; i < stackDepth; i++) + { + ResourceStore.UnloadResourceDictionary(); + } + } + } + + private TPage ProcessPageInternal( + int pageNumber, + DictionaryToken dictionary, + NamedDestinations namedDestinations, + MediaBox mediaBox, + CropBox cropBox, + UserSpaceUnit userSpaceUnit, + PageRotationDegrees rotation, + TransformationMatrix 
initialMatrix, + IReadOnlyList<byte> contentBytes) + { + IReadOnlyList<IGraphicsStateOperation> operations; + + if (contentBytes == null || contentBytes.Count == 0) + { + operations = EmptyArray<IGraphicsStateOperation>.Instance; + } + else + { + operations = PageContentParser.Parse(pageNumber, + new ByteArrayInputBytes(contentBytes), + ParsingOptions.Logger); + } + + return ProcessPage(pageNumber, + dictionary, + namedDestinations, + mediaBox, + cropBox, + userSpaceUnit, + rotation, + initialMatrix, + operations); + } + + /// <summary> + /// Process a page with content. + /// </summary> + /// <param name="pageNumber">The page number, starts at 1.</param> + /// <param name="dictionary">The page dictionary passed to Create.</param> + /// <param name="namedDestinations">The named destinations passed to Create.</param> + /// <param name="mediaBox">The page media box.</param> + /// <param name="cropBox">The page effective crop box, computed as the intersection of the initial crop box and the media box.</param> + /// <param name="userSpaceUnit">The user space unit returned by GetUserSpaceUnits for the page dictionary.</param> + /// <param name="rotation">The page rotation.</param> + /// <param name="initialMatrix">The initial matrix computed from the user space unit, media box, crop box and rotation.</param> + /// <param name="operations">The page operations. Can be empty if the page has no content.</param> + protected abstract TPage ProcessPage( + int pageNumber, + DictionaryToken dictionary, + NamedDestinations namedDestinations, + MediaBox mediaBox, + CropBox cropBox, + UserSpaceUnit userSpaceUnit, + PageRotationDegrees rotation, + TransformationMatrix initialMatrix, + IReadOnlyList<IGraphicsStateOperation> operations); + + /// <summary> + /// Get the user space units. + /// </summary> + protected static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary) + { + if (dictionary.TryGet(NameToken.UserUnit, out var userUnitBase) && userUnitBase is NumericToken userUnitNumber) + { + return new UserSpaceUnit(userUnitNumber.Int); + } + + return UserSpaceUnit.Default; + } + + /// <summary> + /// Get the crop box. + /// </summary> + protected CropBox GetCropBox(DictionaryToken dictionary, PageTreeMembers pageTreeMembers, MediaBox mediaBox) + { + CropBox cropBox; + if (dictionary.TryGet(NameToken.CropBox, out var cropBoxObject) && + DirectObjectFinder.TryGet(cropBoxObject, PdfScanner, out ArrayToken cropBoxArray)) + { + if (cropBoxArray.Length != 4) + { + ParsingOptions.Logger.Error( + $"The CropBox was the wrong length in the dictionary: {dictionary}. Array was: {cropBoxArray}. 
Using MediaBox."); + + cropBox = new CropBox(mediaBox.Bounds); + + return cropBox; + } + + cropBox = new CropBox(cropBoxArray.ToRectangle(PdfScanner)); + } + else + { + cropBox = pageTreeMembers.GetCropBox() ?? new CropBox(mediaBox.Bounds); + } + + return cropBox; + } + + /// <summary> + /// Get the media box. + /// </summary> + protected MediaBox GetMediaBox(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers) + { + MediaBox mediaBox; + if (dictionary.TryGet(NameToken.MediaBox, out var mediaBoxObject) + && DirectObjectFinder.TryGet(mediaBoxObject, PdfScanner, out ArrayToken mediaBoxArray)) + { + if (mediaBoxArray.Length != 4) + { + ParsingOptions.Logger.Error( + $"The MediaBox was the wrong length in the dictionary: {dictionary}. Array was: {mediaBoxArray}. Defaulting to US Letter."); + + mediaBox = MediaBox.Letter; + + return mediaBox; + } + + mediaBox = new MediaBox(mediaBoxArray.ToRectangle(PdfScanner)); + } + else + { + mediaBox = pageTreeMembers.MediaBox; + + if (mediaBox == null) + { + ParsingOptions.Logger.Error( + $"The MediaBox was missing for page {number}. Using US Letter."); + + // PDFBox defaults to US Letter. + mediaBox = MediaBox.Letter; + } + } + + return mediaBox; + } + + /// <summary> + /// Apply the matrix transform to the media box and crop box. + /// Then Normalise() in order to obtain rectangles with rotation=0 + /// and width and height as viewed on screen. 
+ /// </summary> + /// <param name="transformationMatrix"></param> + /// <param name="mediaBox"></param> + /// <param name="cropBox"></param> + protected static void ApplyTransformNormalise(TransformationMatrix transformationMatrix, ref MediaBox mediaBox, ref CropBox cropBox) + { + if (transformationMatrix != TransformationMatrix.Identity) + { + mediaBox = new MediaBox(transformationMatrix.Transform(mediaBox.Bounds).Normalise()); + cropBox = new CropBox(transformationMatrix.Transform(cropBox.Bounds).Normalise()); + } + } + } +} diff --git a/src/UglyToad.PdfPig/Content/IPageFactory.cs b/src/UglyToad.PdfPig/Content/IPageFactory.cs index 2e87112c..39f5338d 100644 --- a/src/UglyToad.PdfPig/Content/IPageFactory.cs +++ b/src/UglyToad.PdfPig/Content/IPageFactory.cs @@ -6,14 +6,15 @@ /// <summary> /// Page factory interface. /// </summary> - public interface IPageFactory + /// <typeparam name="TPage">The type of page the page factory creates.</typeparam> + public interface IPageFactory<TPage> { /// <summary> /// Create the page. /// </summary> - Page Create(int number, + TPage Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, - NamedDestinations annotationProvider); + NamedDestinations namedDestinations); } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Content/Pages.cs b/src/UglyToad.PdfPig/Content/Pages.cs index 2f661b66..e07c00b4 100644 --- a/src/UglyToad.PdfPig/Content/Pages.cs +++ b/src/UglyToad.PdfPig/Content/Pages.cs @@ -1,18 +1,19 @@ namespace UglyToad.PdfPig.Content { - using Core; - using Outline.Destinations; using System; using System.Collections.Generic; + using Core; + using Outline.Destinations; using Tokenization.Scanner; using Tokens; using Util; internal class Pages { - private readonly IPageFactory pageFactory; + private readonly IPageFactory<Page> pageFactory; private readonly IPdfTokenScanner pdfScanner; private readonly Dictionary pagesByNumber; + public int Count => pagesByNumber.Count; /// <summary> @@ -20,7 +21,7 @@ /// </summary> public PageTreeNode PageTree { get; } - internal Pages(IPageFactory pageFactory, IPdfTokenScanner pdfScanner, PageTreeNode pageTree, Dictionary pagesByNumber) + internal Pages(IPageFactory<Page> pageFactory, 
IPdfTokenScanner pdfScanner, PageTreeNode pageTree, Dictionary pagesByNumber) { this.pageFactory = pageFactory ?? throw new ArgumentNullException(nameof(pageFactory)); this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner)); @@ -34,7 +35,7 @@ { parsingOptions.Logger.Error($"Page {pageNumber} requested but is out of range."); - throw new ArgumentOutOfRangeException(nameof(pageNumber), + throw new ArgumentOutOfRangeException(nameof(pageNumber), $"Page number {pageNumber} invalid, must be between 1 and {Count}."); } @@ -49,7 +50,7 @@ } var pageTreeMembers = new PageTreeMembers(); - + while (pageStack.Count > 0) { currentNode = pageStack.Pop(); @@ -58,7 +59,7 @@ { pageTreeMembers.ParentResources.Enqueue(resourcesDictionary); } - + if (currentNode.NodeDictionary.TryGet(NameToken.MediaBox, pdfScanner, out ArrayToken mediaBox)) { pageTreeMembers.MediaBox = new MediaBox(mediaBox.ToRectangle(pdfScanner)); diff --git a/src/UglyToad.PdfPig/Content/PagesFactory.cs b/src/UglyToad.PdfPig/Content/PagesFactory.cs index d6de06aa..9c129b35 100644 --- a/src/UglyToad.PdfPig/Content/PagesFactory.cs +++ b/src/UglyToad.PdfPig/Content/PagesFactory.cs @@ -10,9 +10,9 @@ using Tokens; using Util; - internal class PagesFactory + internal static class PagesFactory { - private class PageCounter + private sealed class PageCounter { public int PageCount { get; private set; } public void Increment() @@ -21,7 +21,7 @@ } } - public static Pages Create(IndirectReference pagesReference, DictionaryToken pagesDictionary, IPdfTokenScanner scanner, IPageFactory pageFactory, ILog log, bool isLenientParsing) + public static Pages Create(IndirectReference pagesReference, DictionaryToken pagesDictionary, IPdfTokenScanner scanner, IPageFactory pageFactory, ILog log, bool isLenientParsing) { var pageNumber = new PageCounter(); diff --git a/src/UglyToad.PdfPig/Parser/PageFactory.cs b/src/UglyToad.PdfPig/Parser/PageFactory.cs index 77d98d94..de8c20a8 100644 --- 
a/src/UglyToad.PdfPig/Parser/PageFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PageFactory.cs @@ -1,6 +1,5 @@ namespace UglyToad.PdfPig.Parser { - using System; using System.Collections.Generic; using Annotations; using Content; @@ -9,267 +8,81 @@ using Geometry; using Graphics; using Graphics.Operations; - using Logging; using Outline.Destinations; - using Parts; using Tokenization.Scanner; using Tokens; - using Util; - internal class PageFactory : IPageFactory + internal class PageFactory : BasePageFactory { - private readonly ParsingOptions parsingOptions; - private readonly IPdfTokenScanner pdfScanner; - private readonly IResourceStore resourceStore; - private readonly ILookupFilterProvider filterProvider; - private readonly IPageContentParser pageContentParser; - public PageFactory( IPdfTokenScanner pdfScanner, IResourceStore resourceStore, ILookupFilterProvider filterProvider, IPageContentParser pageContentParser, ParsingOptions parsingOptions) + : base(pdfScanner, resourceStore, filterProvider, pageContentParser, parsingOptions) { - this.resourceStore = resourceStore; - this.filterProvider = filterProvider; - this.pageContentParser = pageContentParser; - this.pdfScanner = pdfScanner; - this.parsingOptions = parsingOptions; } - public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, - NamedDestinations namedDestinations) - { - if (dictionary == null) - { - throw new ArgumentNullException(nameof(dictionary)); - } - - var type = dictionary.GetNameOrDefault(NameToken.Type); - - if (type != null && !type.Equals(NameToken.Page)) - { - parsingOptions.Logger.Error($"Page {number} had its type specified as {type} rather than 'Page'."); - } - - var rotation = new PageRotationDegrees(pageTreeMembers.Rotation); - if (dictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken)) - { - rotation = new PageRotationDegrees(rotateToken.Int); - } - - var stackDepth = 0; - - while (pageTreeMembers.ParentResources.Count > 0) - { 
- var resource = pageTreeMembers.ParentResources.Dequeue(); - - resourceStore.LoadResourceDictionary(resource); - stackDepth++; - } - - if (dictionary.TryGet(NameToken.Resources, pdfScanner, out DictionaryToken resources)) - { - resourceStore.LoadResourceDictionary(resources); - stackDepth++; - } - - UserSpaceUnit userSpaceUnit = GetUserSpaceUnits(dictionary); - - MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers); - CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox); - - var initialMatrix = OperationContextHelper.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, parsingOptions.Logger); - - ApplyTransformNormalise(initialMatrix, ref mediaBox, ref cropBox); - - PageContent content; - - if (!dictionary.TryGet(NameToken.Contents, out var contents)) - { - content = new PageContent(EmptyArray.Instance, - EmptyArray.Instance, - EmptyArray.Instance, - EmptyArray>.Instance, - EmptyArray.Instance, - pdfScanner, - filterProvider, - resourceStore); - // ignored for now, is it possible? check the spec... 
- } - else if (DirectObjectFinder.TryGet(contents, pdfScanner, out var array)) - { - var bytes = new List(); - - for (var i = 0; i < array.Data.Count; i++) - { - var item = array.Data[i]; - - if (!(item is IndirectReferenceToken obj)) - { - throw new PdfDocumentFormatException($"The contents contained something which was not an indirect reference: {item}."); - } - - var contentStream = DirectObjectFinder.Get(obj, pdfScanner); - - if (contentStream == null) - { - throw new InvalidOperationException($"Could not find the contents for object {obj}."); - } - - bytes.AddRange(contentStream.Decode(filterProvider, pdfScanner)); - - if (i < array.Data.Count - 1) - { - bytes.Add((byte)'\n'); - } - } - - content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, initialMatrix, parsingOptions); - } - else - { - var contentStream = DirectObjectFinder.Get(contents, pdfScanner); - - if (contentStream == null) - { - throw new InvalidOperationException("Failed to parse the content for the page: " + number); - } - - var bytes = contentStream.Decode(filterProvider, pdfScanner); - - content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, initialMatrix, parsingOptions); - } - - var annotationProvider = new AnnotationProvider(pdfScanner, dictionary, initialMatrix, namedDestinations, parsingOptions.Logger); - var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, annotationProvider, pdfScanner); - - for (var i = 0; i < stackDepth; i++) - { - resourceStore.UnloadResourceDictionary(); - } - - return page; - } - - private PageContent GetContent( - int pageNumber, - IReadOnlyList contentBytes, + protected override Page ProcessPage(int pageNumber, + DictionaryToken dictionary, + NamedDestinations namedDestinations, + MediaBox mediaBox, CropBox cropBox, UserSpaceUnit userSpaceUnit, PageRotationDegrees rotation, TransformationMatrix initialMatrix, - ParsingOptions parsingOptions) + IReadOnlyList operations) { - var operations = 
pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes), - parsingOptions.Logger); + var annotationProvider = new AnnotationProvider(PdfScanner, + dictionary, + initialMatrix, + namedDestinations, + ParsingOptions.Logger); + + if (operations == null || operations.Count == 0) + { + PageContent emptyContent = new PageContent(EmptyArray.Instance, + EmptyArray.Instance, + EmptyArray.Instance, + EmptyArray>.Instance, + EmptyArray.Instance, + PdfScanner, + FilterProvider, + ResourceStore); + + return new Page(pageNumber, + dictionary, + mediaBox, + cropBox, + rotation, + emptyContent, + annotationProvider, + PdfScanner); + } var context = new ContentStreamProcessor( pageNumber, - resourceStore, - pdfScanner, - pageContentParser, - filterProvider, + ResourceStore, + PdfScanner, + PageContentParser, + FilterProvider, cropBox, userSpaceUnit, rotation, initialMatrix, - parsingOptions); + ParsingOptions); - return context.Process(pageNumber, operations); - } + PageContent content = context.Process(pageNumber, operations); - private static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary) - { - var spaceUnits = UserSpaceUnit.Default; - if (dictionary.TryGet(NameToken.UserUnit, out var userUnitBase) && userUnitBase is NumericToken userUnitNumber) - { - spaceUnits = new UserSpaceUnit(userUnitNumber.Int); - } - - return spaceUnits; - } - - private CropBox GetCropBox( - DictionaryToken dictionary, - PageTreeMembers pageTreeMembers, - MediaBox mediaBox) - { - CropBox cropBox; - if (dictionary.TryGet(NameToken.CropBox, out var cropBoxObject) && - DirectObjectFinder.TryGet(cropBoxObject, pdfScanner, out ArrayToken cropBoxArray)) - { - if (cropBoxArray.Length != 4) - { - parsingOptions.Logger.Error($"The CropBox was the wrong length in the dictionary: {dictionary}. Array was: {cropBoxArray}. 
Using MediaBox."); - - cropBox = new CropBox(mediaBox.Bounds); - - return cropBox; - } - - cropBox = new CropBox(cropBoxArray.ToRectangle(pdfScanner)); - } - else - { - cropBox = pageTreeMembers.GetCropBox() ?? new CropBox(mediaBox.Bounds); - } - - return cropBox; - } - - private MediaBox GetMediaBox( - int number, - DictionaryToken dictionary, - PageTreeMembers pageTreeMembers) - { - MediaBox mediaBox; - if (dictionary.TryGet(NameToken.MediaBox, out var mediaBoxObject) - && DirectObjectFinder.TryGet(mediaBoxObject, pdfScanner, out ArrayToken mediaBoxArray)) - { - if (mediaBoxArray.Length != 4) - { - parsingOptions.Logger.Error($"The MediaBox was the wrong length in the dictionary: {dictionary}. Array was: {mediaBoxArray}. Defaulting to US Letter."); - - mediaBox = MediaBox.Letter; - - return mediaBox; - } - - mediaBox = new MediaBox(mediaBoxArray.ToRectangle(pdfScanner)); - } - else - { - mediaBox = pageTreeMembers.MediaBox; - - if (mediaBox == null) - { - parsingOptions.Logger.Error($"The MediaBox was the wrong missing for page {number}. Using US Letter."); - - // PDFBox defaults to US Letter. - mediaBox = MediaBox.Letter; - } - } - - return mediaBox; - } - - /// - /// Apply the matrix transform to the media box and crop box. - /// Then Normalise() in order to obtain rectangles with rotation=0 - /// and width and height as viewed on screen. - /// - /// - /// - /// - private static void ApplyTransformNormalise(TransformationMatrix transformationMatrix, ref MediaBox mediaBox, ref CropBox cropBox) - { - if (transformationMatrix != TransformationMatrix.Identity) - { - mediaBox = new MediaBox(transformationMatrix.Transform(mediaBox.Bounds).Normalise()); - cropBox = new CropBox(transformationMatrix.Transform(cropBox.Bounds).Normalise()); - } + return new Page(pageNumber, + dictionary, + mediaBox, + cropBox, + rotation, + content, + annotationProvider, + PdfScanner); } } }