Abstract away PageFactory main logic into BasePageFactory

This commit is contained in:
BobLd
2023-11-02 08:04:14 +00:00
parent 3d655c1fa4
commit 3fbf8aaa6c
6 changed files with 390 additions and 248 deletions

View File

@@ -74,13 +74,14 @@
"UglyToad.PdfPig.Annotations.AppearanceStream",
"UglyToad.PdfPig.Annotations.QuadPointsQuadrilateral",
"UglyToad.PdfPig.Content.ArtifactMarkedContentElement",
"UglyToad.PdfPig.Content.BasePageFactory`1",
"UglyToad.PdfPig.Content.Catalog",
"UglyToad.PdfPig.Content.CropBox",
"UglyToad.PdfPig.Content.DocumentInformation",
"UglyToad.PdfPig.Content.EmbeddedFile",
"UglyToad.PdfPig.Content.Hyperlink",
"UglyToad.PdfPig.Content.InlineImage",
"UglyToad.PdfPig.Content.IPageFactory",
"UglyToad.PdfPig.Content.IPageFactory`1",
"UglyToad.PdfPig.Content.IPdfImage",
"UglyToad.PdfPig.Content.IResourceStore",
"UglyToad.PdfPig.Content.Letter",

View File

@@ -0,0 +1,326 @@
namespace UglyToad.PdfPig.Content
{
using System;
using System.Collections.Generic;
using Core;
using Filters;
using Geometry;
using Graphics;
using Graphics.Operations;
using Outline.Destinations;
using Parser;
using Parser.Parts;
using Tokenization.Scanner;
using Tokens;
using Util;
/// <summary>
/// Page factory abstract class.
/// </summary>
/// <typeparam name="TPage">The type of page the page factory creates.</typeparam>
public abstract class BasePageFactory<TPage> : IPageFactory<TPage>
{
/// <summary>
/// The parsing options. Within this class its <c>Logger</c> is used to report recoverable
/// problems (unexpected page type, malformed or missing MediaBox/CropBox) and is handed to
/// the content parser and the initial matrix computation.
/// </summary>
public readonly ParsingOptions ParsingOptions;
/// <summary>
/// The Pdf token scanner. Used by this class to resolve page dictionary entries
/// (Rotate, Resources, MediaBox, CropBox, Contents) and to decode content streams.
/// </summary>
public readonly IPdfTokenScanner PdfScanner;
/// <summary>
/// The resource store. Resource dictionaries inherited from the page tree and the page's own
/// resource dictionary are loaded into it while a page is created, and unloaded afterwards.
/// </summary>
public readonly IResourceStore ResourceStore;
/// <summary>
/// The filter provider. Passed to the content streams when they are decoded.
/// </summary>
public readonly ILookupFilterProvider FilterProvider;
/// <summary>
/// The page content parser. Turns the decoded content bytes of a page into the list of
/// graphics state operations handed to <c>ProcessPage</c>.
/// </summary>
public readonly IPageContentParser PageContentParser;
/// <summary>
/// Initialises the shared dependencies of a <see cref="BasePageFactory{TPage}"/>.
/// </summary>
/// <param name="pdfScanner">The Pdf token scanner, stored in <see cref="PdfScanner"/>.</param>
/// <param name="resourceStore">The resource store, stored in <see cref="ResourceStore"/>.</param>
/// <param name="filterProvider">The filter provider, stored in <see cref="FilterProvider"/>.</param>
/// <param name="pageContentParser">The page content parser, stored in <see cref="PageContentParser"/>.</param>
/// <param name="parsingOptions">The parsing options, stored in <see cref="ParsingOptions"/>.</param>
protected BasePageFactory(
    IPdfTokenScanner pdfScanner,
    IResourceStore resourceStore,
    ILookupFilterProvider filterProvider,
    IPageContentParser pageContentParser,
    ParsingOptions parsingOptions)
{
    // Arguments are stored as given; no validation is performed here.
    PdfScanner = pdfScanner;
    ResourceStore = resourceStore;
    FilterProvider = filterProvider;
    PageContentParser = pageContentParser;
    ParsingOptions = parsingOptions;
}
/// <inheritdoc/>
/// <exception cref="ArgumentNullException">
/// <paramref name="dictionary"/> or <paramref name="pageTreeMembers"/> is <c>null</c>.
/// </exception>
/// <exception cref="PdfDocumentFormatException">
/// The Contents array holds an item which is not an indirect reference.
/// </exception>
/// <exception cref="InvalidOperationException">A referenced content stream could not be found.</exception>
public TPage Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers,
    NamedDestinations namedDestinations)
{
    if (dictionary == null)
    {
        throw new ArgumentNullException(nameof(dictionary));
    }

    if (pageTreeMembers == null)
    {
        throw new ArgumentNullException(nameof(pageTreeMembers));
    }

    // A wrong /Type is only reported, the dictionary is still processed as a page.
    var type = dictionary.GetNameOrDefault(NameToken.Type);
    if (type != null && !type.Equals(NameToken.Page))
    {
        ParsingOptions.Logger.Error($"Page {number} had its type specified as {type} rather than 'Page'.");
    }

    // The page's own Rotate entry takes precedence over the value inherited from the page tree.
    var rotation = new PageRotationDegrees(pageTreeMembers.Rotation);
    if (dictionary.TryGet(NameToken.Rotate, PdfScanner, out NumericToken rotateToken))
    {
        rotation = new PageRotationDegrees(rotateToken.Int);
    }

    // Number of resource dictionaries successfully loaded for this page.
    var stackDepth = 0;

    try
    {
        // Inherited resources first, then the page's own resources.
        while (pageTreeMembers.ParentResources.Count > 0)
        {
            var resource = pageTreeMembers.ParentResources.Dequeue();

            ResourceStore.LoadResourceDictionary(resource);
            stackDepth++;
        }

        if (dictionary.TryGet(NameToken.Resources, PdfScanner, out DictionaryToken resources))
        {
            ResourceStore.LoadResourceDictionary(resources);
            stackDepth++;
        }

        UserSpaceUnit userSpaceUnit = GetUserSpaceUnits(dictionary);
        MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers);
        CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox);

        var initialMatrix = OperationContextHelper.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, ParsingOptions.Logger);

        ApplyTransformNormalise(initialMatrix, ref mediaBox, ref cropBox);

        var contentBytes = GetContentBytes(number, dictionary);

        return ProcessPageInternal(number, dictionary, namedDestinations, mediaBox, cropBox, userSpaceUnit, rotation, initialMatrix, contentBytes);
    }
    finally
    {
        // Always undo the loads done above, even when decoding or processing throws,
        // so a failing page does not leave its resources on the shared store.
        for (var i = 0; i < stackDepth; i++)
        {
            ResourceStore.UnloadResourceDictionary();
        }
    }
}

/// <summary>
/// Gather the decoded content bytes of the page, or <c>null</c> if the page has no Contents entry.
/// When Contents is an array, the decoded streams are joined with a single '\n' between them.
/// </summary>
private IReadOnlyList<byte> GetContentBytes(int number, DictionaryToken dictionary)
{
    if (!dictionary.TryGet(NameToken.Contents, out var contents))
    {
        // No Contents entry: the page is processed with no operations.
        return null;
    }

    if (!DirectObjectFinder.TryGet<ArrayToken>(contents, PdfScanner, out var array))
    {
        // Not an array: Contents must resolve to a single stream.
        var singleStream = DirectObjectFinder.Get<StreamToken>(contents, PdfScanner);

        if (singleStream == null)
        {
            throw new InvalidOperationException("Failed to parse the content for the page: " + number);
        }

        return singleStream.Decode(FilterProvider, PdfScanner);
    }

    var bytes = new List<byte>();

    for (var i = 0; i < array.Data.Count; i++)
    {
        var item = array.Data[i];

        if (!(item is IndirectReferenceToken obj))
        {
            throw new PdfDocumentFormatException($"The contents contained something which was not an indirect reference: {item}.");
        }

        var contentStream = DirectObjectFinder.Get<StreamToken>(obj, PdfScanner);

        if (contentStream == null)
        {
            throw new InvalidOperationException($"Could not find the contents for object {obj}.");
        }

        bytes.AddRange(contentStream.Decode(FilterProvider, PdfScanner));

        // Separate consecutive streams so tokens at stream boundaries do not run together.
        if (i < array.Data.Count - 1)
        {
            bytes.Add((byte)'\n');
        }
    }

    return bytes;
}
// Parses the raw content bytes (if any) into graphics state operations and hands
// everything over to the ProcessPage implementation of the concrete factory.
private TPage ProcessPageInternal(
    int pageNumber,
    DictionaryToken dictionary,
    NamedDestinations namedDestinations,
    MediaBox mediaBox,
    CropBox cropBox,
    UserSpaceUnit userSpaceUnit,
    PageRotationDegrees rotation,
    TransformationMatrix initialMatrix,
    IReadOnlyList<byte> contentBytes)
{
    var hasContent = contentBytes != null && contentBytes.Count > 0;

    IReadOnlyList<IGraphicsStateOperation> operations;
    if (hasContent)
    {
        var input = new ByteArrayInputBytes(contentBytes);
        operations = PageContentParser.Parse(pageNumber, input, ParsingOptions.Logger);
    }
    else
    {
        // Missing or empty content: the parser is not invoked at all.
        operations = EmptyArray<IGraphicsStateOperation>.Instance;
    }

    return ProcessPage(
        pageNumber, dictionary, namedDestinations,
        mediaBox, cropBox, userSpaceUnit,
        rotation, initialMatrix, operations);
}
/// <summary>
/// Process a page with content and build the <typeparamref name="TPage"/> returned by <c>Create</c>.
/// Called once per created page, after the content stream(s) have been decoded and parsed and
/// before the resource dictionaries loaded for the page are unloaded from the resource store.
/// </summary>
/// <param name="pageNumber">The page number, starts at 1.</param>
/// <param name="dictionary">The page dictionary that was passed to <c>Create</c>.</param>
/// <param name="namedDestinations">The named destinations that were passed to <c>Create</c>, forwarded unchanged.</param>
/// <param name="mediaBox">The page media box, after the initial matrix has been applied to it and it has been normalised.</param>
/// <param name="cropBox">The page crop box, after the initial matrix has been applied to it and it has been normalised.
/// NOTE(review): previously documented as the intersection of the initial crop box and the media box; no intersection
/// is computed in this class, confirm where (or whether) it happens.</param>
/// <param name="userSpaceUnit">The user space unit read from the page dictionary's UserUnit entry, or the default.</param>
/// <param name="rotation">The page rotation: the page dictionary's Rotate entry if present, otherwise the value inherited from the page tree.</param>
/// <param name="initialMatrix">The initial transformation matrix computed from the user space unit, media box, crop box and rotation.</param>
/// <param name="operations">The page operations. Can be empty if the page has no content.</param>
/// <returns>The created page.</returns>
protected abstract TPage ProcessPage(
int pageNumber,
DictionaryToken dictionary,
NamedDestinations namedDestinations,
MediaBox mediaBox,
CropBox cropBox,
UserSpaceUnit userSpaceUnit,
PageRotationDegrees rotation,
TransformationMatrix initialMatrix,
IReadOnlyList<IGraphicsStateOperation> operations);
/// <summary>
/// Read the user space unit from the UserUnit entry of the page dictionary.
/// </summary>
/// <param name="dictionary">The page dictionary.</param>
/// <returns>
/// A <see cref="UserSpaceUnit"/> built from the integer value of the UserUnit entry when that entry
/// is directly a numeric token, otherwise <see cref="UserSpaceUnit.Default"/>.
/// </returns>
protected static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary)
{
    if (!dictionary.TryGet(NameToken.UserUnit, out var userUnitToken))
    {
        return UserSpaceUnit.Default;
    }

    // Anything other than a direct numeric token falls back to the default.
    return userUnitToken is NumericToken numeric
        ? new UserSpaceUnit(numeric.Int)
        : UserSpaceUnit.Default;
}
/// <summary>
/// Resolve the crop box of a page.
/// </summary>
/// <param name="dictionary">The page dictionary.</param>
/// <param name="pageTreeMembers">The values inherited from the page tree.</param>
/// <param name="mediaBox">The page media box, used as the fallback.</param>
/// <returns>
/// The crop box from the page dictionary when it is a 4 element array; the media box bounds when the
/// array has the wrong length (an error is logged); otherwise the crop box inherited from the page
/// tree or, failing that, the media box bounds.
/// </returns>
protected CropBox GetCropBox(DictionaryToken dictionary, PageTreeMembers pageTreeMembers, MediaBox mediaBox)
{
    if (!dictionary.TryGet(NameToken.CropBox, out var cropBoxToken)
        || !DirectObjectFinder.TryGet(cropBoxToken, PdfScanner, out ArrayToken cropBoxArray))
    {
        // No usable CropBox on the page itself: inherit, then fall back to the media box.
        return pageTreeMembers.GetCropBox() ?? new CropBox(mediaBox.Bounds);
    }

    if (cropBoxArray.Length == 4)
    {
        return new CropBox(cropBoxArray.ToRectangle(PdfScanner));
    }

    ParsingOptions.Logger.Error(
        $"The CropBox was the wrong length in the dictionary: {dictionary}. Array was: {cropBoxArray}. Using MediaBox.");

    return new CropBox(mediaBox.Bounds);
}
/// <summary>
/// Resolve the media box of a page.
/// </summary>
/// <param name="number">The page number, only used in log messages.</param>
/// <param name="dictionary">The page dictionary.</param>
/// <param name="pageTreeMembers">The values inherited from the page tree.</param>
/// <returns>
/// The media box from the page dictionary when it is a 4 element array; otherwise the media box
/// inherited from the page tree. <see cref="MediaBox.Letter"/> is used (and an error is logged)
/// when the array has the wrong length or when no media box is available at all.
/// </returns>
protected MediaBox GetMediaBox(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers)
{
    MediaBox mediaBox;

    if (dictionary.TryGet(NameToken.MediaBox, out var mediaBoxObject)
        && DirectObjectFinder.TryGet(mediaBoxObject, PdfScanner, out ArrayToken mediaBoxArray))
    {
        if (mediaBoxArray.Length != 4)
        {
            ParsingOptions.Logger.Error(
                $"The MediaBox was the wrong length in the dictionary: {dictionary}. Array was: {mediaBoxArray}. Defaulting to US Letter.");

            mediaBox = MediaBox.Letter;

            return mediaBox;
        }

        mediaBox = new MediaBox(mediaBoxArray.ToRectangle(PdfScanner));
    }
    else
    {
        // Not defined on the page itself: use the value inherited from the page tree.
        mediaBox = pageTreeMembers.MediaBox;

        if (mediaBox == null)
        {
            ParsingOptions.Logger.Error(
                $"The MediaBox was missing for page {number}. Using US Letter.");

            // PDFBox defaults to US Letter.
            mediaBox = MediaBox.Letter;
        }
    }

    return mediaBox;
}
/// <summary>
/// Transform the media box and the crop box with the given matrix, then Normalise() the
/// results in order to obtain rectangles with rotation=0 and width and height as viewed
/// on screen. Both boxes are left untouched when the matrix is the identity.
/// </summary>
/// <param name="transformationMatrix">The matrix to apply to both boxes.</param>
/// <param name="mediaBox">The media box, replaced by its transformed and normalised version.</param>
/// <param name="cropBox">The crop box, replaced by its transformed and normalised version.</param>
protected static void ApplyTransformNormalise(TransformationMatrix transformationMatrix, ref MediaBox mediaBox, ref CropBox cropBox)
{
    if (transformationMatrix == TransformationMatrix.Identity)
    {
        // Nothing to do, keep the original instances.
        return;
    }

    var mediaBounds = transformationMatrix.Transform(mediaBox.Bounds).Normalise();
    mediaBox = new MediaBox(mediaBounds);

    var cropBounds = transformationMatrix.Transform(cropBox.Bounds).Normalise();
    cropBox = new CropBox(cropBounds);
}
}
}

View File

@@ -6,14 +6,15 @@
/// <summary>
/// Page factory interface.
/// </summary>
public interface IPageFactory
/// <typeparam name="TPage">The type of page the page factory creates.</typeparam>
public interface IPageFactory<out TPage>
{
/// <summary>
/// Create the page.
/// </summary>
Page Create(int number,
TPage Create(int number,
DictionaryToken dictionary,
PageTreeMembers pageTreeMembers,
NamedDestinations annotationProvider);
NamedDestinations namedDestinations);
}
}

View File

@@ -1,18 +1,19 @@
namespace UglyToad.PdfPig.Content
{
using Core;
using Outline.Destinations;
using System;
using System.Collections.Generic;
using Core;
using Outline.Destinations;
using Tokenization.Scanner;
using Tokens;
using Util;
internal class Pages
{
private readonly IPageFactory pageFactory;
private readonly IPageFactory<Page> pageFactory;
private readonly IPdfTokenScanner pdfScanner;
private readonly Dictionary<int, PageTreeNode> pagesByNumber;
public int Count => pagesByNumber.Count;
/// <summary>
@@ -20,7 +21,7 @@
/// </summary>
public PageTreeNode PageTree { get; }
internal Pages(IPageFactory pageFactory, IPdfTokenScanner pdfScanner, PageTreeNode pageTree, Dictionary<int, PageTreeNode> pagesByNumber)
internal Pages(IPageFactory<Page> pageFactory, IPdfTokenScanner pdfScanner, PageTreeNode pageTree, Dictionary<int, PageTreeNode> pagesByNumber)
{
this.pageFactory = pageFactory ?? throw new ArgumentNullException(nameof(pageFactory));
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
@@ -34,7 +35,7 @@
{
parsingOptions.Logger.Error($"Page {pageNumber} requested but is out of range.");
throw new ArgumentOutOfRangeException(nameof(pageNumber),
throw new ArgumentOutOfRangeException(nameof(pageNumber),
$"Page number {pageNumber} invalid, must be between 1 and {Count}.");
}
@@ -49,7 +50,7 @@
}
var pageTreeMembers = new PageTreeMembers();
while (pageStack.Count > 0)
{
currentNode = pageStack.Pop();
@@ -58,7 +59,7 @@
{
pageTreeMembers.ParentResources.Enqueue(resourcesDictionary);
}
if (currentNode.NodeDictionary.TryGet(NameToken.MediaBox, pdfScanner, out ArrayToken mediaBox))
{
pageTreeMembers.MediaBox = new MediaBox(mediaBox.ToRectangle(pdfScanner));

View File

@@ -10,9 +10,9 @@
using Tokens;
using Util;
internal class PagesFactory
internal static class PagesFactory
{
private class PageCounter
private sealed class PageCounter
{
public int PageCount { get; private set; }
public void Increment()
@@ -21,7 +21,7 @@
}
}
public static Pages Create(IndirectReference pagesReference, DictionaryToken pagesDictionary, IPdfTokenScanner scanner, IPageFactory pageFactory, ILog log, bool isLenientParsing)
public static Pages Create(IndirectReference pagesReference, DictionaryToken pagesDictionary, IPdfTokenScanner scanner, IPageFactory<Page> pageFactory, ILog log, bool isLenientParsing)
{
var pageNumber = new PageCounter();

View File

@@ -1,6 +1,5 @@
namespace UglyToad.PdfPig.Parser
{
using System;
using System.Collections.Generic;
using Annotations;
using Content;
@@ -9,267 +8,81 @@
using Geometry;
using Graphics;
using Graphics.Operations;
using Logging;
using Outline.Destinations;
using Parts;
using Tokenization.Scanner;
using Tokens;
using Util;
internal class PageFactory : IPageFactory
internal class PageFactory : BasePageFactory<Page>
{
private readonly ParsingOptions parsingOptions;
private readonly IPdfTokenScanner pdfScanner;
private readonly IResourceStore resourceStore;
private readonly ILookupFilterProvider filterProvider;
private readonly IPageContentParser pageContentParser;
public PageFactory(
IPdfTokenScanner pdfScanner,
IResourceStore resourceStore,
ILookupFilterProvider filterProvider,
IPageContentParser pageContentParser,
ParsingOptions parsingOptions)
: base(pdfScanner, resourceStore, filterProvider, pageContentParser, parsingOptions)
{
this.resourceStore = resourceStore;
this.filterProvider = filterProvider;
this.pageContentParser = pageContentParser;
this.pdfScanner = pdfScanner;
this.parsingOptions = parsingOptions;
}
public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers,
NamedDestinations namedDestinations)
{
if (dictionary == null)
{
throw new ArgumentNullException(nameof(dictionary));
}
var type = dictionary.GetNameOrDefault(NameToken.Type);
if (type != null && !type.Equals(NameToken.Page))
{
parsingOptions.Logger.Error($"Page {number} had its type specified as {type} rather than 'Page'.");
}
var rotation = new PageRotationDegrees(pageTreeMembers.Rotation);
if (dictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken))
{
rotation = new PageRotationDegrees(rotateToken.Int);
}
var stackDepth = 0;
while (pageTreeMembers.ParentResources.Count > 0)
{
var resource = pageTreeMembers.ParentResources.Dequeue();
resourceStore.LoadResourceDictionary(resource);
stackDepth++;
}
if (dictionary.TryGet(NameToken.Resources, pdfScanner, out DictionaryToken resources))
{
resourceStore.LoadResourceDictionary(resources);
stackDepth++;
}
UserSpaceUnit userSpaceUnit = GetUserSpaceUnits(dictionary);
MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers);
CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox);
var initialMatrix = OperationContextHelper.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, parsingOptions.Logger);
ApplyTransformNormalise(initialMatrix, ref mediaBox, ref cropBox);
PageContent content;
if (!dictionary.TryGet(NameToken.Contents, out var contents))
{
content = new PageContent(EmptyArray<IGraphicsStateOperation>.Instance,
EmptyArray<Letter>.Instance,
EmptyArray<PdfPath>.Instance,
EmptyArray<Union<XObjectContentRecord, InlineImage>>.Instance,
EmptyArray<MarkedContentElement>.Instance,
pdfScanner,
filterProvider,
resourceStore);
// ignored for now, is it possible? check the spec...
}
else if (DirectObjectFinder.TryGet<ArrayToken>(contents, pdfScanner, out var array))
{
var bytes = new List<byte>();
for (var i = 0; i < array.Data.Count; i++)
{
var item = array.Data[i];
if (!(item is IndirectReferenceToken obj))
{
throw new PdfDocumentFormatException($"The contents contained something which was not an indirect reference: {item}.");
}
var contentStream = DirectObjectFinder.Get<StreamToken>(obj, pdfScanner);
if (contentStream == null)
{
throw new InvalidOperationException($"Could not find the contents for object {obj}.");
}
bytes.AddRange(contentStream.Decode(filterProvider, pdfScanner));
if (i < array.Data.Count - 1)
{
bytes.Add((byte)'\n');
}
}
content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, initialMatrix, parsingOptions);
}
else
{
var contentStream = DirectObjectFinder.Get<StreamToken>(contents, pdfScanner);
if (contentStream == null)
{
throw new InvalidOperationException("Failed to parse the content for the page: " + number);
}
var bytes = contentStream.Decode(filterProvider, pdfScanner);
content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, initialMatrix, parsingOptions);
}
var annotationProvider = new AnnotationProvider(pdfScanner, dictionary, initialMatrix, namedDestinations, parsingOptions.Logger);
var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, annotationProvider, pdfScanner);
for (var i = 0; i < stackDepth; i++)
{
resourceStore.UnloadResourceDictionary();
}
return page;
}
private PageContent GetContent(
int pageNumber,
IReadOnlyList<byte> contentBytes,
protected override Page ProcessPage(int pageNumber,
DictionaryToken dictionary,
NamedDestinations namedDestinations,
MediaBox mediaBox,
CropBox cropBox,
UserSpaceUnit userSpaceUnit,
PageRotationDegrees rotation,
TransformationMatrix initialMatrix,
ParsingOptions parsingOptions)
IReadOnlyList<IGraphicsStateOperation> operations)
{
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes),
parsingOptions.Logger);
var annotationProvider = new AnnotationProvider(PdfScanner,
dictionary,
initialMatrix,
namedDestinations,
ParsingOptions.Logger);
if (operations == null || operations.Count == 0)
{
PageContent emptyContent = new PageContent(EmptyArray<IGraphicsStateOperation>.Instance,
EmptyArray<Letter>.Instance,
EmptyArray<PdfPath>.Instance,
EmptyArray<Union<XObjectContentRecord, InlineImage>>.Instance,
EmptyArray<MarkedContentElement>.Instance,
PdfScanner,
FilterProvider,
ResourceStore);
return new Page(pageNumber,
dictionary,
mediaBox,
cropBox,
rotation,
emptyContent,
annotationProvider,
PdfScanner);
}
var context = new ContentStreamProcessor(
pageNumber,
resourceStore,
pdfScanner,
pageContentParser,
filterProvider,
ResourceStore,
PdfScanner,
PageContentParser,
FilterProvider,
cropBox,
userSpaceUnit,
rotation,
initialMatrix,
parsingOptions);
ParsingOptions);
return context.Process(pageNumber, operations);
}
PageContent content = context.Process(pageNumber, operations);
private static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary)
{
var spaceUnits = UserSpaceUnit.Default;
if (dictionary.TryGet(NameToken.UserUnit, out var userUnitBase) && userUnitBase is NumericToken userUnitNumber)
{
spaceUnits = new UserSpaceUnit(userUnitNumber.Int);
}
return spaceUnits;
}
private CropBox GetCropBox(
DictionaryToken dictionary,
PageTreeMembers pageTreeMembers,
MediaBox mediaBox)
{
CropBox cropBox;
if (dictionary.TryGet(NameToken.CropBox, out var cropBoxObject) &&
DirectObjectFinder.TryGet(cropBoxObject, pdfScanner, out ArrayToken cropBoxArray))
{
if (cropBoxArray.Length != 4)
{
parsingOptions.Logger.Error($"The CropBox was the wrong length in the dictionary: {dictionary}. Array was: {cropBoxArray}. Using MediaBox.");
cropBox = new CropBox(mediaBox.Bounds);
return cropBox;
}
cropBox = new CropBox(cropBoxArray.ToRectangle(pdfScanner));
}
else
{
cropBox = pageTreeMembers.GetCropBox() ?? new CropBox(mediaBox.Bounds);
}
return cropBox;
}
private MediaBox GetMediaBox(
int number,
DictionaryToken dictionary,
PageTreeMembers pageTreeMembers)
{
MediaBox mediaBox;
if (dictionary.TryGet(NameToken.MediaBox, out var mediaBoxObject)
&& DirectObjectFinder.TryGet(mediaBoxObject, pdfScanner, out ArrayToken mediaBoxArray))
{
if (mediaBoxArray.Length != 4)
{
parsingOptions.Logger.Error($"The MediaBox was the wrong length in the dictionary: {dictionary}. Array was: {mediaBoxArray}. Defaulting to US Letter.");
mediaBox = MediaBox.Letter;
return mediaBox;
}
mediaBox = new MediaBox(mediaBoxArray.ToRectangle(pdfScanner));
}
else
{
mediaBox = pageTreeMembers.MediaBox;
if (mediaBox == null)
{
parsingOptions.Logger.Error($"The MediaBox was the wrong missing for page {number}. Using US Letter.");
// PDFBox defaults to US Letter.
mediaBox = MediaBox.Letter;
}
}
return mediaBox;
}
/// <summary>
/// Apply the matrix transform to the media box and crop box.
/// Then Normalise() in order to obtain rectangles with rotation=0
/// and width and height as viewed on screen.
/// </summary>
/// <param name="transformationMatrix"></param>
/// <param name="mediaBox"></param>
/// <param name="cropBox"></param>
private static void ApplyTransformNormalise(TransformationMatrix transformationMatrix, ref MediaBox mediaBox, ref CropBox cropBox)
{
if (transformationMatrix != TransformationMatrix.Identity)
{
mediaBox = new MediaBox(transformationMatrix.Transform(mediaBox.Bounds).Normalise());
cropBox = new CropBox(transformationMatrix.Transform(cropBox.Bounds).Normalise());
}
return new Page(pageNumber,
dictionary,
mediaBox,
cropBox,
rotation,
content,
annotationProvider,
PdfScanner);
}
}
}