mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-06-28 15:30:17 +08:00
324 lines
12 KiB
C#
324 lines
12 KiB
C#
namespace UglyToad.PdfPig
|
|
{
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
using AcroForms;
|
|
using Content;
|
|
using Core;
|
|
using CrossReference;
|
|
using Encryption;
|
|
using Exceptions;
|
|
using Filters;
|
|
using Parser;
|
|
using Tokenization.Scanner;
|
|
using Tokens;
|
|
using Outline;
|
|
using Outline.Destinations;
|
|
using Util.JetBrains.Annotations;
|
|
|
|
/// <summary>
/// Provides access to document level information for this PDF document as well as access to the <see cref="T:UglyToad.PdfPig.Content.Page"/>s contained in the document.
/// </summary>
/// <remarks>
/// Instances are created through the static <c>Open</c> overloads. Once <see cref="Dispose"/> has been called
/// the page, metadata, bookmark and form accessors throw <see cref="ObjectDisposedException"/>.
/// </remarks>
public class PdfDocument : IDisposable
{
    // Set by Dispose; checked by every accessor that needs the underlying resources.
    private bool isDisposed;

    // The form is resolved on first access through TryGetForm rather than at construction time.
    private readonly Lazy<AcroForm> documentForm;

    [NotNull]
    private readonly HeaderVersion version;

    private readonly IInputBytes inputBytes;

    // Null when the document is not encrypted, see IsEncrypted.
    [CanBeNull]
    private readonly EncryptionDictionary encryptionDictionary;

    [NotNull]
    private readonly IPdfTokenScanner pdfScanner;

    private readonly ILookupFilterProvider filterProvider;
    private readonly BookmarksProvider bookmarksProvider;
    private readonly ParsingOptions parsingOptions;

    [NotNull]
    private readonly Pages pages;
    private readonly NamedDestinations namedDestinations;

    /// <summary>
    /// The metadata associated with this document.
    /// </summary>
    [NotNull]
    public DocumentInformation Information { get; }

    /// <summary>
    /// Access to the underlying raw structure of the document.
    /// </summary>
    [NotNull]
    public Structure Structure { get; }

    /// <summary>
    /// Access to rare or advanced features of the PDF specification.
    /// </summary>
    public AdvancedPdfDocumentAccess Advanced { get; }

    /// <summary>
    /// The version number of the PDF specification which this file conforms to, for example 1.4.
    /// </summary>
    public decimal Version => version.Version;

    /// <summary>
    /// Get the number of pages in this document.
    /// </summary>
    public int NumberOfPages => pages.Count;

    /// <summary>
    /// Whether the document content is encrypted.
    /// </summary>
    public bool IsEncrypted => encryptionDictionary != null;

    /// <summary>
    /// Creates a document from its already parsed parts.
    /// </summary>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="version"/>, <paramref name="pdfScanner"/>, <paramref name="filterProvider"/>,
    /// <paramref name="bookmarksProvider"/>, <paramref name="information"/> or <paramref name="catalog"/> is <see langword="null"/>.
    /// </exception>
    internal PdfDocument(IInputBytes inputBytes,
        HeaderVersion version,
        CrossReferenceTable crossReferenceTable,
        Catalog catalog,
        DocumentInformation information,
        EncryptionDictionary encryptionDictionary,
        IPdfTokenScanner pdfScanner,
        ILookupFilterProvider filterProvider,
        AcroFormFactory acroFormFactory,
        BookmarksProvider bookmarksProvider,
        ParsingOptions parsingOptions)
    {
        this.inputBytes = inputBytes;
        this.version = version ?? throw new ArgumentNullException(nameof(version));
        this.encryptionDictionary = encryptionDictionary;
        this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
        this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
        this.bookmarksProvider = bookmarksProvider ?? throw new ArgumentNullException(nameof(bookmarksProvider));
        this.parsingOptions = parsingOptions;

        Information = information ?? throw new ArgumentNullException(nameof(information));

        // The catalog is dereferenced immediately below, so report a missing one as an argument
        // error instead of letting it surface as a NullReferenceException.
        if (catalog == null)
        {
            throw new ArgumentNullException(nameof(catalog));
        }

        pages = catalog.Pages;
        namedDestinations = catalog.NamedDestinations;
        Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
        Advanced = new AdvancedPdfDocumentAccess(pdfScanner, filterProvider, catalog);
        documentForm = new Lazy<AcroForm>(() => acroFormFactory.GetAcroForm(catalog));
    }

    /// <summary>
    /// Creates a <see cref="PdfDocument"/> for reading from the provided file bytes.
    /// </summary>
    /// <param name="fileBytes">The bytes of the PDF file.</param>
    /// <param name="options">Optional parameters controlling parsing.</param>
    /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
    public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options);

    /// <summary>
    /// Opens a file and creates a <see cref="PdfDocument"/> for reading from the provided file path.
    /// </summary>
    /// <param name="filePath">The full path to the file location of the PDF file.</param>
    /// <param name="options">Optional parameters controlling parsing.</param>
    /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
    public static PdfDocument Open(string filePath, ParsingOptions options = null) => PdfDocumentFactory.Open(filePath, options);

    /// <summary>
    /// Creates a <see cref="PdfDocument"/> for reading from the provided stream.
    /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
    /// </summary>
    /// <param name="stream">
    /// A stream of the file contents, this must support reading and seeking.
    /// The PdfDocument will not dispose of the provided stream.
    /// </param>
    /// <param name="options">Optional parameters controlling parsing.</param>
    /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
    public static PdfDocument Open(Stream stream, ParsingOptions options = null) => PdfDocumentFactory.Open(stream, options);

    /// <summary>
    /// Registers the provided page factory instance for pages of type <typeparamref name="TPage"/>
    /// with this document's page collection.
    /// </summary>
    /// <remarks>
    /// Presumably a factory must be registered before <see cref="GetPage{TPage}(int)"/> is used for the
    /// same page type; confirm against the page collection implementation.
    /// </remarks>
    /// <typeparam name="TPage">The type of page the factory produces.</typeparam>
    /// <param name="pageFactory">The factory instance to register.</param>
    public void AddPageFactory<TPage>(IPageFactory<TPage> pageFactory)
    {
        pages.AddPageFactory(pageFactory);
    }

    /// <summary>
    /// Registers a page factory of type <typeparamref name="TPageFactory"/> for pages of type
    /// <typeparamref name="TPage"/> with this document's page collection. The factory instance is
    /// created by the page collection rather than supplied by the caller.
    /// </summary>
    /// <typeparam name="TPage">The type of page the factory produces.</typeparam>
    /// <typeparam name="TPageFactory">The type of the factory to register.</typeparam>
    public void AddPageFactory<TPage, TPageFactory>() where TPageFactory : IPageFactory<TPage>
    {
        pages.AddPageFactory<TPage, TPageFactory>();
    }

    /// <summary>
    /// Get the page with the specified page number (1 indexed).
    /// </summary>
    /// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
    /// <returns>The page.</returns>
    /// <exception cref="ObjectDisposedException">The document has been disposed.</exception>
    /// <exception cref="PdfDocumentEncryptedException">
    /// Retrieving the page failed and the document is encrypted; the original error is the inner exception.
    /// </exception>
    public Page GetPage(int pageNumber)
    {
        ThrowIfDisposed("Cannot access page after the document is disposed.");

        parsingOptions.Logger.Debug($"Accessing page {pageNumber}.");

        try
        {
            return pages.GetPage(pageNumber, namedDestinations, parsingOptions);
        }
        catch (Exception ex) when (IsEncrypted)
        {
            // Only wrap when encryption is a plausible cause; otherwise the filter lets the
            // original exception propagate untouched.
            throw new PdfDocumentEncryptedException("Document was encrypted which may have caused error when retrieving page.", encryptionDictionary, ex);
        }
    }

    /// <summary>
    /// Get the page with the specified page number (1 indexed), using the specified page factory.
    /// </summary>
    /// <typeparam name="TPage">The type of page to return.</typeparam>
    /// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
    /// <returns>The page.</returns>
    /// <exception cref="ObjectDisposedException">The document has been disposed.</exception>
    /// <exception cref="PdfDocumentEncryptedException">
    /// Retrieving the page failed and the document is encrypted; the original error is the inner exception.
    /// </exception>
    public TPage GetPage<TPage>(int pageNumber)
    {
        ThrowIfDisposed("Cannot access page after the document is disposed.");

        parsingOptions.Logger.Debug($"Accessing page {pageNumber}.");

        try
        {
            return pages.GetPage<TPage>(pageNumber, namedDestinations, parsingOptions);
        }
        catch (Exception ex) when (IsEncrypted)
        {
            // Only wrap when encryption is a plausible cause; otherwise the filter lets the
            // original exception propagate untouched.
            throw new PdfDocumentEncryptedException("Document was encrypted which may have caused error when retrieving page.", encryptionDictionary, ex);
        }
    }

    /// <summary>
    /// Gets all pages in this document in order.
    /// </summary>
    /// <remarks>
    /// The sequence is produced lazily: each page is loaded through <see cref="GetPage(int)"/> as the
    /// caller enumerates, so any exception from that method is raised during enumeration.
    /// </remarks>
    public IEnumerable<Page> GetPages()
    {
        for (var pageNumber = 1; pageNumber <= NumberOfPages; pageNumber++)
        {
            yield return GetPage(pageNumber);
        }
    }

    /// <summary>
    /// Gets all pages in this document in order, using the specified page factory.
    /// </summary>
    /// <remarks>
    /// The sequence is produced lazily: each page is loaded through <see cref="GetPage{TPage}(int)"/> as the
    /// caller enumerates, so any exception from that method is raised during enumeration.
    /// </remarks>
    /// <typeparam name="TPage">The type of page to return.</typeparam>
    public IEnumerable<TPage> GetPages<TPage>()
    {
        for (var pageNumber = 1; pageNumber <= NumberOfPages; pageNumber++)
        {
            yield return GetPage<TPage>(pageNumber);
        }
    }

    /// <summary>
    /// Get the document level metadata if present.
    /// The metadata is XML in the (Extensible Metadata Platform) XMP format.
    /// </summary>
    /// <param name="metadata">The metadata stream if it exists, otherwise <see langword="null"/>.</param>
    /// <returns><see langword="true"/> if the metadata is present, <see langword="false"/> otherwise.</returns>
    /// <exception cref="ObjectDisposedException">The document has been disposed.</exception>
    public bool TryGetXmpMetadata(out XmpMetadata metadata)
    {
        ThrowIfDisposed("Cannot access the document metadata after the document is disposed.");

        metadata = null;

        if (!Structure.Catalog.CatalogDictionary.TryGet(NameToken.Metadata, pdfScanner, out StreamToken xmpStreamToken))
        {
            return false;
        }

        metadata = new XmpMetadata(xmpStreamToken, filterProvider, pdfScanner);

        return true;
    }

    /// <summary>
    /// Gets the bookmarks if this document contains some.
    /// </summary>
    /// <param name="bookmarks">The bookmarks if present, otherwise <see langword="null"/>.</param>
    /// <returns><see langword="true"/> if bookmarks were found, <see langword="false"/> otherwise.</returns>
    /// <exception cref="ObjectDisposedException">The document has been disposed.</exception>
    public bool TryGetBookmarks(out Bookmarks bookmarks)
    {
        ThrowIfDisposed("Cannot access the bookmarks after the document is disposed.");

        bookmarks = bookmarksProvider.GetBookmarks(Structure.Catalog);

        return bookmarks != null;
    }

    /// <summary>
    /// Gets the form if this document contains one.
    /// </summary>
    /// <param name="form">The <see cref="AcroForm"/> from the document, or <see langword="null"/> if not present.</param>
    /// <returns><see langword="true"/> if the document contains a form, <see langword="false"/> otherwise.</returns>
    /// <exception cref="ObjectDisposedException">The document has been disposed.</exception>
    public bool TryGetForm(out AcroForm form)
    {
        ThrowIfDisposed("Cannot access the form after the document is disposed.");

        form = documentForm.Value;

        return form != null;
    }

    /// <summary>
    /// Dispose the <see cref="T:UglyToad.PdfPig.PdfDocument" /> and close any unmanaged resources.
    /// </summary>
    /// <remarks>
    /// Calling this more than once has no further effect. A failure while releasing one resource is
    /// logged and does not prevent the remaining resources from being released.
    /// </remarks>
    public void Dispose()
    {
        if (isDisposed)
        {
            return;
        }

        // Flag first so the document reads as disposed even if releasing a resource misbehaves.
        isDisposed = true;

        // Same release order as before; each step is isolated so one failure cannot leak the rest.
        DisposeQuietly(() => Advanced.Dispose());
        DisposeQuietly(() => pdfScanner.Dispose());
        DisposeQuietly(() => inputBytes.Dispose());
        DisposeQuietly(() => pages.Dispose());
    }

    /// <summary>
    /// Runs a single release step, logging any failure instead of letting it escape from <see cref="Dispose"/>.
    /// </summary>
    /// <param name="release">The release step to run.</param>
    private void DisposeQuietly(Action release)
    {
        try
        {
            release();
        }
        catch (Exception ex)
        {
            // The constructor does not validate parsingOptions, so stay null-safe here.
            parsingOptions?.Logger?.Error("Failed disposing the PdfDocument due to an error.", ex);
        }
    }

    /// <summary>
    /// Throws if this document has already been disposed.
    /// </summary>
    /// <param name="message">The explanation to report to the caller.</param>
    /// <exception cref="ObjectDisposedException">The document has been disposed.</exception>
    private void ThrowIfDisposed(string message)
    {
        if (isDisposed)
        {
            // The single-string constructor takes the object name, not the message, so use the
            // two-argument overload to put the text where callers will actually see it.
            throw new ObjectDisposedException(nameof(PdfDocument), message);
        }
    }
}
|
|
}
|