2018-01-11 03:49:32 +08:00
namespace UglyToad.PdfPig
2017-11-10 03:14:09 +08:00
{
using System ;
2019-08-24 02:09:33 +08:00
using System.Collections.Generic ;
2018-01-22 03:34:21 +08:00
using System.IO ;
2019-01-02 01:44:46 +08:00
using AcroForms ;
2017-11-10 03:14:09 +08:00
using Content ;
2020-01-05 00:38:18 +08:00
using Core ;
2018-11-25 03:02:06 +08:00
using CrossReference ;
2019-05-04 22:36:13 +08:00
using Encryption ;
2019-05-10 02:02:39 +08:00
using Exceptions ;
2019-08-11 19:41:51 +08:00
using Filters ;
2017-11-10 03:14:09 +08:00
using Parser ;
2018-01-14 23:33:22 +08:00
using Tokenization.Scanner ;
2019-08-11 19:41:51 +08:00
using Tokens ;
2019-12-05 20:03:30 +08:00
using Outline ;
2023-10-23 00:34:47 +08:00
using Outline.Destinations ;
2017-11-10 03:14:09 +08:00
using Util.JetBrains.Annotations ;
2018-01-07 20:37:48 +08:00
/// <inheritdoc />
/// <summary>
2018-01-11 06:15:29 +08:00
/// Provides access to document level information for this PDF document as well as access to the <see cref="T:UglyToad.PdfPig.Content.Page"/>s contained in the document.
2018-01-07 20:37:48 +08:00
/// </summary>
2017-11-10 03:14:09 +08:00
public class PdfDocument : IDisposable
{
2018-03-31 05:02:05 +08:00
// Set to true at the end of Dispose; checked by the public accessors before touching the document.
private bool isDisposed;

// Created in the constructor; its value is produced on first access by the AcroFormFactory.
private readonly Lazy<AcroForm> documentForm;

// Backs the public Version property; guaranteed non-null by the constructor.
[NotNull]
private readonly HeaderVersion version;

// The underlying input; released when the document is disposed.
private readonly IInputBytes inputBytes;

// Null when the document is not encrypted (see IsEncrypted).
[CanBeNull]
private readonly EncryptionDictionary encryptionDictionary;

// Guaranteed non-null by the constructor; released when the document is disposed.
[NotNull]
private readonly IPdfTokenScanner pdfScanner;

// Guaranteed non-null by the constructor; handed to Advanced access and XMP metadata construction.
private readonly ILookupFilterProvider filterProvider;

// Guaranteed non-null by the constructor; used by TryGetBookmarks.
private readonly BookmarksProvider bookmarksProvider;

// Supplies the logger and is passed along when pages are retrieved.
// NOTE(review): not null-checked by the constructor - confirm callers always provide it.
private readonly ParsingOptions parsingOptions;

// Taken from the catalog in the constructor; released when the document is disposed.
[NotNull]
private readonly Pages pages;

// Taken from the catalog in the constructor and passed along when pages are retrieved.
private readonly NamedDestinations namedDestinations;
2017-12-28 21:14:03 +08:00
2018-01-11 04:31:38 +08:00
/// <summary>
/// The metadata associated with this document.
/// </summary>
[NotNull]
public DocumentInformation Information { get; }

/// <summary>
/// Access to the underlying raw structure of the document.
/// </summary>
[NotNull]
public Structure Structure { get; }

/// <summary>
/// Access to rare or advanced features of the PDF specification.
/// </summary>
public AdvancedPdfDocumentAccess Advanced { get; }

/// <summary>
/// The version number of the PDF specification which this file conforms to, for example 1.4.
/// </summary>
public decimal Version => version.Version;

/// <summary>
/// Get the number of pages in this document.
/// </summary>
public int NumberOfPages => pages.Count;

/// <summary>
/// Whether the document content is encrypted.
/// This is <see langword="true"/> when an encryption dictionary was supplied for the document.
/// </summary>
public bool IsEncrypted => encryptionDictionary != null;
2023-04-11 00:14:14 +08:00
/// <summary>
/// Creates a <see cref="PdfDocument"/> from the already parsed parts of a PDF file.
/// </summary>
/// <param name="inputBytes">The underlying input, disposed together with the document.</param>
/// <param name="version">The header version of the file. Must not be <see langword="null"/>.</param>
/// <param name="crossReferenceTable">The cross reference table, exposed through <see cref="Structure"/>.</param>
/// <param name="catalog">The document catalog providing the pages and named destinations. Must not be <see langword="null"/>.</param>
/// <param name="information">The document information. Must not be <see langword="null"/>.</param>
/// <param name="encryptionDictionary">The encryption dictionary, or <see langword="null"/> if the document is not encrypted.</param>
/// <param name="pdfScanner">The token scanner for the document. Must not be <see langword="null"/>.</param>
/// <param name="filterProvider">The filter provider. Must not be <see langword="null"/>.</param>
/// <param name="acroFormFactory">The factory used to create the form the first time it is requested.</param>
/// <param name="bookmarksProvider">The bookmarks provider. Must not be <see langword="null"/>.</param>
/// <param name="parsingOptions">The parsing options, which also supply the logger.</param>
/// <exception cref="ArgumentNullException">A required argument was <see langword="null"/>.</exception>
internal PdfDocument(IInputBytes inputBytes,
    HeaderVersion version,
    CrossReferenceTable crossReferenceTable,
    Catalog catalog,
    DocumentInformation information,
    EncryptionDictionary encryptionDictionary,
    IPdfTokenScanner pdfScanner,
    ILookupFilterProvider filterProvider,
    AcroFormFactory acroFormFactory,
    BookmarksProvider bookmarksProvider,
    ParsingOptions parsingOptions)
{
    // NOTE(review): inputBytes, acroFormFactory and parsingOptions are stored/captured without
    // validation; they are dereferenced later (Dispose, TryGetForm, GetPage) - confirm callers
    // always supply them.
    this.inputBytes = inputBytes;
    this.version = version ?? throw new ArgumentNullException(nameof(version));
    this.encryptionDictionary = encryptionDictionary;
    this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
    this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
    this.bookmarksProvider = bookmarksProvider ?? throw new ArgumentNullException(nameof(bookmarksProvider));
    this.parsingOptions = parsingOptions;
    Information = information ?? throw new ArgumentNullException(nameof(information));

    // The catalog is dereferenced immediately below; fail with a descriptive argument error
    // rather than a NullReferenceException, matching the other required arguments.
    if (catalog is null)
    {
        throw new ArgumentNullException(nameof(catalog));
    }

    pages = catalog.Pages;
    namedDestinations = catalog.NamedDestinations;
    Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
    Advanced = new AdvancedPdfDocumentAccess(pdfScanner, filterProvider, catalog);

    // The form is only built when first requested through TryGetForm.
    documentForm = new Lazy<AcroForm>(() => acroFormFactory.GetAcroForm(catalog));
}
2018-01-09 06:43:48 +08:00
/// <summary>
/// Creates a <see cref="PdfDocument"/> for reading from the provided file bytes.
/// </summary>
/// <param name="fileBytes">The bytes of the PDF file.</param>
/// <param name="options">Optional parameters controlling parsing.</param>
/// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null)
{
    // All opening logic lives in the factory; this is only the public entry point.
    return PdfDocumentFactory.Open(fileBytes, options);
}
2023-11-02 16:12:46 +08:00
2018-01-09 06:43:48 +08:00
/// <summary>
/// Opens a file and creates a <see cref="PdfDocument"/> for reading from the provided file path.
/// </summary>
/// <param name="filePath">The full path to the file location of the PDF file.</param>
/// <param name="options">Optional parameters controlling parsing.</param>
/// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
public static PdfDocument Open(string filePath, ParsingOptions options = null)
{
    // All opening logic lives in the factory; this is only the public entry point.
    return PdfDocumentFactory.Open(filePath, options);
}
2018-03-31 05:02:05 +08:00
2018-01-22 03:34:21 +08:00
/// <summary>
/// Creates a <see cref="PdfDocument"/> for reading from the provided stream.
/// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
/// </summary>
/// <param name="stream">
/// A stream of the file contents, this must support reading and seeking.
/// The PdfDocument will not dispose of the provided stream.
/// </param>
/// <param name="options">Optional parameters controlling parsing.</param>
/// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
public static PdfDocument Open(Stream stream, ParsingOptions options = null)
{
    // All opening logic lives in the factory; this is only the public entry point.
    return PdfDocumentFactory.Open(stream, options);
}
2017-11-10 03:14:09 +08:00
2023-11-02 16:12:46 +08:00
/// <summary>
/// Registers the provided page factory instance with the pages of this document.
/// </summary>
/// <remarks>
/// Presumably the registered factory is what <c>GetPage&lt;TPage&gt;</c> uses to build pages of
/// type <typeparamref name="TPage"/>; confirm against the page collection implementation.
/// </remarks>
/// <typeparam name="TPage">The type of page the factory produces.</typeparam>
/// <param name="pageFactory">The page factory to register.</param>
public void AddPageFactory<TPage>(IPageFactory<TPage> pageFactory)
    => pages.AddPageFactory(pageFactory);
/// <summary>
/// Registers a page factory with the pages of this document by type, without the caller
/// supplying an instance.
/// </summary>
/// <remarks>
/// How the <typeparamref name="TPageFactory"/> instance is created is decided by the page
/// collection this call forwards to; confirm there.
/// </remarks>
/// <typeparam name="TPage">The type of page the factory produces.</typeparam>
/// <typeparam name="TPageFactory">The type of the page factory to register.</typeparam>
public void AddPageFactory<TPage, TPageFactory>() where TPageFactory : IPageFactory<TPage>
    => pages.AddPageFactory<TPage, TPageFactory>();
2017-12-28 21:14:03 +08:00
/// <summary>
/// Get the page with the specified page number (1 indexed).
/// </summary>
/// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
/// <returns>The page.</returns>
/// <exception cref="ObjectDisposedException">The document has already been disposed.</exception>
/// <exception cref="PdfDocumentEncryptedException">
/// Retrieving the page failed and the document is encrypted; the original error is passed to the exception.
/// </exception>
public Page GetPage(int pageNumber)
{
    if (isDisposed)
    {
        // The single-string constructor treats its argument as the object name, not the message,
        // so the (objectName, message) overload is used to surface the text correctly.
        throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access page after the document is disposed.");
    }

    parsingOptions.Logger.Debug($"Accessing page {pageNumber}.");

    try
    {
        return pages.GetPage(pageNumber, namedDestinations, parsingOptions);
    }
    catch (Exception ex) when (IsEncrypted)
    {
        // Only wrap failures for encrypted documents; anything else propagates untouched.
        throw new PdfDocumentEncryptedException("Document was encrypted which may have caused error when retrieving page.", encryptionDictionary, ex);
    }
}
2019-01-02 01:44:46 +08:00
2023-11-02 16:12:46 +08:00
/// <summary>
/// Get the page with the specified page number (1 indexed), using the specified page factory.
/// </summary>
/// <typeparam name="TPage">The type of page to return.</typeparam>
/// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
/// <returns>The page.</returns>
/// <exception cref="ObjectDisposedException">The document has already been disposed.</exception>
/// <exception cref="PdfDocumentEncryptedException">
/// Retrieving the page failed and the document is encrypted; the original error is passed to the exception.
/// </exception>
public TPage GetPage<TPage>(int pageNumber)
{
    if (isDisposed)
    {
        // The single-string constructor treats its argument as the object name, not the message,
        // so the (objectName, message) overload is used to surface the text correctly.
        throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access page after the document is disposed.");
    }

    parsingOptions.Logger.Debug($"Accessing page {pageNumber}.");

    try
    {
        return pages.GetPage<TPage>(pageNumber, namedDestinations, parsingOptions);
    }
    catch (Exception ex) when (IsEncrypted)
    {
        // Only wrap failures for encrypted documents; anything else propagates untouched.
        throw new PdfDocumentEncryptedException("Document was encrypted which may have caused error when retrieving page.", encryptionDictionary, ex);
    }
}
2019-08-24 02:09:33 +08:00
/// <summary>
/// Gets all pages in this document in order.
/// </summary>
/// <remarks>
/// The sequence is produced lazily: each page is retrieved through <see cref="GetPage(int)"/>
/// only when the enumeration reaches it.
/// </remarks>
/// <returns>The pages, from page 1 up to <see cref="NumberOfPages"/>.</returns>
public IEnumerable<Page> GetPages()
{
    // Page numbers are 1 indexed, so count from 1 up to and including the page count.
    for (var pageNumber = 1; pageNumber <= NumberOfPages; pageNumber++)
    {
        yield return GetPage(pageNumber);
    }
}
2023-11-02 16:12:46 +08:00
/// <summary>
/// Gets all pages in this document in order, using the specified page factory.
/// </summary>
/// <remarks>
/// The sequence is produced lazily: each page is retrieved through <c>GetPage&lt;TPage&gt;</c>
/// only when the enumeration reaches it.
/// </remarks>
/// <typeparam name="TPage">The type of page to return.</typeparam>
/// <returns>The pages, from page 1 up to <see cref="NumberOfPages"/>.</returns>
public IEnumerable<TPage> GetPages<TPage>()
{
    // Page numbers are 1 indexed, so count from 1 up to and including the page count.
    for (var pageNumber = 1; pageNumber <= NumberOfPages; pageNumber++)
    {
        yield return GetPage<TPage>(pageNumber);
    }
}
2019-08-11 19:41:51 +08:00
/// <summary>
/// Get the document level metadata if present.
/// The metadata is XML in the (Extensible Metadata Platform) XMP format.
/// </summary>
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
/// <param name="metadata">The metadata stream if it exists, otherwise <see langword="null"/>.</param>
/// <returns><see langword="true"/> if the metadata is present, <see langword="false"/> otherwise.</returns>
/// <exception cref="ObjectDisposedException">The document has already been disposed.</exception>
public bool TryGetXmpMetadata(out XmpMetadata metadata)
{
    if (isDisposed)
    {
        // The single-string constructor treats its argument as the object name, not the message,
        // so the (objectName, message) overload is used to surface the text correctly.
        throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access the document metadata after the document is disposed.");
    }

    // The metadata, when present, is a stream referenced from the catalog dictionary.
    if (!Structure.Catalog.CatalogDictionary.TryGet(NameToken.Metadata, pdfScanner, out StreamToken xmpStreamToken))
    {
        metadata = null;
        return false;
    }

    metadata = new XmpMetadata(xmpStreamToken, filterProvider, pdfScanner);

    return true;
}
2019-11-04 23:11:54 +08:00
/// <summary>
/// Gets the bookmarks if this document contains some.
/// </summary>
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
/// <param name="bookmarks">The bookmarks returned by the bookmarks provider, or <see langword="null"/> if there are none.</param>
/// <returns><see langword="true"/> if bookmarks were found, <see langword="false"/> otherwise.</returns>
/// <exception cref="ObjectDisposedException">The document has already been disposed.</exception>
public bool TryGetBookmarks(out Bookmarks bookmarks)
{
    if (isDisposed)
    {
        // The single-string constructor treats its argument as the object name, not the message,
        // so the (objectName, message) overload is used to surface the text correctly.
        throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access the bookmarks after the document is disposed.");
    }

    bookmarks = bookmarksProvider.GetBookmarks(Structure.Catalog);

    return bookmarks != null;
}
2019-01-02 01:44:46 +08:00
/// <summary>
/// Gets the form if this document contains one.
/// </summary>
/// <remarks>
/// This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.
/// The form is created the first time it is requested and the same value is returned afterwards.
/// </remarks>
/// <param name="form">The <see cref="AcroForm"/> from the document or <see langword="null"/> if not present.</param>
/// <returns><see langword="true"/> if the document contains a form, <see langword="false"/> otherwise.</returns>
/// <exception cref="ObjectDisposedException">The document has already been disposed.</exception>
public bool TryGetForm(out AcroForm form)
{
    if (isDisposed)
    {
        // The single-string constructor treats its argument as the object name, not the message,
        // so the (objectName, message) overload is used to surface the text correctly.
        throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access the form after the document is disposed.");
    }

    form = documentForm.Value;

    return form != null;
}
2023-10-24 04:31:32 +08:00
2018-01-11 04:31:38 +08:00
/// <inheritdoc />
/// <summary>
/// Dispose the <see cref="T:UglyToad.PdfPig.PdfDocument" /> and close any unmanaged resources.
/// </summary>
/// <remarks>
/// Calling this more than once is safe; calls after the first do nothing.
/// A failure while releasing one resource is logged and does not prevent the remaining
/// resources from being released.
/// </remarks>
public void Dispose()
{
    if (isDisposed)
    {
        return;
    }

    try
    {
        // Each resource is released independently so that one failure cannot leak the others.
        Release(() => Advanced.Dispose());
        Release(() => pdfScanner.Dispose());
        Release(() => inputBytes.Dispose());
        Release(() => pages.Dispose());
    }
    finally
    {
        isDisposed = true;
    }

    // Runs a single dispose action, logging instead of propagating any error it raises.
    void Release(Action dispose)
    {
        try
        {
            dispose();
        }
        catch (Exception ex)
        {
            // parsingOptions is not validated by the constructor; Dispose must not throw, so
            // tolerate it being absent rather than failing while reporting another failure.
            parsingOptions?.Logger.Error("Failed disposing the PdfDocument due to an error.", ex);
        }
    }
}
}
}