PdfPig/src/UglyToad.PdfPig/PdfDocument.cs

namespace UglyToad.PdfPig
{
    using System;
    using System.IO;
    using AcroForms;
    using Content;
    using CrossReference;
    using IO;
    using Logging;
    using Parser;
    using Tokenization.Scanner;
    using Util.JetBrains.Annotations;

    /// <inheritdoc />
    /// <summary>
    /// Provides access to document level information for this PDF document as well as access to the <see cref="T:UglyToad.PdfPig.Content.Page"/>s contained in the document.
    /// </summary>
    public class PdfDocument : IDisposable
    {
        private bool isDisposed;
        private readonly Lazy<AcroForm> documentForm;

        private readonly bool isLenientParsing;

        [NotNull]
        private readonly HeaderVersion version;

        private readonly ILog log;

        private readonly IInputBytes inputBytes;

        [NotNull]
        private readonly ParsingCachingProviders cachingProviders;

        private readonly IPdfTokenScanner pdfScanner;

        [NotNull]
        private readonly Pages pages;

        /// <summary>
        /// The metadata associated with this document.
        /// </summary>
        [NotNull]
        public DocumentInformation Information { get; }

        /// <summary>
        /// Access to the underlying raw structure of the document.
        /// </summary>
        [NotNull]
        public Structure Structure { get; }

        /// <summary>
        /// The version number of the PDF specification which this file conforms to, for example 1.4.
        /// </summary>
        public decimal Version => version.Version;

        /// <summary>
        /// Get the number of pages in this document.
        /// </summary>
        public int NumberOfPages => pages.Count;

        /// <summary>
        /// Creates a <see cref="PdfDocument"/> from the already parsed components of a file.
        /// The form is not read here, it is loaded on the first call to <see cref="GetForm"/>.
        /// </summary>
        /// <exception cref="ArgumentNullException">
        /// Thrown if <paramref name="version"/>, <paramref name="cachingProviders"/>,
        /// <paramref name="pdfScanner"/> or <paramref name="information"/> is <see langword="null"/>.
        /// </exception>
        internal PdfDocument(ILog log,
            IInputBytes inputBytes,
            HeaderVersion version,
            CrossReferenceTable crossReferenceTable,
            bool isLenientParsing,
            ParsingCachingProviders cachingProviders,
            IPageFactory pageFactory,
            Catalog catalog,
            DocumentInformation information, IPdfTokenScanner pdfScanner,
            AcroFormFactory acroFormFactory)
        {
            this.log = log;
            this.inputBytes = inputBytes;
            this.version = version ?? throw new ArgumentNullException(nameof(version));
            this.isLenientParsing = isLenientParsing;
            this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
            this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
            Information = information ?? throw new ArgumentNullException(nameof(information));
            pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner);
            Structure = new Structure(catalog, crossReferenceTable, pdfScanner);

            // Deferred so that the form is only built if a caller asks for it.
            documentForm = new Lazy<AcroForm>(() => acroFormFactory.GetAcroForm(catalog));
        }

        /// <summary>
        /// Creates a <see cref="PdfDocument"/> for reading from the provided file bytes.
        /// </summary>
        /// <param name="fileBytes">The bytes of the PDF file.</param>
        /// <param name="options">Optional parameters controlling parsing.</param>
        /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
        public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options);

        /// <summary>
        /// Opens a file and creates a <see cref="PdfDocument"/> for reading from the provided file path.
        /// </summary>
        /// <param name="filePath">The full path to the file location of the PDF file.</param>
        /// <param name="options">Optional parameters controlling parsing.</param>
        /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
        public static PdfDocument Open(string filePath, ParsingOptions options = null) => PdfDocumentFactory.Open(filePath, options);

        /// <summary>
        /// Creates a <see cref="PdfDocument"/> for reading from the provided stream.
        /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
        /// </summary>
        /// <param name="stream">
        /// A stream of the file contents, this must support reading and seeking.
        /// The PdfDocument will not dispose of the provided stream.
        /// </param>
        /// <param name="options">Optional parameters controlling parsing.</param>
        /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
        public static PdfDocument Open(Stream stream, ParsingOptions options = null) => PdfDocumentFactory.Open(stream, options);

        /// <summary>
        /// Get the page with the specified page number (1 indexed).
        /// </summary>
        /// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
        /// <returns>The page.</returns>
        /// <exception cref="ObjectDisposedException">Thrown if the document has already been disposed.</exception>
        public Page GetPage(int pageNumber)
        {
            if (isDisposed)
            {
                // The two argument overload is required: the single string overload
                // treats its argument as the object name rather than the message.
                throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access page after the document is disposed.");
            }

            log.Debug($"Accessing page {pageNumber}.");

            return pages.GetPage(pageNumber);
        }

        /// <summary>
        /// Gets the form if this document contains one.
        /// </summary>
        /// <returns>An <see cref="AcroForm"/> from the document or <see langword="null"/> if not present.</returns>
        /// <exception cref="ObjectDisposedException">Thrown if the document has already been disposed.</exception>
        internal AcroForm GetForm()
        {
            if (isDisposed)
            {
                // The two argument overload is required: the single string overload
                // treats its argument as the object name rather than the message.
                throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access the form after the document is disposed.");
            }

            return documentForm.Value;
        }

        /// <inheritdoc />
        /// <summary>
        /// Dispose the <see cref="T:UglyToad.PdfPig.PdfDocument" /> and close any unmanaged resources.
        /// Calling this more than once has no further effect.
        /// </summary>
        public void Dispose()
        {
            // Only dispose the input bytes once, repeated calls are a no-op.
            if (isDisposed)
            {
                return;
            }

            try
            {
                inputBytes.Dispose();
            }
            catch (Exception ex)
            {
                // Dispose should not throw, so the failure is logged instead.
                log.Error("Failed disposing the PdfDocument due to an error.", ex);
            }
            finally
            {
                // Marked as disposed even if disposing the input bytes failed.
                isDisposed = true;
            }
        }
    }
}
-												change the project name to something silly

											
										
										
											2018-01-11 03:49:32 +08:00
+								namespace UglyToad.PdfPig
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								{
 								    using System;
-												support opening from stream and improve performance of brute force searching since the seek operation is now slower.

											
										
										
											2018-01-22 03:34:21 +08:00
+								    using System.IO;
-												#24 start adding classes for the acroform api

											
										
										
											2019-01-02 01:44:46 +08:00
+								    using AcroForms;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								    using Content;
-												#10 make all token classes public and expose via a public structure member on pdf document

											
										
										
											2018-11-25 03:02:06 +08:00
+								    using CrossReference;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								    using IO;
-												create Type0 font, notes about font format, heavy duty refactoring to inject dependencies rather than god object

											
										
										
											2017-12-23 07:54:54 +08:00
+								    using Logging;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								    using Parser;
-												start passing the pdf scanner in to read the type 1 files

											
										
										
											2018-01-14 23:33:22 +08:00
+								    using Tokenization.Scanner;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								    using Util.JetBrains.Annotations;
-												encapsulation for internal classes, remove old code, document public api

											
										
										
											2018-01-07 20:37:48 +08:00
+								    /// <inheritdoc />
 								    /// <summary>
-												fix a bug with tokenization without spaces before string

											
										
										
											2018-01-11 06:15:29 +08:00
+								    /// Provides access to document level information for this PDF document as well as access to the <see cref="T:UglyToad.PdfPig.Content.Page"/>s contained in the document.
-												encapsulation for internal classes, remove old code, document public api

											
										
										
											2018-01-07 20:37:48 +08:00
+								    /// </summary>
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								    public class PdfDocument : IDisposable
 								    {
-												Moved some internal properties to fields

											
										
										
											2018-03-31 05:02:05 +08:00
+								        private bool isDisposed;
-												#24 start adding classes for the acroform api

											
										
										
											2019-01-02 01:44:46 +08:00
+								        private readonly Lazy<AcroForm> documentForm;
-												Moved some internal properties to fields

											
										
										
											2018-03-31 05:02:05 +08:00
 								        private readonly bool isLenientParsing;
-												support opening from stream and improve performance of brute force searching since the seek operation is now slower.

											
										
										
											2018-01-22 03:34:21 +08:00
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								        [NotNull]
 								        private readonly HeaderVersion version;
-												#10 make all token classes public and expose via a public structure member on pdf document

											
										
										
											2018-11-25 03:02:06 +08:00
-												create Type0 font, notes about font format, heavy duty refactoring to inject dependencies rather than god object

											
										
										
											2017-12-23 07:54:54 +08:00
+								        private readonly ILog log;
-												Moved some internal properties to fields

											
										
										
											2018-03-31 05:02:05 +08:00
-												support opening from stream and improve performance of brute force searching since the seek operation is now slower.

											
										
										
											2018-01-22 03:34:21 +08:00
+								        private readonly IInputBytes inputBytes;
-												Moved some internal properties to fields

											
										
										
											2018-03-31 05:02:05 +08:00
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								        [NotNull]
 								        private readonly ParsingCachingProviders cachingProviders;
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								        private readonly IPdfTokenScanner pdfScanner;
-												#10 make all token classes public and expose via a public structure member on pdf document

											
										
										
											2018-11-25 03:02:06 +08:00
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								        [NotNull]
-												Moved some internal properties to fields

											
										
										
											2018-03-31 05:02:05 +08:00
+								        private readonly Pages pages;
-												encapsulate the internals better and improve the api for pdfdocument, delete old code and tidy tests. expand readme

											
										
										
											2017-12-28 21:14:03 +08:00
-												add missing doc comments

											
										
										
											2018-01-11 04:31:38 +08:00
+								        /// <summary>
 								        /// The metadata associated with this document.
 								        /// </summary>
-												move catalog parsing to its own factory. parse document information if present and expose publically. add test for itext generated document

											
										
										
											2018-01-03 07:26:58 +08:00
+								        [NotNull]
 								        public DocumentInformation Information { get; }
-												#10 make all token classes public and expose via a public structure member on pdf document

											
										
										
											2018-11-25 03:02:06 +08:00
+								        /// <summary>
 								        /// Access to the underlying raw structure of the document.
 								        /// </summary>
 								        [NotNull]
-												#11 early access to the raw xobjects for images.

temporary 'safe' untested implementation of seac for type 1 charstrings.
make structure public
bump version of package and project to 0.0.3 (it had accidentally increased to 0.0.5)

											
										
										
											2018-11-27 03:46:41 +08:00
+								        public Structure Structure { get; }
-												#10 make all token classes public and expose via a public structure member on pdf document

											
										
										
											2018-11-25 03:02:06 +08:00
-												make tokenizer classes internal and change the file header to use a scanner rather than the pdfbox type reader

											
										
										
											2018-01-04 04:15:25 +08:00
+								        /// <summary>
 								        /// The version number of the PDF specification which this file conforms to, for example 1.4.
 								        /// </summary>
 								        public decimal Version => version.Version;
-												encapsulate the internals better and improve the api for pdfdocument, delete old code and tidy tests. expand readme

											
										
										
											2017-12-28 21:14:03 +08:00
+								        /// <summary>
 								        /// Get the number of pages in this document.
 								        /// </summary>
-												Moved some internal properties to fields

											
										
										
											2018-03-31 05:02:05 +08:00
+								        public int NumberOfPages => pages.Count;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
-												support opening from stream and improve performance of brute force searching since the seek operation is now slower.

											
										
										
											2018-01-22 03:34:21 +08:00
+								        internal PdfDocument(ILog log,
 								            IInputBytes inputBytes,
 								            HeaderVersion version,
 								            CrossReferenceTable crossReferenceTable,
-												start passing the pdf scanner in to read the type 1 files

											
										
										
											2018-01-14 23:33:22 +08:00
+								            bool isLenientParsing,
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								            ParsingCachingProviders cachingProviders,
-												create Type0 font, notes about font format, heavy duty refactoring to inject dependencies rather than god object

											
										
										
											2017-12-23 07:54:54 +08:00
+								            IPageFactory pageFactory,
-												move catalog parsing to its own factory. parse document information if present and expose publically. add test for itext generated document

											
										
										
											2018-01-03 07:26:58 +08:00
+								            Catalog catalog,
-												#24 start adding classes for the acroform api

											
										
										
											2019-01-02 01:44:46 +08:00
+								            DocumentInformation information, IPdfTokenScanner pdfScanner,
 								            AcroFormFactory acroFormFactory)
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								        {
-												create Type0 font, notes about font format, heavy duty refactoring to inject dependencies rather than god object

											
										
										
											2017-12-23 07:54:54 +08:00
+								            this.log = log;
-												support opening from stream and improve performance of brute force searching since the seek operation is now slower.

											
										
										
											2018-01-22 03:34:21 +08:00
+								            this.inputBytes = inputBytes;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								            this.version = version ?? throw new ArgumentNullException(nameof(version));
 								            this.isLenientParsing = isLenientParsing;
 								            this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
-												#10 make all token classes public and expose via a public structure member on pdf document

											
										
										
											2018-11-25 03:02:06 +08:00
+								            this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
-												move catalog parsing to its own factory. parse document information if present and expose publically. add test for itext generated document

											
										
										
											2018-01-03 07:26:58 +08:00
+								            Information = information ?? throw new ArgumentNullException(nameof(information));
-												Moved some internal properties to fields

											
										
										
											2018-03-31 05:02:05 +08:00
+								            pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner);
-												#10 make all token classes public and expose via a public structure member on pdf document

											
										
										
											2018-11-25 03:02:06 +08:00
+								            Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
-												#24 start adding classes for the acroform api

											
										
										
											2019-01-02 01:44:46 +08:00
+								            documentForm = new Lazy<AcroForm>(() => acroFormFactory.GetAcroForm(catalog));
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								        }
-												throw informative exception when the document is encrypted

											
										
										
											2018-01-09 06:43:48 +08:00
+								        /// <summary>
 								        /// Creates a <see cref="PdfDocument"/> for reading from the provided file bytes.
 								        /// </summary>
 								        /// <param name="fileBytes">The bytes of the PDF file.</param>
 								        /// <param name="options">Optional parameters controlling parsing.</param>
 								        /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								        public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options);
-												Moved some internal properties to fields

											
										
										
											2018-03-31 05:02:05 +08:00
-												throw informative exception when the document is encrypted

											
										
										
											2018-01-09 06:43:48 +08:00
+								        /// <summary>
 								        /// Opens a file and creates a <see cref="PdfDocument"/> for reading from the provided file path.
 								        /// </summary>
 								        /// <param name="filePath">The full path to the file location of the PDF file.</param>
 								        /// <param name="options">Optional parameters controlling parsing.</param>
 								        /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
 								        public static PdfDocument Open(string filePath, ParsingOptions options = null) => PdfDocumentFactory.Open(filePath, options);
-												Moved some internal properties to fields

											
										
										
											2018-03-31 05:02:05 +08:00
-												support opening from stream and improve performance of brute force searching since the seek operation is now slower.

											
										
										
											2018-01-22 03:34:21 +08:00
+								        /// <summary>
 								        /// Creates a <see cref="PdfDocument"/> for reading from the provided stream.
 								        /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
 								        /// </summary>
 								        /// <param name="stream">
 								        /// A stream of the file contents, this must support reading and seeking.
 								        /// The PdfDocument will not dispose of the provided stream.
 								        /// </param>
 								        /// <param name="options">Optional parameters controlling parsing.</param>
 								        /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
 								        public static PdfDocument Open(Stream stream, ParsingOptions options = null) => PdfDocumentFactory.Open(stream, options);
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
-												encapsulate the internals better and improve the api for pdfdocument, delete old code and tidy tests. expand readme

											
										
										
											2017-12-28 21:14:03 +08:00
+								        /// <summary>
-												#16 #19 tidy up letter api and check/fix type 1 cff positions

											
										
										
											2018-11-24 22:38:44 +08:00
+								        /// Get the page with the specified page number (1 indexed).
-												encapsulate the internals better and improve the api for pdfdocument, delete old code and tidy tests. expand readme

											
										
										
											2017-12-28 21:14:03 +08:00
+								        /// </summary>
 								        /// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
 								        /// <returns>The page.</returns>
 								        public Page GetPage(int pageNumber)
 								        {
-												support opening from stream and improve performance of brute force searching since the seek operation is now slower.

											
										
										
											2018-01-22 03:34:21 +08:00
+								            if (isDisposed)
 								            {
 								                throw new ObjectDisposedException("Cannot access page after the document is disposed.");
 								            }
 								            log.Debug($"Accessing page {pageNumber}.");
-												Moved some internal properties to fields

											
										
										
											2018-03-31 05:02:05 +08:00
+								            return pages.GetPage(pageNumber);
-												encapsulate the internals better and improve the api for pdfdocument, delete old code and tidy tests. expand readme

											
										
										
											2017-12-28 21:14:03 +08:00
+								        }
-												#24 start adding classes for the acroform api

											
										
										
											2019-01-02 01:44:46 +08:00
 								        /// <summary>
 								        /// Gets the form if this document contains one.
 								        /// </summary>
 								        /// <returns>An <see cref="AcroForm"/> from the document or <see langword="null"/> if not present.</returns>
 								        internal AcroForm GetForm()
 								        {
 								            if (isDisposed)
 								            {
 								                throw new ObjectDisposedException("Cannot access the form after the document is disposed.");
 								            }
 								            return documentForm.Value;
 								        }
-												#10 make all token classes public and expose via a public structure member on pdf document

											
										
										
											2018-11-25 03:02:06 +08:00
-												add missing doc comments

											
										
										
											2018-01-11 04:31:38 +08:00
+								        /// <inheritdoc />
 								        /// <summary>
 								        /// Dispose the <see cref="T:UglyToad.PdfPig.PdfDocument" /> and close any unmanaged resources.
 								        /// </summary>
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								        public void Dispose()
 								        {
 								            try
 								            {
-												support opening from stream and improve performance of brute force searching since the seek operation is now slower.

											
										
										
											2018-01-22 03:34:21 +08:00
+								                inputBytes.Dispose();
 								            }
 								            catch (Exception ex)
 								            {
 								                log.Error("Failed disposing the PdfDocument due to an error.", ex);
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								            }
-												support opening from stream and improve performance of brute force searching since the seek operation is now slower.

											
										
										
											2018-01-22 03:34:21 +08:00
+								            finally
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								            {
-												support opening from stream and improve performance of brute force searching since the seek operation is now slower.

											
										
										
											2018-01-22 03:34:21 +08:00
+								                isDisposed = true;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								            }
 								        }
 								    }
 								}