#57 add access to document metadata and expose wrapper type

2025-12-21 19:29:51 +08:00 · 2019-08-11 12:41:51 +01:00
parent 2d6e49426a
commit 0349bedd3e
8 changed files with 120 additions and 7 deletions
--- a/src/UglyToad.PdfPig.Tests/Integration/CatGeneticsTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/CatGeneticsTests.cs
@@ -21,11 +21,10 @@
            }
        }
        [Fact]
        public void CanGetAnnotations()
        {
-            using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions { UseLenientParsing = false }))
+            using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
            {
                var page = document.GetPage(1);
--- a/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs
@@ -4,6 +4,7 @@
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using PdfPig.Util;
    using Xunit;
    public class LaTexTests
@@ -16,7 +17,7 @@
        [Fact]
        public void CanReadContent()
        {
-            using (var document = PdfDocument.Open(GetFilename()))
+            using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
            {
                var page = document.GetPage(1);
@@ -125,6 +126,25 @@ used per estimate, we introduce a “complement class”
Naive Bayes is often us
                }
            }
        }
        [Fact]
        public void CanGetMetadata()
        {
            const string expectedPrefix = "<?xpacket begin='' id='W5M0MpCehiHzreSzNTczkc9d'";

            using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
            {
                // The document is expected to expose a document-level XMP packet.
                Assert.True(document.TryGetXmpMetadata(out var metadata));

                // The packet must be loadable as XML.
                Assert.NotNull(metadata.GetXDocument());

                // The decoded stream bytes should begin with the xpacket processing instruction.
                var decodedBytes = metadata.GetXmlBytes().ToArray();
                var text = OtherEncodings.BytesAsLatin1String(decodedBytes);

                Assert.StartsWith(expectedPrefix, text);
            }
        }
        private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
        {
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -55,6 +55,7 @@
                "UglyToad.PdfPig.Content.Word",
                "UglyToad.PdfPig.Content.TextLine",
                "UglyToad.PdfPig.Content.TextDirection",
                "UglyToad.PdfPig.Content.XmpMetadata",
                "UglyToad.PdfPig.Core.TransformationMatrix",
                "UglyToad.PdfPig.CrossReference.CrossReferenceTable",
                "UglyToad.PdfPig.CrossReference.CrossReferenceType",
--- a/src/UglyToad.PdfPig/Content/XmpMetadata.cs
+++ b/src/UglyToad.PdfPig/Content/XmpMetadata.cs
@@ -0,0 +1,50 @@
 namespace UglyToad.PdfPig.Content
 {
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Xml.Linq;
    using Filters;
    using Tokens;
    using Util;
    using Util.JetBrains.Annotations;
    /// <summary>
    /// Wraps an XML based Extensible Metadata Platform (XMP) document. These XML documents are embedded in PDFs to provide metadata
    /// about objects (the entire document, images, etc). They can be present as plain text or encoded/encrypted streams.
    /// </summary>
    public class XmpMetadata
    {
        private readonly IFilterProvider filterProvider;

        /// <summary>
        /// The underlying <see cref="StreamToken"/> for this metadata.
        /// </summary>
        [NotNull]
        public StreamToken MetadataStreamToken { get; }

        /// <summary>
        /// Create a new <see cref="XmpMetadata"/> wrapping the given stream.
        /// </summary>
        /// <param name="stream">The stream token holding the XMP data.</param>
        /// <param name="filterProvider">The filter provider passed to the stream when decoding it.</param>
        /// <exception cref="ArgumentNullException">If either argument is <see langword="null"/>.</exception>
        internal XmpMetadata(StreamToken stream, IFilterProvider filterProvider)
        {
            this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
            MetadataStreamToken = stream ?? throw new ArgumentNullException(nameof(stream));
        }

        /// <summary>
        /// Get the decoded bytes for the metadata stream. This can be interpreted as a sequence of plain-text bytes.
        /// </summary>
        /// <returns>The bytes for the metadata object with any filters removed.</returns>
        public IReadOnlyList<byte> GetXmlBytes()
        {
            return MetadataStreamToken.Decode(filterProvider);
        }

        /// <summary>
        /// Gets the metadata stream as an <see cref="XDocument"/>.
        /// </summary>
        /// <remarks>
        /// The decoded bytes are handed to the XML reader as a stream so that it can determine the text encoding itself
        /// (byte order mark, XML declaration, otherwise UTF-8). If that fails the bytes are interpreted as Latin-1 text instead.
        /// </remarks>
        /// <returns>The <see cref="XDocument"/> for the XMP XML.</returns>
        /// <exception cref="System.Xml.XmlException">If the metadata cannot be parsed as XML by either approach.</exception>
        public XDocument GetXDocument()
        {
            var bytes = GetXmlBytes().ToArray();

            try
            {
                // Decoding as Latin-1 up front would corrupt multi-byte (e.g. UTF-8) characters,
                // so let the reader detect the encoding from the raw bytes.
                using (var stream = new System.IO.MemoryStream(bytes, false))
                {
                    return XDocument.Load(stream);
                }
            }
            catch (System.Xml.XmlException)
            {
                // Keep accepting input which only parses when treated as single-byte Latin-1 text.
                return XDocument.Parse(OtherEncodings.BytesAsLatin1String(bytes));
            }
        }
    }
 }
--- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
@@ -141,6 +141,7 @@
            return new PdfDocument(log, inputBytes, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
                encryptionDictionary,
                pdfScanner,
                filterProvider,
                acroFormFactory);
        }
--- a/src/UglyToad.PdfPig/ParsingOptions.cs
+++ b/src/UglyToad.PdfPig/ParsingOptions.cs
@@ -7,6 +7,14 @@
    /// </summary>
    public class ParsingOptions
    {
        /// <summary>
        /// A default <see cref="ParsingOptions"/> with <see cref="UseLenientParsing"/> set to false.
        /// </summary>
        /// <remarks>
        /// A new instance is returned on every access so that a caller changing the options it received
        /// cannot alter what other callers get from this property.
        /// </remarks>
        public static ParsingOptions LenientParsingOff => new ParsingOptions
        {
            UseLenientParsing = false
        };
        /// <summary>
        /// Should the parser ignore issues where the document does not conform to the PDF specification?
        /// </summary>
--- a/src/UglyToad.PdfPig/PdfDocument.cs
+++ b/src/UglyToad.PdfPig/PdfDocument.cs
@@ -7,10 +7,12 @@
    using CrossReference;
    using Encryption;
    using Exceptions;
    using Filters;
    using IO;
    using Logging;
    using Parser;
    using Tokenization.Scanner;
    using Tokens;
    using Util.JetBrains.Annotations;
    /// <inheritdoc />
@@ -40,6 +42,8 @@
        [NotNull]
        private readonly IPdfTokenScanner pdfScanner;
        private readonly IFilterProvider filterProvider;
        [NotNull]
        private readonly Pages pages;
@@ -81,6 +85,7 @@
            DocumentInformation information, 
            EncryptionDictionary encryptionDictionary,
            IPdfTokenScanner pdfScanner,
            IFilterProvider filterProvider,
            AcroFormFactory acroFormFactory)
        {
            this.log = log;
@@ -90,6 +95,7 @@
            this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
            this.encryptionDictionary = encryptionDictionary;
            this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
            this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
            Information = information ?? throw new ArgumentNullException(nameof(information));
            pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner);
            Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
@@ -153,9 +159,36 @@
            }
        }
        /// <summary>
        /// Get the document level metadata if present.
        /// The metadata is XML in the Extensible Metadata Platform (XMP) format.
        /// </summary>
        /// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
        /// <param name="metadata">The metadata stream if it exists, <see langword="null"/> otherwise.</param>
        /// <returns><see langword="true"/> if the metadata is present, <see langword="false"/> otherwise.</returns>
        /// <exception cref="ObjectDisposedException">If the document has already been disposed.</exception>
        public bool TryGetXmpMetadata(out XmpMetadata metadata)
        {
            if (isDisposed)
            {
                // The single-string constructor treats its argument as the object name rather than the message,
                // so pass both explicitly.
                throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access the document metadata after the document is disposed.");
            }

            metadata = null;

            // The metadata is looked up as a stream under the Metadata entry of the catalog dictionary.
            if (!Structure.Catalog.CatalogDictionary.TryGet(NameToken.Metadata, pdfScanner, out StreamToken xmpStreamToken))
            {
                return false;
            }

            metadata = new XmpMetadata(xmpStreamToken, filterProvider);

            return true;
        }
        /// <summary>
        /// Gets the form if this document contains one.
        /// </summary>
        /// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
        /// <returns>An <see cref="AcroForm"/> from the document or <see langword="null"/> if not present.</returns>
        internal AcroForm GetForm()
        {
--- a/src/UglyToad.PdfPig/Tokens/NameToken.Constants.cs
+++ b/src/UglyToad.PdfPig/Tokens/NameToken.Constants.cs
@@ -552,11 +552,12 @@
        public static readonly NameToken WinAnsiEncoding = new NameToken("WinAnsiEncoding");
        // X
        // Each name is declared exactly once, in case-insensitive alphabetical order of the identifier.
        public static readonly NameToken Xfa = new NameToken("XFA");
        public static readonly NameToken Xheight = new NameToken("XHeight");
        public static readonly NameToken Xml = new NameToken("XML");
        public static readonly NameToken Xobject = new NameToken("XObject");
        public static readonly NameToken Xref = new NameToken("XRef");
        public static readonly NameToken XrefStm = new NameToken("XRefStm");
        public static readonly NameToken XStep = new NameToken("XStep");
        // Y
        public static readonly NameToken YStep = new NameToken("YStep");
        public static readonly NameToken Yes = new NameToken("Yes");