#57 add access to document metadata and expose wrapper type

2025-10-15 19:54:52 +08:00 · 2019-08-11 12:41:51 +01:00
parent 2d6e49426a
commit 0349bedd3e
8 changed files with 120 additions and 7 deletions
--- a/src/UglyToad.PdfPig.Tests/Integration/CatGeneticsTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/CatGeneticsTests.cs
@@ -20,12 +20,11 @@
                Assert.Contains("catus", page.Text);
            }
        }
-
-
+        
        [Fact]
        public void CanGetAnnotations()
        {
-            using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions { UseLenientParsing = false }))
+            using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
            {
                var page = document.GetPage(1);

--- a/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs
@@ -4,6 +4,7 @@
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
+    using PdfPig.Util;
    using Xunit;

    public class LaTexTests
@@ -16,7 +17,7 @@
        [Fact]
        public void CanReadContent()
        {
-            using (var document = PdfDocument.Open(GetFilename()))
+            using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
            {
                var page = document.GetPage(1);

@@ -125,6 +126,25 @@ used per estimate, we introduce a “complement class”
Naive Bayes is often us
                }
            }
        }
+
+        [Fact]
+        public void CanGetMetadata()
+        {
+            using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
+            {
+                // The sample document is expected to expose document-level XMP metadata.
+                Assert.True(pdf.TryGetXmpMetadata(out var xmp));
+
+                // The metadata must be parseable as an XML document.
+                Assert.NotNull(xmp.GetXDocument());
+
+                // The decoded stream is expected to begin with the XMP packet header.
+                var rawBytes = xmp.GetXmlBytes().ToArray();
+                var packetText = OtherEncodings.BytesAsLatin1String(rawBytes);
+
+                Assert.StartsWith("<?xpacket begin='' id='W5M0MpCehiHzreSzNTczkc9d'", packetText);
+            }
+        }

        private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
        {
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -55,6 +55,7 @@
                "UglyToad.PdfPig.Content.Word",
                "UglyToad.PdfPig.Content.TextLine",
                "UglyToad.PdfPig.Content.TextDirection",
+                "UglyToad.PdfPig.Content.XmpMetadata",
                "UglyToad.PdfPig.Core.TransformationMatrix",
                "UglyToad.PdfPig.CrossReference.CrossReferenceTable",
                "UglyToad.PdfPig.CrossReference.CrossReferenceType",
--- a/src/UglyToad.PdfPig/Content/XmpMetadata.cs
+++ b/src/UglyToad.PdfPig/Content/XmpMetadata.cs
@@ -0,0 +1,71 @@
+namespace UglyToad.PdfPig.Content
+{
+    using System;
+    using System.Collections.Generic;
+    using System.IO;
+    using System.Linq;
+    using System.Xml;
+    using System.Xml.Linq;
+    using Filters;
+    using Tokens;
+    using Util;
+    using Util.JetBrains.Annotations;
+
+    /// <summary>
+    /// Wraps an XML based Extensible Metadata Platform (XMP) document. These XML documents are embedded in PDFs to provide metadata
+    /// about objects (the entire document, images, etc). They can be present as plain text or encoded/encrypted streams.
+    /// </summary>
+    public class XmpMetadata
+    {
+        private readonly IFilterProvider filterProvider;
+
+        /// <summary>
+        /// The underlying <see cref="StreamToken"/> for this metadata.
+        /// </summary>
+        [NotNull]
+        public StreamToken MetadataStreamToken { get; }
+
+        internal XmpMetadata(StreamToken stream, IFilterProvider filterProvider)
+        {
+            this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
+            MetadataStreamToken = stream ?? throw new ArgumentNullException(nameof(stream));
+        }
+
+        /// <summary>
+        /// Get the decoded bytes for the metadata stream. This can be interpreted as a sequence of plain-text bytes.
+        /// </summary>
+        /// <returns>The bytes for the metadata object with any filters removed.</returns>
+        public IReadOnlyList<byte> GetXmlBytes()
+        {
+            return MetadataStreamToken.Decode(filterProvider);
+        }
+
+        /// <summary>
+        /// Gets the metadata stream as an <see cref="XDocument"/>.
+        /// </summary>
+        /// <remarks>
+        /// The text encoding is detected from the bytes by the XML reader (byte order mark or XML declaration, UTF-8 otherwise).
+        /// If the bytes cannot be read that way they are interpreted as Latin-1 text instead.
+        /// </remarks>
+        /// <returns>The <see cref="XDocument"/> for the XMP XML.</returns>
+        public XDocument GetXDocument()
+        {
+            var xmlBytes = GetXmlBytes().ToArray();
+
+            try
+            {
+                // Load from the raw bytes rather than a pre-decoded string so that multi-byte
+                // encodings (XMP packets are usually UTF-8) keep their non-ASCII characters intact.
+                using (var stream = new MemoryStream(xmlBytes, false))
+                {
+                    return XDocument.Load(stream);
+                }
+            }
+            catch (XmlException)
+            {
+                // Fall back to the byte-per-character interpretation for streams the reader cannot decode.
+                return XDocument.Parse(OtherEncodings.BytesAsLatin1String(xmlBytes));
+            }
+        }
+    }
+}
--- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
@@ -140,7 +140,8 @@
            
            return new PdfDocument(log, inputBytes, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
                encryptionDictionary,
-                pdfScanner, 
+                pdfScanner,
+                filterProvider,
                acroFormFactory);
        }

--- a/src/UglyToad.PdfPig/ParsingOptions.cs
+++ b/src/UglyToad.PdfPig/ParsingOptions.cs
@@ -7,6 +7,14 @@
    /// </summary>
    public class ParsingOptions
    {
+        /// <summary>
+        /// A default <see cref="ParsingOptions"/> with <see cref="UseLenientParsing"/> set to false.
+        /// </summary>
+        /// <remarks>
+        /// A new instance is returned on each access so that changes made by one caller are not visible to others.
+        /// </remarks>
+        public static ParsingOptions LenientParsingOff => new ParsingOptions { UseLenientParsing = false };
+
        /// <summary>
        /// Should the parser ignore issues where the document does not conform to the PDF specification?
        /// </summary>
--- a/src/UglyToad.PdfPig/PdfDocument.cs
+++ b/src/UglyToad.PdfPig/PdfDocument.cs
@@ -7,10 +7,12 @@
    using CrossReference;
    using Encryption;
    using Exceptions;
+    using Filters;
    using IO;
    using Logging;
    using Parser;
    using Tokenization.Scanner;
+    using Tokens;
    using Util.JetBrains.Annotations;

    /// <inheritdoc />
@@ -39,7 +41,9 @@

        [NotNull]
        private readonly IPdfTokenScanner pdfScanner;
-        
+
+        private readonly IFilterProvider filterProvider;
+
        [NotNull]
        private readonly Pages pages;

@@ -81,6 +85,7 @@
            DocumentInformation information, 
            EncryptionDictionary encryptionDictionary,
            IPdfTokenScanner pdfScanner,
+            IFilterProvider filterProvider,
            AcroFormFactory acroFormFactory)
        {
            this.log = log;
@@ -90,6 +95,7 @@
            this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
            this.encryptionDictionary = encryptionDictionary;
            this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
+            this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
            Information = information ?? throw new ArgumentNullException(nameof(information));
            pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner);
            Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
@@ -153,9 +159,36 @@
            }
        }

+        /// <summary>
+        /// Get the document level metadata if present.
+        /// The metadata is XML in the Extensible Metadata Platform (XMP) format.
+        /// </summary>
+        /// <param name="metadata">The metadata stream if it exists, <see langword="null"/> otherwise.</param>
+        /// <returns><see langword="true"/> if the metadata is present, <see langword="false"/> otherwise.</returns>
+        /// <exception cref="ObjectDisposedException">Thrown if called on a disposed <see cref="PdfDocument"/>.</exception>
+        public bool TryGetXmpMetadata(out XmpMetadata metadata)
+        {
+            if (isDisposed)
+            {
+                // The single-string constructor treats its argument as the object name, so pass the name and message separately.
+                throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access the document metadata after the document is disposed.");
+            }
+
+            metadata = null;
+
+            if (!Structure.Catalog.CatalogDictionary.TryGet(NameToken.Metadata, pdfScanner, out StreamToken xmpStreamToken))
+            {
+                return false;
+            }
+
+            metadata = new XmpMetadata(xmpStreamToken, filterProvider);
+            return true;
+        }
+
        /// <summary>
        /// Gets the form if this document contains one.
        /// </summary>
+        /// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
        /// <returns>An <see cref="AcroForm"/> from the document or <see langword="null"/> if not present.</returns>
        internal AcroForm GetForm()
        {
--- a/src/UglyToad.PdfPig/Tokens/NameToken.Constants.cs
+++ b/src/UglyToad.PdfPig/Tokens/NameToken.Constants.cs
@@ -552,11 +552,12 @@
        public static readonly NameToken WinAnsiEncoding = new NameToken("WinAnsiEncoding");
        // X
        public static readonly NameToken Xfa = new NameToken("XFA");
-        public static readonly NameToken XStep = new NameToken("XStep");
        public static readonly NameToken Xheight = new NameToken("XHeight");
+        public static readonly NameToken Xml = new NameToken("XML");
        public static readonly NameToken Xobject = new NameToken("XObject");
        public static readonly NameToken Xref = new NameToken("XRef");
        public static readonly NameToken XrefStm = new NameToken("XRefStm");
+        public static readonly NameToken XStep = new NameToken("XStep");
        // Y
        public static readonly NameToken YStep = new NameToken("YStep");
        public static readonly NameToken Yes = new NameToken("Yes");