#57 add access to document metadata and expose wrapper type

This commit is contained in:
Eliot Jones
2019-08-11 12:41:51 +01:00
parent 2d6e49426a
commit 0349bedd3e
8 changed files with 120 additions and 7 deletions

View File

@@ -21,11 +21,10 @@
} }
} }
[Fact] [Fact]
public void CanGetAnnotations() public void CanGetAnnotations()
{ {
using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions { UseLenientParsing = false })) using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{ {
var page = document.GetPage(1); var page = document.GetPage(1);

View File

@@ -4,6 +4,7 @@
using System.Collections.Generic; using System.Collections.Generic;
using System.IO; using System.IO;
using System.Linq; using System.Linq;
using PdfPig.Util;
using Xunit; using Xunit;
public class LaTexTests public class LaTexTests
@@ -16,7 +17,7 @@
[Fact] [Fact]
public void CanReadContent() public void CanReadContent()
{ {
using (var document = PdfDocument.Open(GetFilename())) using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{ {
var page = document.GetPage(1); var page = document.GetPage(1);
@@ -125,6 +126,25 @@ used per estimate, we introduce a “complement class” Naive Bayes is often us
} }
} }
} }
[Fact]
public void CanGetMetadata()
{
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        // The document must report that it carries XMP metadata.
        Assert.True(pdf.TryGetXmpMetadata(out var xmp));

        // The metadata must be loadable as an XML document.
        Assert.NotNull(xmp.GetXDocument());

        // The decoded stream, read as Latin-1 text, must begin with the packet header.
        var decoded = xmp.GetXmlBytes().ToArray();
        var packetText = OtherEncodings.BytesAsLatin1String(decoded);

        Assert.StartsWith("<?xpacket begin='' id='W5M0MpCehiHzreSzNTczkc9d'", packetText);
    }
}
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData() private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
{ {

View File

@@ -55,6 +55,7 @@
"UglyToad.PdfPig.Content.Word", "UglyToad.PdfPig.Content.Word",
"UglyToad.PdfPig.Content.TextLine", "UglyToad.PdfPig.Content.TextLine",
"UglyToad.PdfPig.Content.TextDirection", "UglyToad.PdfPig.Content.TextDirection",
"UglyToad.PdfPig.Content.XmpMetadata",
"UglyToad.PdfPig.Core.TransformationMatrix", "UglyToad.PdfPig.Core.TransformationMatrix",
"UglyToad.PdfPig.CrossReference.CrossReferenceTable", "UglyToad.PdfPig.CrossReference.CrossReferenceTable",
"UglyToad.PdfPig.CrossReference.CrossReferenceType", "UglyToad.PdfPig.CrossReference.CrossReferenceType",

View File

@@ -0,0 +1,50 @@
namespace UglyToad.PdfPig.Content
{
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Xml.Linq;
    using Filters;
    using Tokens;
    using Util;
    using Util.JetBrains.Annotations;

    /// <summary>
    /// Wraps an XML based Extensible Metadata Platform (XMP) document. These XML documents are embedded in PDFs to provide metadata
    /// about objects (the entire document, images, etc). They can be present as plain text or encoded/encrypted streams.
    /// </summary>
    public class XmpMetadata
    {
        // Used to decode the stream whenever the bytes are requested.
        private readonly IFilterProvider filterProvider;

        /// <summary>
        /// The underlying <see cref="StreamToken"/> for this metadata.
        /// </summary>
        [NotNull]
        public StreamToken MetadataStreamToken { get; }

        /// <summary>
        /// Create a new <see cref="XmpMetadata"/> wrapping the given stream.
        /// </summary>
        /// <param name="stream">The stream token containing the XMP data.</param>
        /// <param name="filterProvider">The provider passed to the stream when decoding it.</param>
        /// <exception cref="ArgumentNullException">Either argument is <see langword="null"/>.</exception>
        internal XmpMetadata(StreamToken stream, IFilterProvider filterProvider)
        {
            this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
            MetadataStreamToken = stream ?? throw new ArgumentNullException(nameof(stream));
        }

        /// <summary>
        /// Get the decoded bytes for the metadata stream. This can be interpreted as a sequence of plain-text bytes.
        /// </summary>
        /// <returns>The bytes for the metadata object with any filters removed.</returns>
        public IReadOnlyList<byte> GetXmlBytes()
        {
            return MetadataStreamToken.Decode(filterProvider);
        }

        /// <summary>
        /// Gets the metadata stream as an <see cref="XDocument"/>.
        /// </summary>
        /// <remarks>
        /// The decoded bytes are given to the XML reader as a stream so that the reader determines the text
        /// encoding itself instead of every byte being treated as a Latin-1 character, which would corrupt
        /// non-ASCII characters in UTF-8 or UTF-16 encoded metadata.
        /// </remarks>
        /// <returns>The <see cref="XDocument"/> for the XMP XML.</returns>
        public XDocument GetXDocument()
        {
            var bytes = GetXmlBytes().ToArray();

            using (var memoryStream = new MemoryStream(bytes, false))
            {
                return XDocument.Load(memoryStream);
            }
        }
    }
}

View File

@@ -141,6 +141,7 @@
return new PdfDocument(log, inputBytes, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information, return new PdfDocument(log, inputBytes, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
encryptionDictionary, encryptionDictionary,
pdfScanner, pdfScanner,
filterProvider,
acroFormFactory); acroFormFactory);
} }

View File

@@ -7,6 +7,14 @@
/// </summary> /// </summary>
public class ParsingOptions public class ParsingOptions
{ {
/// <summary>
/// A default <see cref="ParsingOptions"/> with <see cref="UseLenientParsing"/> set to false.
/// </summary>
/// <remarks>
/// A new instance is returned on every access. The options type is mutable, so handing out a single shared
/// instance would let one caller's changes leak into every other use of this default.
/// </remarks>
public static ParsingOptions LenientParsingOff => new ParsingOptions
{
    UseLenientParsing = false
};
/// <summary> /// <summary>
/// Should the parser ignore issues where the document does not conform to the PDF specification? /// Should the parser ignore issues where the document does not conform to the PDF specification?
/// </summary> /// </summary>

View File

@@ -7,10 +7,12 @@
using CrossReference; using CrossReference;
using Encryption; using Encryption;
using Exceptions; using Exceptions;
using Filters;
using IO; using IO;
using Logging; using Logging;
using Parser; using Parser;
using Tokenization.Scanner; using Tokenization.Scanner;
using Tokens;
using Util.JetBrains.Annotations; using Util.JetBrains.Annotations;
/// <inheritdoc /> /// <inheritdoc />
@@ -40,6 +42,8 @@
[NotNull] [NotNull]
private readonly IPdfTokenScanner pdfScanner; private readonly IPdfTokenScanner pdfScanner;
private readonly IFilterProvider filterProvider;
[NotNull] [NotNull]
private readonly Pages pages; private readonly Pages pages;
@@ -81,6 +85,7 @@
DocumentInformation information, DocumentInformation information,
EncryptionDictionary encryptionDictionary, EncryptionDictionary encryptionDictionary,
IPdfTokenScanner pdfScanner, IPdfTokenScanner pdfScanner,
IFilterProvider filterProvider,
AcroFormFactory acroFormFactory) AcroFormFactory acroFormFactory)
{ {
this.log = log; this.log = log;
@@ -90,6 +95,7 @@
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders)); this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
this.encryptionDictionary = encryptionDictionary; this.encryptionDictionary = encryptionDictionary;
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner)); this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
Information = information ?? throw new ArgumentNullException(nameof(information)); Information = information ?? throw new ArgumentNullException(nameof(information));
pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner); pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner);
Structure = new Structure(catalog, crossReferenceTable, pdfScanner); Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
@@ -153,9 +159,36 @@
} }
} }
/// <summary>
/// Get the document level metadata if present.
/// The metadata is XML in the Extensible Metadata Platform (XMP) format.
/// </summary>
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
/// <param name="metadata">The metadata stream if it exists, otherwise <see langword="null"/>.</param>
/// <returns><see langword="true"/> if the metadata is present, <see langword="false"/> otherwise.</returns>
/// <exception cref="ObjectDisposedException">The document has already been disposed.</exception>
public bool TryGetXmpMetadata(out XmpMetadata metadata)
{
    if (isDisposed)
    {
        // The single-string constructor interprets its argument as the object name rather than the message,
        // so use the two-argument overload to report both correctly.
        throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access the document metadata after the document is disposed.");
    }

    metadata = null;

    // The metadata is looked up under the Metadata key of the catalog dictionary and must resolve to a stream.
    if (!Structure.Catalog.CatalogDictionary.TryGet(NameToken.Metadata, pdfScanner, out StreamToken xmpStreamToken))
    {
        return false;
    }

    metadata = new XmpMetadata(xmpStreamToken, filterProvider);

    return true;
}
/// <summary> /// <summary>
/// Gets the form if this document contains one. /// Gets the form if this document contains one.
/// </summary> /// </summary>
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
/// <returns>An <see cref="AcroForm"/> from the document or <see langword="null"/> if not present.</returns> /// <returns>An <see cref="AcroForm"/> from the document or <see langword="null"/> if not present.</returns>
internal AcroForm GetForm() internal AcroForm GetForm()
{ {

View File

@@ -552,11 +552,12 @@
public static readonly NameToken WinAnsiEncoding = new NameToken("WinAnsiEncoding"); public static readonly NameToken WinAnsiEncoding = new NameToken("WinAnsiEncoding");
// X // X
public static readonly NameToken Xfa = new NameToken("XFA"); public static readonly NameToken Xfa = new NameToken("XFA");
public static readonly NameToken XStep = new NameToken("XStep");
public static readonly NameToken Xheight = new NameToken("XHeight"); public static readonly NameToken Xheight = new NameToken("XHeight");
public static readonly NameToken Xml = new NameToken("XML");
public static readonly NameToken Xobject = new NameToken("XObject"); public static readonly NameToken Xobject = new NameToken("XObject");
public static readonly NameToken Xref = new NameToken("XRef"); public static readonly NameToken Xref = new NameToken("XRef");
public static readonly NameToken XrefStm = new NameToken("XRefStm"); public static readonly NameToken XrefStm = new NameToken("XRefStm");
public static readonly NameToken XStep = new NameToken("XStep");
// Y // Y
public static readonly NameToken YStep = new NameToken("YStep"); public static readonly NameToken YStep = new NameToken("YStep");
public static readonly NameToken Yes = new NameToken("Yes"); public static readonly NameToken Yes = new NameToken("Yes");