#57 add access to document metadata and expose wrapper type

This commit is contained in:
Eliot Jones
2019-08-11 12:41:51 +01:00
parent 2d6e49426a
commit 0349bedd3e
8 changed files with 120 additions and 7 deletions

View File

@@ -20,12 +20,11 @@
Assert.Contains("catus", page.Text);
}
}
[Fact]
public void CanGetAnnotations()
{
using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions { UseLenientParsing = false }))
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var page = document.GetPage(1);

View File

@@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using PdfPig.Util;
using Xunit;
public class LaTexTests
@@ -16,7 +17,7 @@
[Fact]
public void CanReadContent()
{
using (var document = PdfDocument.Open(GetFilename()))
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{
var page = document.GetPage(1);
@@ -125,6 +126,25 @@ used per estimate, we introduce a “complement class” Naive Bayes is often us
}
}
}
[Fact]
public void CanGetMetadata()
{
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        // The document is expected to carry a document level XMP stream.
        Assert.True(pdf.TryGetXmpMetadata(out var xmp));

        // The stream must be parseable as XML.
        Assert.NotNull(xmp.GetXDocument());

        // The decoded bytes should begin with the XMP packet header.
        var rawBytes = xmp.GetXmlBytes().ToArray();
        var xml = OtherEncodings.BytesAsLatin1String(rawBytes);

        Assert.StartsWith("<?xpacket begin='' id='W5M0MpCehiHzreSzNTczkc9d'", xml);
    }
}
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
{

View File

@@ -55,6 +55,7 @@
"UglyToad.PdfPig.Content.Word",
"UglyToad.PdfPig.Content.TextLine",
"UglyToad.PdfPig.Content.TextDirection",
"UglyToad.PdfPig.Content.XmpMetadata",
"UglyToad.PdfPig.Core.TransformationMatrix",
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
"UglyToad.PdfPig.CrossReference.CrossReferenceType",

View File

@@ -0,0 +1,50 @@
namespace UglyToad.PdfPig.Content
{
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Xml;
    using System.Xml.Linq;
    using Filters;
    using Tokens;
    using Util;
    using Util.JetBrains.Annotations;

    /// <summary>
    /// Wraps an XML based Extensible Metadata Platform (XMP) document. These XML documents are embedded in PDFs to provide metadata
    /// about objects (the entire document, images, etc). They can be present as plain text or encoded/encrypted streams.
    /// </summary>
    public class XmpMetadata
    {
        /// <summary>
        /// Used to remove any filters applied to <see cref="MetadataStreamToken"/> when the bytes are requested.
        /// </summary>
        [NotNull]
        private readonly IFilterProvider filterProvider;

        /// <summary>
        /// The underlying <see cref="StreamToken"/> for this metadata.
        /// </summary>
        [NotNull]
        public StreamToken MetadataStreamToken { get; }

        /// <summary>
        /// Create a new <see cref="XmpMetadata"/> wrapping the provided stream.
        /// </summary>
        /// <param name="stream">The stream token containing the XMP data.</param>
        /// <param name="filterProvider">The provider used to decode the stream.</param>
        /// <exception cref="ArgumentNullException">If either argument is <see langword="null"/>.</exception>
        internal XmpMetadata(StreamToken stream, IFilterProvider filterProvider)
        {
            this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
            MetadataStreamToken = stream ?? throw new ArgumentNullException(nameof(stream));
        }

        /// <summary>
        /// Get the decoded bytes for the metadata stream. This can be interpreted as a sequence of plain-text bytes.
        /// </summary>
        /// <returns>The bytes for the metadata object with any filters removed.</returns>
        public IReadOnlyList<byte> GetXmlBytes()
        {
            return MetadataStreamToken.Decode(filterProvider);
        }

        /// <summary>
        /// Gets the metadata stream as an <see cref="XDocument"/>.
        /// </summary>
        /// <remarks>
        /// The decoded bytes are handed to the XML reader directly so that it can work out the text encoding
        /// from the byte order mark or XML declaration (falling back to UTF-8). Decoding the bytes as Latin-1 up front
        /// would corrupt any non-ASCII characters in UTF-8 encoded metadata.
        /// If the bytes cannot be read that way, parsing is retried on the Latin-1 interpretation of the bytes.
        /// </remarks>
        /// <returns>The <see cref="XDocument"/> for the XMP XML.</returns>
        /// <exception cref="XmlException">If the metadata could not be parsed as XML by either approach.</exception>
        public XDocument GetXDocument()
        {
            var bytes = GetXmlBytes().ToArray();

            try
            {
                using (var stream = new MemoryStream(bytes, false))
                {
                    return XDocument.Load(stream);
                }
            }
            catch (XmlException)
            {
                // Best effort for streams which are not valid in the detected encoding:
                // every byte maps to a character in Latin-1 so the failure can only be due to the XML itself.
                return XDocument.Parse(OtherEncodings.BytesAsLatin1String(bytes));
            }
        }
    }
}

View File

@@ -140,7 +140,8 @@
return new PdfDocument(log, inputBytes, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
encryptionDictionary,
pdfScanner,
pdfScanner,
filterProvider,
acroFormFactory);
}

View File

@@ -7,6 +7,14 @@
/// </summary>
public class ParsingOptions
{
/// <summary>
/// A default <see cref="ParsingOptions"/> with <see cref="UseLenientParsing"/> set to false.
/// </summary>
/// <remarks>
/// A new instance is returned on every access. The options are mutable so handing out a single shared
/// instance would allow a change made by one caller to affect every other user of this property.
/// </remarks>
public static ParsingOptions LenientParsingOff => new ParsingOptions
{
    UseLenientParsing = false
};
/// <summary>
/// Should the parser ignore issues where the document does not conform to the PDF specification?
/// </summary>

View File

@@ -7,10 +7,12 @@
using CrossReference;
using Encryption;
using Exceptions;
using Filters;
using IO;
using Logging;
using Parser;
using Tokenization.Scanner;
using Tokens;
using Util.JetBrains.Annotations;
/// <inheritdoc />
@@ -39,7 +41,9 @@
[NotNull]
private readonly IPdfTokenScanner pdfScanner;
// Never null: the constructor throws an ArgumentNullException if no filter provider is supplied.
[NotNull]
private readonly IFilterProvider filterProvider;
[NotNull]
private readonly Pages pages;
@@ -81,6 +85,7 @@
DocumentInformation information,
EncryptionDictionary encryptionDictionary,
IPdfTokenScanner pdfScanner,
IFilterProvider filterProvider,
AcroFormFactory acroFormFactory)
{
this.log = log;
@@ -90,6 +95,7 @@
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
this.encryptionDictionary = encryptionDictionary;
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
Information = information ?? throw new ArgumentNullException(nameof(information));
pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner);
Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
@@ -153,9 +159,36 @@
}
}
/// <summary>
/// Get the document level metadata if present.
/// The metadata is XML in the Extensible Metadata Platform (XMP) format.
/// </summary>
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
/// <param name="metadata">The metadata stream if it exists, <see langword="null"/> otherwise.</param>
/// <returns><see langword="true"/> if the metadata is present, <see langword="false"/> otherwise.</returns>
/// <exception cref="ObjectDisposedException">If the document has already been disposed.</exception>
public bool TryGetXmpMetadata(out XmpMetadata metadata)
{
    if (isDisposed)
    {
        // The single string constructor interprets its argument as the name of the disposed object,
        // not as the message, so both are supplied explicitly.
        throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access the document metadata after the document is disposed.");
    }

    metadata = null;

    // The metadata is looked up from the Metadata entry of the document catalog.
    if (!Structure.Catalog.CatalogDictionary.TryGet(NameToken.Metadata, pdfScanner, out StreamToken xmpStreamToken))
    {
        return false;
    }

    metadata = new XmpMetadata(xmpStreamToken, filterProvider);

    return true;
}
/// <summary>
/// Gets the form if this document contains one.
/// </summary>
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
/// <returns>An <see cref="AcroForm"/> from the document or <see langword="null"/> if not present.</returns>
internal AcroForm GetForm()
{

View File

@@ -552,11 +552,12 @@
public static readonly NameToken WinAnsiEncoding = new NameToken("WinAnsiEncoding");
// X
public static readonly NameToken Xfa = new NameToken("XFA");
public static readonly NameToken XStep = new NameToken("XStep");
public static readonly NameToken Xheight = new NameToken("XHeight");
public static readonly NameToken Xml = new NameToken("XML");
public static readonly NameToken Xobject = new NameToken("XObject");
public static readonly NameToken Xref = new NameToken("XRef");
public static readonly NameToken XrefStm = new NameToken("XRefStm");
public static readonly NameToken XStep = new NameToken("XStep");
// Y
public static readonly NameToken YStep = new NameToken("YStep");
public static readonly NameToken Yes = new NameToken("Yes");