mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 19:54:52 +08:00
#57 add access to document metadata and expose wrapper type
This commit is contained in:
@@ -20,12 +20,11 @@
|
||||
Assert.Contains("catus", page.Text);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
[Fact]
|
||||
public void CanGetAnnotations()
|
||||
{
|
||||
using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions { UseLenientParsing = false }))
|
||||
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
|
||||
{
|
||||
var page = document.GetPage(1);
|
||||
|
||||
|
@@ -4,6 +4,7 @@
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using PdfPig.Util;
|
||||
using Xunit;
|
||||
|
||||
public class LaTexTests
|
||||
@@ -16,7 +17,7 @@
|
||||
[Fact]
|
||||
public void CanReadContent()
|
||||
{
|
||||
using (var document = PdfDocument.Open(GetFilename()))
|
||||
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
|
||||
{
|
||||
var page = document.GetPage(1);
|
||||
|
||||
@@ -125,6 +126,25 @@ used per estimate, we introduce a “complement class”
Naive Bayes is often us
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
public void CanGetMetadata()
{
    // Expected opening of the decoded XMP packet for the document under test.
    const string expectedPacketStart = "<?xpacket begin='' id='W5M0MpCehiHzreSzNTczkc9d'";

    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        // The document must expose a metadata stream.
        Assert.True(pdf.TryGetXmpMetadata(out var xmp));

        // The stream must be parseable as XML.
        Assert.NotNull(xmp.GetXDocument());

        // The raw decoded bytes, read one byte per character, begin with the XMP packet header.
        var rawBytes = xmp.GetXmlBytes().ToArray();
        var decoded = OtherEncodings.BytesAsLatin1String(rawBytes);

        Assert.StartsWith(expectedPacketStart, decoded);
    }
}
|
||||
|
||||
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
|
||||
{
|
||||
|
@@ -55,6 +55,7 @@
|
||||
"UglyToad.PdfPig.Content.Word",
|
||||
"UglyToad.PdfPig.Content.TextLine",
|
||||
"UglyToad.PdfPig.Content.TextDirection",
|
||||
"UglyToad.PdfPig.Content.XmpMetadata",
|
||||
"UglyToad.PdfPig.Core.TransformationMatrix",
|
||||
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
|
||||
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
|
||||
|
50
src/UglyToad.PdfPig/Content/XmpMetadata.cs
Normal file
50
src/UglyToad.PdfPig/Content/XmpMetadata.cs
Normal file
@@ -0,0 +1,50 @@
|
||||
namespace UglyToad.PdfPig.Content
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Xml.Linq;
|
||||
using Filters;
|
||||
using Tokens;
|
||||
using Util;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
/// <summary>
/// Wraps an XML based Extensible Metadata Platform (XMP) document. These XML documents are embedded in PDFs to provide metadata
/// about objects (the entire document, images, etc). They can be present as plain text or encoded/encrypted streams.
/// </summary>
public class XmpMetadata
{
    private readonly IFilterProvider filterProvider;

    /// <summary>
    /// The underlying <see cref="StreamToken"/> for this metadata.
    /// </summary>
    [NotNull]
    public StreamToken MetadataStreamToken { get; }

    /// <summary>
    /// Create a new <see cref="XmpMetadata"/> wrapping the given stream.
    /// </summary>
    /// <param name="stream">The stream token holding the XMP data.</param>
    /// <param name="filterProvider">The filter provider passed to the stream when it is decoded.</param>
    /// <exception cref="ArgumentNullException">Either argument is <see langword="null"/>.</exception>
    internal XmpMetadata(StreamToken stream, IFilterProvider filterProvider)
    {
        this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
        MetadataStreamToken = stream ?? throw new ArgumentNullException(nameof(stream));
    }

    /// <summary>
    /// Get the decoded bytes for the metadata stream. This can be interpreted as a sequence of plain-text bytes.
    /// </summary>
    /// <returns>The bytes for the metadata object with any filters removed.</returns>
    public IReadOnlyList<byte> GetXmlBytes()
    {
        return MetadataStreamToken.Decode(filterProvider);
    }

    /// <summary>
    /// Gets the metadata stream as an <see cref="XDocument"/>.
    /// </summary>
    /// <remarks>
    /// The bytes are handed to the XML reader directly so that it can detect the encoding from a byte-order mark
    /// or XML declaration (defaulting to UTF-8). If that fails the bytes are re-read as Latin-1 text, one byte per character.
    /// </remarks>
    /// <returns>The <see cref="XDocument"/> for the XMP XML.</returns>
    public XDocument GetXDocument()
    {
        var bytes = GetXmlBytes().ToArray();

        try
        {
            // Loading from a stream lets the reader pick the correct encoding; decoding as Latin-1 up front
            // would corrupt every multi-byte UTF-8 character and trip over a leading byte-order mark.
            using (var stream = new System.IO.MemoryStream(bytes, false))
            {
                return XDocument.Load(stream);
            }
        }
        catch (System.Xml.XmlException)
        {
            // Fall back to the byte-per-character interpretation for streams the reader cannot decode.
            return XDocument.Parse(OtherEncodings.BytesAsLatin1String(bytes));
        }
    }
}
|
||||
}
|
@@ -140,7 +140,8 @@
|
||||
|
||||
return new PdfDocument(log, inputBytes, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
|
||||
encryptionDictionary,
|
||||
pdfScanner,
|
||||
pdfScanner,
|
||||
filterProvider,
|
||||
acroFormFactory);
|
||||
}
|
||||
|
||||
|
@@ -7,6 +7,14 @@
|
||||
/// </summary>
|
||||
public class ParsingOptions
|
||||
{
|
||||
/// <summary>
/// A default <see cref="ParsingOptions"/> with <see cref="UseLenientParsing"/> set to false.
/// </summary>
/// <remarks>
/// A new instance is created on every access so that changes one caller makes to the returned
/// options cannot affect other callers of this property.
/// </remarks>
public static ParsingOptions LenientParsingOff => new ParsingOptions
{
    UseLenientParsing = false
};
|
||||
|
||||
/// <summary>
|
||||
/// Should the parser ignore issues where the document does not conform to the PDF specification?
|
||||
/// </summary>
|
||||
|
@@ -7,10 +7,12 @@
|
||||
using CrossReference;
|
||||
using Encryption;
|
||||
using Exceptions;
|
||||
using Filters;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parser;
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
/// <inheritdoc />
|
||||
@@ -39,7 +41,9 @@
|
||||
|
||||
[NotNull]
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
|
||||
|
||||
private readonly IFilterProvider filterProvider;
|
||||
|
||||
[NotNull]
|
||||
private readonly Pages pages;
|
||||
|
||||
@@ -81,6 +85,7 @@
|
||||
DocumentInformation information,
|
||||
EncryptionDictionary encryptionDictionary,
|
||||
IPdfTokenScanner pdfScanner,
|
||||
IFilterProvider filterProvider,
|
||||
AcroFormFactory acroFormFactory)
|
||||
{
|
||||
this.log = log;
|
||||
@@ -90,6 +95,7 @@
|
||||
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
|
||||
this.encryptionDictionary = encryptionDictionary;
|
||||
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
|
||||
this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
|
||||
Information = information ?? throw new ArgumentNullException(nameof(information));
|
||||
pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner);
|
||||
Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
|
||||
@@ -153,9 +159,36 @@
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Get the document level metadata if present.
/// The metadata is XML in the Extensible Metadata Platform (XMP) format.
/// </summary>
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
/// <param name="metadata">The metadata stream if it exists, otherwise <see langword="null"/>.</param>
/// <returns><see langword="true"/> if the metadata is present, <see langword="false"/> otherwise.</returns>
/// <exception cref="ObjectDisposedException">The document has already been disposed.</exception>
public bool TryGetXmpMetadata(out XmpMetadata metadata)
{
    if (isDisposed)
    {
        // The single-string constructor interprets its argument as the object name rather than the message,
        // so supply both to keep the message readable.
        throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access the document metadata after the document is disposed.");
    }

    metadata = null;

    // The metadata entry is optional; it is looked up in the catalog dictionary and resolved through the scanner.
    if (!Structure.Catalog.CatalogDictionary.TryGet(NameToken.Metadata, pdfScanner, out StreamToken xmpStreamToken))
    {
        return false;
    }

    metadata = new XmpMetadata(xmpStreamToken, filterProvider);

    return true;
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the form if this document contains one.
|
||||
/// </summary>
|
||||
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
|
||||
/// <returns>An <see cref="AcroForm"/> from the document or <see langword="null"/> if not present.</returns>
|
||||
internal AcroForm GetForm()
|
||||
{
|
||||
|
@@ -552,11 +552,12 @@
|
||||
public static readonly NameToken WinAnsiEncoding = new NameToken("WinAnsiEncoding");
|
||||
// X
|
||||
public static readonly NameToken Xfa = new NameToken("XFA");
|
||||
public static readonly NameToken XStep = new NameToken("XStep");
|
||||
public static readonly NameToken Xheight = new NameToken("XHeight");
|
||||
public static readonly NameToken Xml = new NameToken("XML");
|
||||
public static readonly NameToken Xobject = new NameToken("XObject");
|
||||
public static readonly NameToken Xref = new NameToken("XRef");
|
||||
public static readonly NameToken XrefStm = new NameToken("XRefStm");
|
||||
public static readonly NameToken XStep = new NameToken("XStep");
|
||||
// Y
|
||||
public static readonly NameToken YStep = new NameToken("YStep");
|
||||
public static readonly NameToken Yes = new NameToken("Yes");
|
||||
|
Reference in New Issue
Block a user