mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-12-21 11:13:55 +08:00
#57 add access to document metadata and expose wrapper type
This commit is contained in:
@@ -20,12 +20,11 @@
|
|||||||
Assert.Contains("catus", page.Text);
|
Assert.Contains("catus", page.Text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void CanGetAnnotations()
|
public void CanGetAnnotations()
|
||||||
{
|
{
|
||||||
using (var document = PdfDocument.Open(GetFilename(), new ParsingOptions { UseLenientParsing = false }))
|
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
|
||||||
{
|
{
|
||||||
var page = document.GetPage(1);
|
var page = document.GetPage(1);
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
using System.IO;
|
using System.IO;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
|
using PdfPig.Util;
|
||||||
using Xunit;
|
using Xunit;
|
||||||
|
|
||||||
public class LaTexTests
|
public class LaTexTests
|
||||||
@@ -16,7 +17,7 @@
|
|||||||
[Fact]
|
[Fact]
|
||||||
public void CanReadContent()
|
public void CanReadContent()
|
||||||
{
|
{
|
||||||
using (var document = PdfDocument.Open(GetFilename()))
|
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
|
||||||
{
|
{
|
||||||
var page = document.GetPage(1);
|
var page = document.GetPage(1);
|
||||||
|
|
||||||
@@ -125,6 +126,25 @@ used per estimate, we introduce a “complement class”
Naive Bayes is often us
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
public void CanGetMetadata()
{
    // Verifies that the document exposes XMP metadata, both as a parsed XDocument and as raw packet bytes.
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        Assert.True(pdf.TryGetXmpMetadata(out var xmp));

        Assert.NotNull(xmp.GetXDocument());

        var rawBytes = xmp.GetXmlBytes().ToArray();
        var packetText = OtherEncodings.BytesAsLatin1String(rawBytes);

        Assert.StartsWith("<?xpacket begin='' id='W5M0MpCehiHzreSzNTczkc9d'", packetText);
    }
}
|
||||||
|
|
||||||
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
|
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -55,6 +55,7 @@
|
|||||||
"UglyToad.PdfPig.Content.Word",
|
"UglyToad.PdfPig.Content.Word",
|
||||||
"UglyToad.PdfPig.Content.TextLine",
|
"UglyToad.PdfPig.Content.TextLine",
|
||||||
"UglyToad.PdfPig.Content.TextDirection",
|
"UglyToad.PdfPig.Content.TextDirection",
|
||||||
|
"UglyToad.PdfPig.Content.XmpMetadata",
|
||||||
"UglyToad.PdfPig.Core.TransformationMatrix",
|
"UglyToad.PdfPig.Core.TransformationMatrix",
|
||||||
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
|
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
|
||||||
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
|
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
|
||||||
|
|||||||
50
src/UglyToad.PdfPig/Content/XmpMetadata.cs
Normal file
50
src/UglyToad.PdfPig/Content/XmpMetadata.cs
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
namespace UglyToad.PdfPig.Content
{
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Xml;
    using System.Xml.Linq;
    using Filters;
    using Tokens;
    using Util;
    using Util.JetBrains.Annotations;

    /// <summary>
    /// Wraps an XML based Extensible Metadata Platform (XMP) document. These XML documents are embedded in PDFs to provide metadata
    /// about objects (the entire document, images, etc). They can be present as plain text or encoded/encrypted streams.
    /// </summary>
    public class XmpMetadata
    {
        private readonly IFilterProvider filterProvider;

        /// <summary>
        /// The underlying <see cref="StreamToken"/> for this metadata.
        /// </summary>
        [NotNull]
        public StreamToken MetadataStreamToken { get; }

        /// <summary>
        /// Create a new <see cref="XmpMetadata"/> wrapping the given stream.
        /// </summary>
        /// <param name="stream">The stream token containing the XMP packet.</param>
        /// <param name="filterProvider">The provider used to decode the stream when its bytes are requested.</param>
        /// <exception cref="ArgumentNullException">Either argument is <see langword="null"/>.</exception>
        internal XmpMetadata(StreamToken stream, IFilterProvider filterProvider)
        {
            this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
            MetadataStreamToken = stream ?? throw new ArgumentNullException(nameof(stream));
        }

        /// <summary>
        /// Get the decoded bytes for the metadata stream. This can be interpreted as a sequence of plain-text bytes.
        /// </summary>
        /// <returns>The bytes for the metadata object with any filters removed.</returns>
        public IReadOnlyList<byte> GetXmlBytes()
        {
            return MetadataStreamToken.Decode(filterProvider);
        }

        /// <summary>
        /// Gets the metadata stream as an <see cref="XDocument"/>.
        /// </summary>
        /// <remarks>
        /// The bytes are handed to the XML reader directly so that it can detect the packet's character encoding
        /// (byte order mark or XML declaration). If that fails the bytes are re-interpreted as Latin-1 text and parsed again.
        /// </remarks>
        /// <returns>The <see cref="XDocument"/> for the XMP XML.</returns>
        /// <exception cref="XmlException">The decoded stream is not well-formed XML under either interpretation.</exception>
        public XDocument GetXDocument()
        {
            var bytes = GetXmlBytes().ToArray();

            try
            {
                // Loading from a stream lets the reader pick the encoding; decoding as Latin-1 first
                // would turn every multi-byte UTF-8 sequence into several unrelated characters.
                using (var stream = new MemoryStream(bytes, false))
                {
                    return XDocument.Load(stream);
                }
            }
            catch (XmlException)
            {
                // Fall back to the byte-per-character interpretation so input that only parses
                // that way is still accepted; if this also fails its exception propagates to the caller.
                return XDocument.Parse(OtherEncodings.BytesAsLatin1String(bytes));
            }
        }
    }
}
|
||||||
@@ -140,7 +140,8 @@
|
|||||||
|
|
||||||
return new PdfDocument(log, inputBytes, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
|
return new PdfDocument(log, inputBytes, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
|
||||||
encryptionDictionary,
|
encryptionDictionary,
|
||||||
pdfScanner,
|
pdfScanner,
|
||||||
|
filterProvider,
|
||||||
acroFormFactory);
|
acroFormFactory);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -7,6 +7,14 @@
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public class ParsingOptions
|
public class ParsingOptions
|
||||||
{
|
{
|
||||||
|
/// <summary>
/// A ready-made <see cref="ParsingOptions"/> instance which has <see cref="UseLenientParsing"/> set to false.
/// </summary>
public static ParsingOptions LenientParsingOff { get; } = new ParsingOptions { UseLenientParsing = false };
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Should the parser ignore issues where the document does not conform to the PDF specification?
|
/// Should the parser ignore issues where the document does not conform to the PDF specification?
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|||||||
@@ -7,10 +7,12 @@
|
|||||||
using CrossReference;
|
using CrossReference;
|
||||||
using Encryption;
|
using Encryption;
|
||||||
using Exceptions;
|
using Exceptions;
|
||||||
|
using Filters;
|
||||||
using IO;
|
using IO;
|
||||||
using Logging;
|
using Logging;
|
||||||
using Parser;
|
using Parser;
|
||||||
using Tokenization.Scanner;
|
using Tokenization.Scanner;
|
||||||
|
using Tokens;
|
||||||
using Util.JetBrains.Annotations;
|
using Util.JetBrains.Annotations;
|
||||||
|
|
||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
@@ -39,7 +41,9 @@
|
|||||||
|
|
||||||
[NotNull]
|
[NotNull]
|
||||||
private readonly IPdfTokenScanner pdfScanner;
|
private readonly IPdfTokenScanner pdfScanner;
|
||||||
|
|
||||||
|
private readonly IFilterProvider filterProvider;
|
||||||
|
|
||||||
[NotNull]
|
[NotNull]
|
||||||
private readonly Pages pages;
|
private readonly Pages pages;
|
||||||
|
|
||||||
@@ -81,6 +85,7 @@
|
|||||||
DocumentInformation information,
|
DocumentInformation information,
|
||||||
EncryptionDictionary encryptionDictionary,
|
EncryptionDictionary encryptionDictionary,
|
||||||
IPdfTokenScanner pdfScanner,
|
IPdfTokenScanner pdfScanner,
|
||||||
|
IFilterProvider filterProvider,
|
||||||
AcroFormFactory acroFormFactory)
|
AcroFormFactory acroFormFactory)
|
||||||
{
|
{
|
||||||
this.log = log;
|
this.log = log;
|
||||||
@@ -90,6 +95,7 @@
|
|||||||
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
|
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
|
||||||
this.encryptionDictionary = encryptionDictionary;
|
this.encryptionDictionary = encryptionDictionary;
|
||||||
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
|
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
|
||||||
|
this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
|
||||||
Information = information ?? throw new ArgumentNullException(nameof(information));
|
Information = information ?? throw new ArgumentNullException(nameof(information));
|
||||||
pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner);
|
pages = new Pages(log, catalog, pageFactory, isLenientParsing, pdfScanner);
|
||||||
Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
|
Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
|
||||||
@@ -153,9 +159,36 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Get the document level metadata if present.
/// The metadata is XML in the (Extensible Metadata Platform) XMP format.
/// </summary>
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
/// <param name="metadata">The metadata stream if it exists, otherwise <see langword="null"/>.</param>
/// <returns><see langword="true"/> if the metadata is present, <see langword="false"/> otherwise.</returns>
/// <exception cref="ObjectDisposedException">The document has already been disposed.</exception>
public bool TryGetXmpMetadata(out XmpMetadata metadata)
{
    if (isDisposed)
    {
        // The single-argument constructor interprets its string as the object name rather than the message,
        // so pass the object name and the explanation separately.
        throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access the document metadata after the document is disposed.");
    }

    metadata = null;

    // Look up the catalog's Metadata entry; only a stream token counts as metadata being present.
    if (!Structure.Catalog.CatalogDictionary.TryGet(NameToken.Metadata, pdfScanner, out StreamToken xmpStreamToken))
    {
        return false;
    }

    metadata = new XmpMetadata(xmpStreamToken, filterProvider);

    return true;
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Gets the form if this document contains one.
|
/// Gets the form if this document contains one.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
|
||||||
/// <returns>An <see cref="AcroForm"/> from the document or <see langword="null"/> if not present.</returns>
|
/// <returns>An <see cref="AcroForm"/> from the document or <see langword="null"/> if not present.</returns>
|
||||||
internal AcroForm GetForm()
|
internal AcroForm GetForm()
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -552,11 +552,12 @@
|
|||||||
public static readonly NameToken WinAnsiEncoding = new NameToken("WinAnsiEncoding");
|
public static readonly NameToken WinAnsiEncoding = new NameToken("WinAnsiEncoding");
|
||||||
// X
|
// X
|
||||||
public static readonly NameToken Xfa = new NameToken("XFA");
|
public static readonly NameToken Xfa = new NameToken("XFA");
|
||||||
public static readonly NameToken XStep = new NameToken("XStep");
|
|
||||||
public static readonly NameToken Xheight = new NameToken("XHeight");
|
public static readonly NameToken Xheight = new NameToken("XHeight");
|
||||||
|
public static readonly NameToken Xml = new NameToken("XML");
|
||||||
public static readonly NameToken Xobject = new NameToken("XObject");
|
public static readonly NameToken Xobject = new NameToken("XObject");
|
||||||
public static readonly NameToken Xref = new NameToken("XRef");
|
public static readonly NameToken Xref = new NameToken("XRef");
|
||||||
public static readonly NameToken XrefStm = new NameToken("XRefStm");
|
public static readonly NameToken XrefStm = new NameToken("XRefStm");
|
||||||
|
public static readonly NameToken XStep = new NameToken("XStep");
|
||||||
// Y
|
// Y
|
||||||
public static readonly NameToken YStep = new NameToken("YStep");
|
public static readonly NameToken YStep = new NameToken("YStep");
|
||||||
public static readonly NameToken Yes = new NameToken("Yes");
|
public static readonly NameToken Yes = new NameToken("Yes");
|
||||||
|
|||||||
Reference in New Issue
Block a user