mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-19 19:07:56 +08:00
add method to retrieve any embedded files
This commit is contained in:
@@ -0,0 +1,38 @@
|
|||||||
|
namespace UglyToad.PdfPig.Tests.Integration
{
    using Xunit;

    /// <summary>
    /// Integration tests which open the "embedded-file-attachment.pdf" test document
    /// and check both its page text and its embedded file.
    /// </summary>
    public class EmbeddedFileAttachmentTests
    {
        /// <summary>
        /// Resolves the path of the test document shared by every test in this class.
        /// </summary>
        private static string GetFilename()
        {
            return IntegrationHelpers.GetSpecificTestDocumentPath("embedded-file-attachment.pdf");
        }

        /// <summary>
        /// Every page of the document should start with the expected sentence.
        /// </summary>
        [Fact]
        public void HasCorrectText()
        {
            using (var document = PdfDocument.Open(GetFilename()))
            {
                // Page numbers are 1-based, hence the inclusive upper bound.
                for (var i = 1; i <= document.NumberOfPages; i++)
                {
                    var page = document.GetPage(i);

                    Assert.StartsWith("This is a test document. It contains a file attachment.", page.Text);
                }
            }
        }

        /// <summary>
        /// The document should expose exactly one embedded file containing 20668 bytes.
        /// </summary>
        [Fact]
        public void HasEmbeddedFiles()
        {
            using (var document = PdfDocument.Open(GetFilename()))
            {
                Assert.True(document.Advanced.TryGetEmbeddedFiles(out var files));

                // Assert.Single both checks the count and reports the actual contents on failure,
                // which is more informative than comparing Count to 1.
                var file = Assert.Single(files);

                Assert.Equal(20668, file.Bytes.Count);
            }
        }
    }
}
|
Binary file not shown.
@@ -58,6 +58,7 @@
|
|||||||
"UglyToad.PdfPig.AcroForms.Fields.AcroSignatureField",
|
"UglyToad.PdfPig.AcroForms.Fields.AcroSignatureField",
|
||||||
"UglyToad.PdfPig.AcroForms.Fields.AcroTextField",
|
"UglyToad.PdfPig.AcroForms.Fields.AcroTextField",
|
||||||
"UglyToad.PdfPig.AcroForms.Fields.AcroTextFieldFlags",
|
"UglyToad.PdfPig.AcroForms.Fields.AcroTextFieldFlags",
|
||||||
|
"UglyToad.PdfPig.AdvancedPdfDocumentAccess",
|
||||||
"UglyToad.PdfPig.Annotations.Annotation",
|
"UglyToad.PdfPig.Annotations.Annotation",
|
||||||
"UglyToad.PdfPig.Annotations.AnnotationBorder",
|
"UglyToad.PdfPig.Annotations.AnnotationBorder",
|
||||||
"UglyToad.PdfPig.Annotations.AnnotationFlags",
|
"UglyToad.PdfPig.Annotations.AnnotationFlags",
|
||||||
@@ -65,6 +66,7 @@
|
|||||||
"UglyToad.PdfPig.Content.Catalog",
|
"UglyToad.PdfPig.Content.Catalog",
|
||||||
"UglyToad.PdfPig.Content.CropBox",
|
"UglyToad.PdfPig.Content.CropBox",
|
||||||
"UglyToad.PdfPig.Content.DocumentInformation",
|
"UglyToad.PdfPig.Content.DocumentInformation",
|
||||||
|
"UglyToad.PdfPig.Content.EmbeddedFile",
|
||||||
"UglyToad.PdfPig.Content.Hyperlink",
|
"UglyToad.PdfPig.Content.Hyperlink",
|
||||||
"UglyToad.PdfPig.Content.InlineImage",
|
"UglyToad.PdfPig.Content.InlineImage",
|
||||||
"UglyToad.PdfPig.Content.IPdfImage",
|
"UglyToad.PdfPig.Content.IPdfImage",
|
||||||
|
104
src/UglyToad.PdfPig/AdvancedPdfDocumentAccess.cs
Normal file
104
src/UglyToad.PdfPig/AdvancedPdfDocumentAccess.cs
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
namespace UglyToad.PdfPig
{
    using System;
    using System.Collections.Generic;
    using Content;
    using Filters;
    using Parser.Parts;
    using Tokenization.Scanner;
    using Tokens;

    /// <summary>
    /// Provides access to rare or advanced features from the PDF specification.
    /// </summary>
    public class AdvancedPdfDocumentAccess : IDisposable
    {
        private readonly IPdfTokenScanner pdfScanner;
        private readonly IFilterProvider filterProvider;
        private readonly Catalog catalog;
        private readonly bool isLenientParsing;

        private bool isDisposed;

        /// <summary>
        /// Create a new <see cref="AdvancedPdfDocumentAccess"/>.
        /// </summary>
        /// <param name="pdfScanner">The scanner used to resolve tokens and indirect references.</param>
        /// <param name="filterProvider">The filter provider used to decode embedded file streams.</param>
        /// <param name="catalog">The document catalog which is searched for the embedded files name tree.</param>
        /// <param name="isLenientParsing">Passed through to the name tree parser to control how invalid entries are handled.</param>
        /// <exception cref="ArgumentNullException">Any of the reference type arguments is <see langword="null"/>.</exception>
        internal AdvancedPdfDocumentAccess(IPdfTokenScanner pdfScanner,
            IFilterProvider filterProvider,
            Catalog catalog,
            bool isLenientParsing)
        {
            this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
            this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
            this.catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
            this.isLenientParsing = isLenientParsing;
        }

        /// <summary>
        /// Get any embedded files contained in this PDF document.
        /// Since PDF 1.3 any external file referenced by the document may have its contents embedded within the referring PDF file,
        /// allowing its contents to be stored or transmitted along with the PDF file.
        /// </summary>
        /// <param name="embeddedFiles">
        /// The set of embedded files in this document, or <see langword="null"/> whenever this method returns <see langword="false"/>.
        /// </param>
        /// <returns>
        /// <see langword="true"/> if at least one embedded file could be read from this document, otherwise <see langword="false"/>.
        /// </returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        public bool TryGetEmbeddedFiles(out IReadOnlyList<EmbeddedFile> embeddedFiles)
        {
            GuardDisposed();

            embeddedFiles = null;

            // The embedded files name tree is found via the catalog's Names dictionary.
            if (!catalog.CatalogDictionary.TryGet(NameToken.Names, pdfScanner, out DictionaryToken namesDictionary)
                || !namesDictionary.TryGet(NameToken.EmbeddedFiles, pdfScanner, out DictionaryToken embeddedFileNamesDictionary))
            {
                return false;
            }

            // Keep the raw value tokens; each one is resolved to a file descriptor dictionary below.
            var embeddedFileNames = NameTreeParser.FlattenNameTreeToDictionary(embeddedFileNamesDictionary, pdfScanner, isLenientParsing,
                x => x);

            if (embeddedFileNames.Count == 0)
            {
                return false;
            }

            var result = new List<EmbeddedFile>(embeddedFileNames.Count);

            foreach (var keyValuePair in embeddedFileNames)
            {
                // Entries without a descriptor dictionary, an EF dictionary or an F stream inside it are skipped.
                if (!DirectObjectFinder.TryGet(keyValuePair.Value, pdfScanner, out DictionaryToken fileDescriptorDictionaryToken)
                    || !fileDescriptorDictionaryToken.TryGet(NameToken.Ef, pdfScanner, out DictionaryToken efDictionary)
                    || !efDictionary.TryGet(NameToken.F, pdfScanner, out StreamToken fileStreamToken))
                {
                    continue;
                }

                // The file specification string is optional; fall back to an empty string when absent.
                var fileSpecification = string.Empty;
                if (fileDescriptorDictionaryToken.TryGet(NameToken.F, pdfScanner, out IDataToken<string> fileSpecificationToken))
                {
                    fileSpecification = fileSpecificationToken.Data;
                }

                var fileBytes = fileStreamToken.Decode(filterProvider);

                result.Add(new EmbeddedFile(keyValuePair.Key, fileSpecification, fileBytes, fileStreamToken));
            }

            // Every entry may have been skipped. Leave the out parameter null in that case so that
            // all paths returning false hand back the same value.
            if (result.Count == 0)
            {
                return false;
            }

            embeddedFiles = result;

            return true;
        }

        /// <summary>
        /// Throws if this instance has been disposed.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        private void GuardDisposed()
        {
            if (isDisposed)
            {
                throw new ObjectDisposedException(nameof(AdvancedPdfDocumentAccess));
            }
        }

        /// <summary>
        /// Disposes the underlying scanner and marks this instance as disposed. Calling this more than once has no further effect.
        /// </summary>
        public void Dispose()
        {
            // Dispose must be safe to call repeatedly; only the first call touches the scanner.
            if (isDisposed)
            {
                return;
            }

            // NOTE(review): the scanner is supplied by the creator of this instance, which may also
            // dispose it itself - confirm which side is meant to own it.
            pdfScanner?.Dispose();
            isDisposed = true;
        }
    }
}
|
46
src/UglyToad.PdfPig/Content/EmbeddedFile.cs
Normal file
46
src/UglyToad.PdfPig/Content/EmbeddedFile.cs
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
namespace UglyToad.PdfPig.Content
{
    using System;
    using System.Collections.Generic;
    using Tokens;

    /// <summary>
    /// Represents a file which has been embedded inside a PDF document for document references.
    /// </summary>
    public class EmbeddedFile
    {
        /// <summary>
        /// The name under which this embedded file is registered in the document's name tree.
        /// </summary>
        public string Name { get; }

        /// <summary>
        /// The specification of the path to the file.
        /// </summary>
        public string FileSpecification { get; }

        /// <summary>
        /// The decrypted bytes making up the file.
        /// </summary>
        public IReadOnlyList<byte> Bytes { get; }

        /// <summary>
        /// The stream token the embedded file was read from.
        /// </summary>
        public StreamToken Stream { get; }

        /// <summary>
        /// Create a new <see cref="EmbeddedFile"/>.
        /// </summary>
        /// <param name="name">The name of the file in the name tree, must not be <see langword="null"/>.</param>
        /// <param name="fileSpecification">The file specification, stored as provided without validation.</param>
        /// <param name="bytes">The bytes of the file, must not be <see langword="null"/>.</param>
        /// <param name="stream">The underlying stream token, must not be <see langword="null"/>.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="name"/>, <paramref name="bytes"/> or <paramref name="stream"/> is <see langword="null"/>.
        /// </exception>
        internal EmbeddedFile(string name, string fileSpecification, IReadOnlyList<byte> bytes, StreamToken stream)
        {
            // Validate in the same order as the properties are populated: name, bytes, stream.
            if (name == null)
            {
                throw new ArgumentNullException(nameof(name));
            }

            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes));
            }

            if (stream == null)
            {
                throw new ArgumentNullException(nameof(stream));
            }

            Name = name;
            FileSpecification = fileSpecification;
            Bytes = bytes;
            Stream = stream;
        }

        /// <summary>
        /// Describes this file by its name followed by the dictionary of its stream.
        /// </summary>
        public override string ToString() => $"{Name}: {Stream.StreamDictionary}.";
    }
}
|
@@ -198,51 +198,20 @@
|
|||||||
* The keys in the name tree may be treated as text strings for display purposes.
|
* The keys in the name tree may be treated as text strings for display purposes.
|
||||||
* The destination value associated with a key in the name tree may be either an array or a dictionary.
|
* The destination value associated with a key in the name tree may be either an array or a dictionary.
|
||||||
*/
|
*/
|
||||||
ExtractNameTree(dests, catalog, pdfScanner, isLenientParsing, log, result);
|
NameTreeParser.FlattenNameTree(dests, pdfScanner, isLenientParsing, value =>
|
||||||
|
{
|
||||||
|
if (TryReadExplicitDestination(value, catalog, pdfScanner, log, out var destination))
|
||||||
|
{
|
||||||
|
return destination;
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void ExtractNameTree(DictionaryToken nameTreeNodeDictionary, Catalog catalog, IPdfTokenScanner pdfScanner,
|
|
||||||
bool isLenientParsing,
|
|
||||||
ILog log,
|
|
||||||
Dictionary<string, ExplicitDestination> explicitDestinations)
|
|
||||||
{
|
|
||||||
if (nameTreeNodeDictionary.TryGet(NameToken.Names, pdfScanner, out ArrayToken nodeNames))
|
|
||||||
{
|
|
||||||
for (var i = 0; i < nodeNames.Length; i += 2)
|
|
||||||
{
|
|
||||||
if (!(nodeNames[i] is IDataToken<string> key))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
var value = nodeNames[i + 1];
|
|
||||||
|
|
||||||
if (TryReadExplicitDestination(value, catalog, pdfScanner, log, out var destination))
|
|
||||||
{
|
|
||||||
explicitDestinations[key.Data] = destination;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nameTreeNodeDictionary.TryGet(NameToken.Kids, pdfScanner, out ArrayToken kids))
|
|
||||||
{
|
|
||||||
foreach (var kid in kids.Data)
|
|
||||||
{
|
|
||||||
if (DirectObjectFinder.TryGet(kid, pdfScanner, out DictionaryToken kidDictionary))
|
|
||||||
{
|
|
||||||
ExtractNameTree(kidDictionary, catalog, pdfScanner, isLenientParsing, log, explicitDestinations);
|
|
||||||
}
|
|
||||||
else if (!isLenientParsing)
|
|
||||||
{
|
|
||||||
throw new PdfDocumentFormatException($"Invalid kids entry in PDF name tree: {kid} in {kids}.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static bool TryReadExplicitDestination(IToken value, Catalog catalog, IPdfTokenScanner pdfScanner,
|
private static bool TryReadExplicitDestination(IToken value, Catalog catalog, IPdfTokenScanner pdfScanner,
|
||||||
ILog log, out ExplicitDestination destination)
|
ILog log, out ExplicitDestination destination)
|
||||||
{
|
{
|
||||||
|
65
src/UglyToad.PdfPig/Parser/Parts/NameTreeParser.cs
Normal file
65
src/UglyToad.PdfPig/Parser/Parts/NameTreeParser.cs
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
namespace UglyToad.PdfPig.Parser.Parts
{
    using System;
    using System.Collections.Generic;
    using Exceptions;
    using Tokenization.Scanner;
    using Tokens;

    /// <summary>
    /// Flattens a PDF name tree (a node dictionary with Names and/or Kids entries) into a single dictionary.
    /// </summary>
    internal static class NameTreeParser
    {
        /// <summary>
        /// Flatten the name tree rooted at <paramref name="nameTreeNodeDictionary"/> into a new dictionary.
        /// </summary>
        /// <typeparam name="TResult">The type each value token is converted to.</typeparam>
        /// <param name="nameTreeNodeDictionary">The root node of the name tree.</param>
        /// <param name="pdfScanner">The scanner used to resolve the entries of each node.</param>
        /// <param name="isLenientParsing">Whether invalid entries are skipped (<see langword="true"/>) or cause an exception.</param>
        /// <param name="valuesFactory">Converts a value token to a result; returning <see langword="null"/> omits the entry.</param>
        /// <returns>The entries of the tree keyed by name.</returns>
        /// <exception cref="PdfDocumentFormatException">The tree is invalid and <paramref name="isLenientParsing"/> is <see langword="false"/>.</exception>
        public static IReadOnlyDictionary<string, TResult> FlattenNameTreeToDictionary<TResult>(DictionaryToken nameTreeNodeDictionary,
            IPdfTokenScanner pdfScanner,
            bool isLenientParsing,
            Func<IToken, TResult> valuesFactory) where TResult : class
        {
            var result = new Dictionary<string, TResult>();

            FlattenNameTree(nameTreeNodeDictionary, pdfScanner, isLenientParsing, valuesFactory, result);

            return result;
        }

        /// <summary>
        /// Flatten the name tree rooted at <paramref name="nameTreeNodeDictionary"/> into <paramref name="result"/>,
        /// visiting the node's own Names entries first and then recursing into its Kids.
        /// </summary>
        /// <typeparam name="TResult">The type each value token is converted to.</typeparam>
        /// <param name="nameTreeNodeDictionary">The node of the name tree to read.</param>
        /// <param name="pdfScanner">The scanner used to resolve the entries of each node.</param>
        /// <param name="isLenientParsing">Whether invalid entries are skipped (<see langword="true"/>) or cause an exception.</param>
        /// <param name="valuesFactory">Converts a value token to a result; returning <see langword="null"/> omits the entry.</param>
        /// <param name="result">The dictionary to populate; an existing entry with the same key is overwritten.</param>
        /// <exception cref="PdfDocumentFormatException">The tree is invalid and <paramref name="isLenientParsing"/> is <see langword="false"/>.</exception>
        public static void FlattenNameTree<TResult>(DictionaryToken nameTreeNodeDictionary,
            IPdfTokenScanner pdfScanner,
            bool isLenientParsing,
            Func<IToken, TResult> valuesFactory,
            Dictionary<string, TResult> result) where TResult : class
        {
            if (nameTreeNodeDictionary.TryGet(NameToken.Names, pdfScanner, out ArrayToken nodeNames))
            {
                // The Names array is read as consecutive (key, value) pairs.
                for (var i = 0; i < nodeNames.Length; i += 2)
                {
                    if (!(nodeNames[i] is IDataToken<string> key))
                    {
                        continue;
                    }

                    // An array with an odd number of entries ends in a key without a value.
                    // Reading the value would index past the end of the array, so handle it explicitly.
                    if (i + 1 >= nodeNames.Length)
                    {
                        if (!isLenientParsing)
                        {
                            throw new PdfDocumentFormatException($"Name tree key without a corresponding value: {key.Data} in {nodeNames}.");
                        }

                        break;
                    }

                    var valueToken = nodeNames[i + 1];

                    var value = valuesFactory(valueToken);

                    // A null result means the factory could not use this entry, so it is left out.
                    if (value != null)
                    {
                        result[key.Data] = value;
                    }
                }
            }

            if (nameTreeNodeDictionary.TryGet(NameToken.Kids, pdfScanner, out ArrayToken kids))
            {
                foreach (var kid in kids.Data)
                {
                    if (DirectObjectFinder.TryGet(kid, pdfScanner, out DictionaryToken kidDictionary))
                    {
                        FlattenNameTree(kidDictionary, pdfScanner, isLenientParsing, valuesFactory, result);
                    }
                    else if (!isLenientParsing)
                    {
                        throw new PdfDocumentFormatException($"Invalid kids entry in PDF name tree: {kid} in {kids}.");
                    }
                }
            }
        }
    }
}
|
@@ -62,6 +62,11 @@
|
|||||||
[NotNull]
|
[NotNull]
|
||||||
public Structure Structure { get; }
|
public Structure Structure { get; }
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Access to rare or advanced features of the PDF specification.
|
||||||
|
/// </summary>
|
||||||
|
public AdvancedPdfDocumentAccess Advanced { get; }
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The version number of the PDF specification which this file conforms to, for example 1.4.
|
/// The version number of the PDF specification which this file conforms to, for example 1.4.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@@ -104,6 +109,7 @@
|
|||||||
Information = information ?? throw new ArgumentNullException(nameof(information));
|
Information = information ?? throw new ArgumentNullException(nameof(information));
|
||||||
pages = new Pages(catalog, pageFactory, isLenientParsing, pdfScanner);
|
pages = new Pages(catalog, pageFactory, isLenientParsing, pdfScanner);
|
||||||
Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
|
Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
|
||||||
|
Advanced = new AdvancedPdfDocumentAccess(pdfScanner, filterProvider, catalog, isLenientParsing);
|
||||||
documentForm = new Lazy<AcroForm>(() => acroFormFactory.GetAcroForm(catalog));
|
documentForm = new Lazy<AcroForm>(() => acroFormFactory.GetAcroForm(catalog));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -242,6 +248,7 @@
|
|||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
|
Advanced.Dispose();
|
||||||
pdfScanner.Dispose();
|
pdfScanner.Dispose();
|
||||||
inputBytes.Dispose();
|
inputBytes.Dispose();
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user