add method to retrieve any embedded files

This commit is contained in:
Eliot Jones
2019-12-21 16:16:36 +00:00
parent 4d697e3669
commit e984180b3d
8 changed files with 271 additions and 40 deletions

View File

@@ -0,0 +1,38 @@
namespace UglyToad.PdfPig.Tests.Integration
{
    using Xunit;

    /// <summary>
    /// Integration tests which open the sample document containing a file attachment.
    /// </summary>
    public class EmbeddedFileAttachmentTests
    {
        private const string DocumentName = "embedded-file-attachment.pdf";

        /// <summary>
        /// Resolves the location of the sample document through the shared integration helper.
        /// </summary>
        private static string GetDocumentPath()
        {
            return IntegrationHelpers.GetSpecificTestDocumentPath(DocumentName);
        }

        /// <summary>
        /// The text of every page begins with the expected sentence.
        /// </summary>
        [Fact]
        public void HasCorrectText()
        {
            using (var document = PdfDocument.Open(GetDocumentPath()))
            {
                // Pages are requested by number starting at 1, up to and including NumberOfPages.
                for (var pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
                {
                    var text = document.GetPage(pageNumber).Text;

                    Assert.StartsWith("This is a test document. It contains a file attachment.", text);
                }
            }
        }

        /// <summary>
        /// The document reports exactly one embedded file with the expected number of bytes.
        /// </summary>
        [Fact]
        public void HasEmbeddedFiles()
        {
            using (var document = PdfDocument.Open(GetDocumentPath()))
            {
                var found = document.Advanced.TryGetEmbeddedFiles(out var embeddedFiles);

                Assert.True(found);
                Assert.Equal(1, embeddedFiles.Count);

                var attachment = embeddedFiles[0];

                Assert.Equal(20668, attachment.Bytes.Count);
            }
        }
    }
}

View File

@@ -58,6 +58,7 @@
"UglyToad.PdfPig.AcroForms.Fields.AcroSignatureField", "UglyToad.PdfPig.AcroForms.Fields.AcroSignatureField",
"UglyToad.PdfPig.AcroForms.Fields.AcroTextField", "UglyToad.PdfPig.AcroForms.Fields.AcroTextField",
"UglyToad.PdfPig.AcroForms.Fields.AcroTextFieldFlags", "UglyToad.PdfPig.AcroForms.Fields.AcroTextFieldFlags",
"UglyToad.PdfPig.AdvancedPdfDocumentAccess",
"UglyToad.PdfPig.Annotations.Annotation", "UglyToad.PdfPig.Annotations.Annotation",
"UglyToad.PdfPig.Annotations.AnnotationBorder", "UglyToad.PdfPig.Annotations.AnnotationBorder",
"UglyToad.PdfPig.Annotations.AnnotationFlags", "UglyToad.PdfPig.Annotations.AnnotationFlags",
@@ -65,6 +66,7 @@
"UglyToad.PdfPig.Content.Catalog", "UglyToad.PdfPig.Content.Catalog",
"UglyToad.PdfPig.Content.CropBox", "UglyToad.PdfPig.Content.CropBox",
"UglyToad.PdfPig.Content.DocumentInformation", "UglyToad.PdfPig.Content.DocumentInformation",
"UglyToad.PdfPig.Content.EmbeddedFile",
"UglyToad.PdfPig.Content.Hyperlink", "UglyToad.PdfPig.Content.Hyperlink",
"UglyToad.PdfPig.Content.InlineImage", "UglyToad.PdfPig.Content.InlineImage",
"UglyToad.PdfPig.Content.IPdfImage", "UglyToad.PdfPig.Content.IPdfImage",

View File

@@ -0,0 +1,104 @@
namespace UglyToad.PdfPig
{
    using System;
    using System.Collections.Generic;
    using Content;
    using Filters;
    using Parser.Parts;
    using Tokenization.Scanner;
    using Tokens;

    /// <inheritdoc />
    /// <summary>
    /// Provides access to rare or advanced features from the PDF specification.
    /// </summary>
    public class AdvancedPdfDocumentAccess : IDisposable
    {
        private readonly IPdfTokenScanner pdfScanner;
        private readonly IFilterProvider filterProvider;
        private readonly Catalog catalog;
        private readonly bool isLenientParsing;

        private bool isDisposed;

        /// <summary>
        /// Create a new <see cref="AdvancedPdfDocumentAccess"/>.
        /// </summary>
        /// <param name="pdfScanner">The scanner used to resolve tokens in the document.</param>
        /// <param name="filterProvider">The provider passed to the embedded file streams when decoding them.</param>
        /// <param name="catalog">The document catalog whose dictionary is searched for the names dictionary.</param>
        /// <param name="isLenientParsing">Passed through to the name tree parser.</param>
        /// <exception cref="ArgumentNullException">If the scanner, filter provider or catalog is <see langword="null"/>.</exception>
        internal AdvancedPdfDocumentAccess(IPdfTokenScanner pdfScanner,
            IFilterProvider filterProvider,
            Catalog catalog,
            bool isLenientParsing)
        {
            this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
            this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
            this.catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
            this.isLenientParsing = isLenientParsing;
        }

        /// <summary>
        /// Get any embedded files contained in this PDF document.
        /// Since PDF 1.3 any external file referenced by the document may have its contents embedded within the referring PDF file,
        /// allowing its contents to be stored or transmitted along with the PDF file.
        /// </summary>
        /// <param name="embeddedFiles">
        /// The set of embedded files in this document.
        /// This is <see langword="null"/> whenever the method returns <see langword="false"/>.
        /// </param>
        /// <returns><see langword="true"/> if this document contains more than zero embedded files, otherwise <see langword="false"/>.</returns>
        /// <exception cref="ObjectDisposedException">If this instance has already been disposed.</exception>
        public bool TryGetEmbeddedFiles(out IReadOnlyList<EmbeddedFile> embeddedFiles)
        {
            GuardDisposed();

            embeddedFiles = null;

            if (!catalog.CatalogDictionary.TryGet(NameToken.Names, pdfScanner, out DictionaryToken namesDictionary)
                || !namesDictionary.TryGet(NameToken.EmbeddedFiles, pdfScanner, out DictionaryToken embeddedFileNamesDictionary))
            {
                return false;
            }

            // The values are kept as raw tokens here, they are resolved to file descriptor dictionaries below.
            var embeddedFileNames = NameTreeParser.FlattenNameTreeToDictionary(embeddedFileNamesDictionary, pdfScanner, isLenientParsing,
                x => x);

            if (embeddedFileNames.Count == 0)
            {
                return false;
            }

            var result = new List<EmbeddedFile>(embeddedFileNames.Count);

            foreach (var keyValuePair in embeddedFileNames)
            {
                // Entries without a file descriptor dictionary, an EF dictionary or an F stream inside it are skipped.
                if (!DirectObjectFinder.TryGet(keyValuePair.Value, pdfScanner, out DictionaryToken fileDescriptorDictionaryToken)
                    || !fileDescriptorDictionaryToken.TryGet(NameToken.Ef, pdfScanner, out DictionaryToken efDictionary)
                    || !efDictionary.TryGet(NameToken.F, pdfScanner, out StreamToken fileStreamToken))
                {
                    continue;
                }

                // The file specification is optional, an empty string is used when it is absent.
                var fileSpecification = string.Empty;
                if (fileDescriptorDictionaryToken.TryGet(NameToken.F, pdfScanner, out IDataToken<string> fileSpecificationToken))
                {
                    fileSpecification = fileSpecificationToken.Data;
                }

                var fileBytes = fileStreamToken.Decode(filterProvider);

                result.Add(new EmbeddedFile(keyValuePair.Key, fileSpecification, fileBytes, fileStreamToken));
            }

            if (result.Count == 0)
            {
                // Every entry was skipped. Leave the out parameter null so all false returns behave the same way.
                return false;
            }

            embeddedFiles = result;

            return true;
        }

        /// <summary>
        /// Throws if this instance has been disposed.
        /// </summary>
        private void GuardDisposed()
        {
            if (isDisposed)
            {
                throw new ObjectDisposedException(nameof(AdvancedPdfDocumentAccess));
            }
        }

        /// <inheritdoc />
        public void Dispose()
        {
            if (isDisposed)
            {
                // Repeated calls are a no-op so the scanner is not disposed again by this instance.
                return;
            }

            // The scanner is passed in through the constructor and is never null once construction succeeded.
            pdfScanner.Dispose();
            isDisposed = true;
        }
    }
}

View File

@@ -0,0 +1,46 @@
namespace UglyToad.PdfPig.Content
{
    using System;
    using System.Collections.Generic;
    using Tokens;

    /// <summary>
    /// Represents a file which has been embedded in a PDF document for document references.
    /// </summary>
    public class EmbeddedFile
    {
        /// <summary>
        /// The name under which this embedded file is stored in the name tree of the document.
        /// </summary>
        public string Name { get; }

        /// <summary>
        /// The specification of the path to the file.
        /// </summary>
        public string FileSpecification { get; }

        /// <summary>
        /// The decrypted bytes of the embedded file's content.
        /// </summary>
        public IReadOnlyList<byte> Bytes { get; }

        /// <summary>
        /// The stream token which the embedded file was read from.
        /// </summary>
        public StreamToken Stream { get; }

        /// <summary>
        /// Create a new <see cref="EmbeddedFile"/>.
        /// </summary>
        /// <exception cref="ArgumentNullException">
        /// If <paramref name="name"/>, <paramref name="bytes"/> or <paramref name="stream"/> is <see langword="null"/>.
        /// </exception>
        internal EmbeddedFile(string name, string fileSpecification, IReadOnlyList<byte> bytes, StreamToken stream)
        {
            if (name == null)
            {
                throw new ArgumentNullException(nameof(name));
            }

            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes));
            }

            if (stream == null)
            {
                throw new ArgumentNullException(nameof(stream));
            }

            Name = name;
            // The file specification is stored as provided, it is not validated.
            FileSpecification = fileSpecification;
            Bytes = bytes;
            Stream = stream;
        }

        /// <inheritdoc />
        public override string ToString() => $"{Name}: {Stream.StreamDictionary}.";
    }
}

View File

@@ -198,51 +198,20 @@
* The keys in the name tree may be treated as text strings for display purposes. * The keys in the name tree may be treated as text strings for display purposes.
* The destination value associated with a key in the name tree may be either an array or a dictionary. * The destination value associated with a key in the name tree may be either an array or a dictionary.
*/ */
ExtractNameTree(dests, catalog, pdfScanner, isLenientParsing, log, result); NameTreeParser.FlattenNameTree(dests, pdfScanner, isLenientParsing, value =>
{
if (TryReadExplicitDestination(value, catalog, pdfScanner, log, out var destination))
{
return destination;
}
return null;
}, result);
} }
return result; return result;
} }
/// <summary>
/// Walks a name tree node and its kids recursively, storing every explicit destination which can be read
/// in <paramref name="explicitDestinations"/> under its key from the tree.
/// </summary>
/// <param name="nameTreeNodeDictionary">The name tree node to read the Names and Kids entries from.</param>
/// <param name="catalog">Passed through when reading each destination.</param>
/// <param name="pdfScanner">The scanner used to resolve tokens.</param>
/// <param name="isLenientParsing">When <see langword="false"/> malformed Names or Kids entries cause an exception rather than being ignored.</param>
/// <param name="log">Passed through when reading each destination.</param>
/// <param name="explicitDestinations">The dictionary the destinations are written to, existing keys are overwritten.</param>
/// <exception cref="PdfDocumentFormatException">
/// If parsing is not lenient and the Names array has an odd number of items or a kid is not a dictionary.
/// </exception>
private static void ExtractNameTree(DictionaryToken nameTreeNodeDictionary, Catalog catalog, IPdfTokenScanner pdfScanner,
    bool isLenientParsing,
    ILog log,
    Dictionary<string, ExplicitDestination> explicitDestinations)
{
    if (nameTreeNodeDictionary.TryGet(NameToken.Names, pdfScanner, out ArrayToken nodeNames))
    {
        // The array is read as alternating key and value items, a trailing key has no value to pair with.
        if (nodeNames.Length % 2 != 0 && !isLenientParsing)
        {
            throw new PdfDocumentFormatException($"Invalid names entry in PDF name tree, the key at the end has no value: {nodeNames}.");
        }

        // Stopping while i + 1 is still in range prevents reading past the end of an odd length array.
        for (var i = 0; i + 1 < nodeNames.Length; i += 2)
        {
            if (!(nodeNames[i] is IDataToken<string> key))
            {
                continue;
            }

            var value = nodeNames[i + 1];

            if (TryReadExplicitDestination(value, catalog, pdfScanner, log, out var destination))
            {
                explicitDestinations[key.Data] = destination;
            }
        }
    }

    if (nameTreeNodeDictionary.TryGet(NameToken.Kids, pdfScanner, out ArrayToken kids))
    {
        foreach (var kid in kids.Data)
        {
            if (DirectObjectFinder.TryGet(kid, pdfScanner, out DictionaryToken kidDictionary))
            {
                ExtractNameTree(kidDictionary, catalog, pdfScanner, isLenientParsing, log, explicitDestinations);
            }
            else if (!isLenientParsing)
            {
                throw new PdfDocumentFormatException($"Invalid kids entry in PDF name tree: {kid} in {kids}.");
            }
        }
    }
}
private static bool TryReadExplicitDestination(IToken value, Catalog catalog, IPdfTokenScanner pdfScanner, private static bool TryReadExplicitDestination(IToken value, Catalog catalog, IPdfTokenScanner pdfScanner,
ILog log, out ExplicitDestination destination) ILog log, out ExplicitDestination destination)
{ {

View File

@@ -0,0 +1,65 @@
namespace UglyToad.PdfPig.Parser.Parts
{
    using System;
    using System.Collections.Generic;
    using Exceptions;
    using Tokenization.Scanner;
    using Tokens;

    /// <summary>
    /// Reads the key and value pairs from a PDF name tree into a flat dictionary.
    /// </summary>
    internal static class NameTreeParser
    {
        /// <summary>
        /// Flatten the name tree starting at <paramref name="nameTreeNodeDictionary"/> into a new dictionary.
        /// </summary>
        /// <typeparam name="TResult">The type of the values produced by <paramref name="valuesFactory"/>.</typeparam>
        /// <param name="nameTreeNodeDictionary">The name tree node to start from.</param>
        /// <param name="pdfScanner">The scanner used to resolve tokens.</param>
        /// <param name="isLenientParsing">When <see langword="false"/> malformed Names or Kids entries cause an exception rather than being ignored.</param>
        /// <param name="valuesFactory">Converts each value token, entries for which it returns <see langword="null"/> are omitted.</param>
        /// <returns>The values from the tree by key.</returns>
        public static IReadOnlyDictionary<string, TResult> FlattenNameTreeToDictionary<TResult>(DictionaryToken nameTreeNodeDictionary,
            IPdfTokenScanner pdfScanner,
            bool isLenientParsing,
            Func<IToken, TResult> valuesFactory) where TResult : class
        {
            var result = new Dictionary<string, TResult>();

            FlattenNameTree(nameTreeNodeDictionary, pdfScanner, isLenientParsing, valuesFactory, result);

            return result;
        }

        /// <summary>
        /// Flatten the name tree starting at <paramref name="nameTreeNodeDictionary"/> into <paramref name="result"/>,
        /// visiting the Names of the node first and then each of its Kids recursively.
        /// </summary>
        /// <typeparam name="TResult">The type of the values produced by <paramref name="valuesFactory"/>.</typeparam>
        /// <param name="nameTreeNodeDictionary">The name tree node to read the Names and Kids entries from.</param>
        /// <param name="pdfScanner">The scanner used to resolve tokens.</param>
        /// <param name="isLenientParsing">When <see langword="false"/> malformed Names or Kids entries cause an exception rather than being ignored.</param>
        /// <param name="valuesFactory">Converts each value token, entries for which it returns <see langword="null"/> are omitted.</param>
        /// <param name="result">The dictionary the values are written to, existing keys are overwritten.</param>
        /// <exception cref="PdfDocumentFormatException">
        /// If parsing is not lenient and the Names array has an odd number of items or a kid is not a dictionary.
        /// </exception>
        public static void FlattenNameTree<TResult>(DictionaryToken nameTreeNodeDictionary,
            IPdfTokenScanner pdfScanner,
            bool isLenientParsing,
            Func<IToken, TResult> valuesFactory,
            Dictionary<string, TResult> result) where TResult : class
        {
            if (nameTreeNodeDictionary.TryGet(NameToken.Names, pdfScanner, out ArrayToken nodeNames))
            {
                // The array is read as alternating key and value items, a trailing key has no value to pair with.
                if (nodeNames.Length % 2 != 0 && !isLenientParsing)
                {
                    throw new PdfDocumentFormatException($"Invalid names entry in PDF name tree, the key at the end has no value: {nodeNames}.");
                }

                // Stopping while i + 1 is still in range prevents reading past the end of an odd length array.
                for (var i = 0; i + 1 < nodeNames.Length; i += 2)
                {
                    if (!(nodeNames[i] is IDataToken<string> key))
                    {
                        continue;
                    }

                    var valueToken = nodeNames[i + 1];

                    var value = valuesFactory(valueToken);

                    if (value != null)
                    {
                        result[key.Data] = value;
                    }
                }
            }

            if (nameTreeNodeDictionary.TryGet(NameToken.Kids, pdfScanner, out ArrayToken kids))
            {
                foreach (var kid in kids.Data)
                {
                    if (DirectObjectFinder.TryGet(kid, pdfScanner, out DictionaryToken kidDictionary))
                    {
                        FlattenNameTree(kidDictionary, pdfScanner, isLenientParsing, valuesFactory, result);
                    }
                    else if (!isLenientParsing)
                    {
                        throw new PdfDocumentFormatException($"Invalid kids entry in PDF name tree: {kid} in {kids}.");
                    }
                }
            }
        }
    }
}

View File

@@ -62,6 +62,11 @@
[NotNull] [NotNull]
public Structure Structure { get; } public Structure Structure { get; }
/// <summary>
/// Access to rare or advanced features of the PDF specification.
/// </summary>
public AdvancedPdfDocumentAccess Advanced { get; }
/// <summary> /// <summary>
/// The version number of the PDF specification which this file conforms to, for example 1.4. /// The version number of the PDF specification which this file conforms to, for example 1.4.
/// </summary> /// </summary>
@@ -104,6 +109,7 @@
Information = information ?? throw new ArgumentNullException(nameof(information)); Information = information ?? throw new ArgumentNullException(nameof(information));
pages = new Pages(catalog, pageFactory, isLenientParsing, pdfScanner); pages = new Pages(catalog, pageFactory, isLenientParsing, pdfScanner);
Structure = new Structure(catalog, crossReferenceTable, pdfScanner); Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
Advanced = new AdvancedPdfDocumentAccess(pdfScanner, filterProvider, catalog, isLenientParsing);
documentForm = new Lazy<AcroForm>(() => acroFormFactory.GetAcroForm(catalog)); documentForm = new Lazy<AcroForm>(() => acroFormFactory.GetAcroForm(catalog));
} }
@@ -242,6 +248,7 @@
{ {
try try
{ {
Advanced.Dispose();
pdfScanner.Dispose(); pdfScanner.Dispose();
inputBytes.Dispose(); inputBytes.Dispose();
} }