diff --git a/src/UglyToad.PdfPig.Tests/Integration/EmbeddedFileAttachmentTests.cs b/src/UglyToad.PdfPig.Tests/Integration/EmbeddedFileAttachmentTests.cs
new file mode 100644
index 00000000..30b2fd9f
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Integration/EmbeddedFileAttachmentTests.cs
@@ -0,0 +1,43 @@
+namespace UglyToad.PdfPig.Tests.Integration
+{
+    using Xunit;
+
+    /// <summary>
+    /// Integration tests for the embedded-file-attachment.pdf test document.
+    /// </summary>
+    public class EmbeddedFileAttachmentTests
+    {
+        [Fact]
+        public void HasCorrectText()
+        {
+            var path = IntegrationHelpers.GetSpecificTestDocumentPath("embedded-file-attachment.pdf");
+
+            using (var document = PdfDocument.Open(path))
+            {
+                // Every page of the document is expected to begin with the same sentence.
+                for (var i = 1; i <= document.NumberOfPages; i++)
+                {
+                    var page = document.GetPage(i);
+
+                    Assert.StartsWith("This is a test document. It contains a file attachment.", page.Text);
+                }
+            }
+        }
+
+        [Fact]
+        public void HasEmbeddedFiles()
+        {
+            var path = IntegrationHelpers.GetSpecificTestDocumentPath("embedded-file-attachment.pdf");
+
+            using (var document = PdfDocument.Open(path))
+            {
+                Assert.True(document.Advanced.TryGetEmbeddedFiles(out var files));
+
+                // Assert.Single checks the count and hands back the only element.
+                var file = Assert.Single(files);
+
+                Assert.Equal(20668, file.Bytes.Count);
+            }
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/embedded-file-attachment.pdf b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/embedded-file-attachment.pdf
new file mode 100644
index 00000000..bfd6078b
Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/embedded-file-attachment.pdf differ
diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 4e4d643a..3f39e11d 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -58,6 +58,7 @@
                 "UglyToad.PdfPig.AcroForms.Fields.AcroSignatureField",
                 "UglyToad.PdfPig.AcroForms.Fields.AcroTextField",
                 "UglyToad.PdfPig.AcroForms.Fields.AcroTextFieldFlags",
+                "UglyToad.PdfPig.AdvancedPdfDocumentAccess",
                 "UglyToad.PdfPig.Annotations.Annotation",
                 "UglyToad.PdfPig.Annotations.AnnotationBorder",
                 "UglyToad.PdfPig.Annotations.AnnotationFlags",
@@ -65,6 +66,7 @@
                 "UglyToad.PdfPig.Content.Catalog",
                 "UglyToad.PdfPig.Content.CropBox",
                 "UglyToad.PdfPig.Content.DocumentInformation",
+                "UglyToad.PdfPig.Content.EmbeddedFile",
                 "UglyToad.PdfPig.Content.Hyperlink",
                 "UglyToad.PdfPig.Content.InlineImage",
                 "UglyToad.PdfPig.Content.IPdfImage",
diff --git a/src/UglyToad.PdfPig/AdvancedPdfDocumentAccess.cs b/src/UglyToad.PdfPig/AdvancedPdfDocumentAccess.cs
new file mode 100644
index 00000000..60c0b643
--- /dev/null
+++ b/src/UglyToad.PdfPig/AdvancedPdfDocumentAccess.cs
@@ -0,0 +1,116 @@
+namespace UglyToad.PdfPig
+{
+    using System;
+    using System.Collections.Generic;
+    using Content;
+    using Filters;
+    using Parser.Parts;
+    using Tokenization.Scanner;
+    using Tokens;
+
+    /// <inheritdoc />
+    /// <summary>
+    /// Provides access to rare or advanced features from the PDF specification.
+    /// </summary>
+    public class AdvancedPdfDocumentAccess : IDisposable
+    {
+        private readonly IPdfTokenScanner pdfScanner;
+        private readonly IFilterProvider filterProvider;
+        private readonly Catalog catalog;
+        private readonly bool isLenientParsing;
+
+        private bool isDisposed;
+
+        internal AdvancedPdfDocumentAccess(IPdfTokenScanner pdfScanner,
+            IFilterProvider filterProvider,
+            Catalog catalog,
+            bool isLenientParsing)
+        {
+            this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
+            this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
+            this.catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
+            this.isLenientParsing = isLenientParsing;
+        }
+
+        /// <summary>
+        /// Get any embedded files contained in this PDF document.
+        /// Since PDF 1.3 any external file referenced by the document may have its contents embedded within the referring PDF file,
+        /// allowing its contents to be stored or transmitted along with the PDF file.
+        /// </summary>
+        /// <param name="embeddedFiles">
+        /// The set of embedded files in this document, or <see langword="null"/> when the method returns <see langword="false"/>.
+        /// </param>
+        /// <returns><see langword="true"/> if this document contains more than zero embedded files, otherwise <see langword="false"/>.</returns>
+        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
+        public bool TryGetEmbeddedFiles(out IReadOnlyList<EmbeddedFile> embeddedFiles)
+        {
+            GuardDisposed();
+
+            embeddedFiles = null;
+
+            if (!catalog.CatalogDictionary.TryGet(NameToken.Names, pdfScanner, out DictionaryToken namesDictionary)
+                || !namesDictionary.TryGet(NameToken.EmbeddedFiles, pdfScanner, out DictionaryToken embeddedFileNamesDictionary))
+            {
+                return false;
+            }
+
+            var embeddedFileNames = NameTreeParser.FlattenNameTreeToDictionary(embeddedFileNamesDictionary, pdfScanner, isLenientParsing,
+                x => x);
+
+            if (embeddedFileNames.Count == 0)
+            {
+                return false;
+            }
+
+            var result = new List<EmbeddedFile>();
+
+            foreach (var keyValuePair in embeddedFileNames)
+            {
+                // Entries without a readable file specification dictionary, EF dictionary or F stream are skipped.
+                if (!DirectObjectFinder.TryGet(keyValuePair.Value, pdfScanner, out DictionaryToken fileDescriptorDictionaryToken)
+                    || !fileDescriptorDictionaryToken.TryGet(NameToken.Ef, pdfScanner, out DictionaryToken efDictionary)
+                    || !efDictionary.TryGet(NameToken.F, pdfScanner, out StreamToken fileStreamToken))
+                {
+                    continue;
+                }
+
+                var fileSpecification = string.Empty;
+                if (fileDescriptorDictionaryToken.TryGet(NameToken.F, pdfScanner, out IDataToken<string> fileSpecificationToken))
+                {
+                    fileSpecification = fileSpecificationToken.Data;
+                }
+
+                var fileBytes = fileStreamToken.Decode(filterProvider);
+
+                result.Add(new EmbeddedFile(keyValuePair.Key, fileSpecification, fileBytes, fileStreamToken));
+            }
+
+            // Keep the out value null on every failure path, including when no entry could be read.
+            if (result.Count == 0)
+            {
+                return false;
+            }
+
+            embeddedFiles = result;
+
+            return true;
+        }
+
+        private void GuardDisposed()
+        {
+            if (isDisposed)
+            {
+                throw new ObjectDisposedException(nameof(AdvancedPdfDocumentAccess));
+            }
+        }
+
+        /// <inheritdoc />
+        public void Dispose()
+        {
+            // NOTE(review): PdfDocument.Dispose disposes the same scanner right after calling this method,
+            // so IPdfTokenScanner.Dispose is presumably safe to call twice - confirm.
+            pdfScanner?.Dispose();
+            isDisposed = true;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Content/EmbeddedFile.cs b/src/UglyToad.PdfPig/Content/EmbeddedFile.cs
new file mode 100644
index 00000000..291eec5d
--- /dev/null
+++ b/src/UglyToad.PdfPig/Content/EmbeddedFile.cs
@@ -0,0 +1,46 @@
+namespace UglyToad.PdfPig.Content
+{
+    using System;
+    using System.Collections.Generic;
+    using Tokens;
+
+    /// <summary>
+    /// A file embedded in a PDF document for document references.
+    /// </summary>
+    public class EmbeddedFile
+    {
+        /// <summary>
+        /// The name given to this embedded file in the document's name tree.
+        /// </summary>
+        public string Name { get; }
+
+        /// <summary>
+        /// The specification of the path to the file.
+        /// </summary>
+        public string FileSpecification { get; }
+
+        /// <summary>
+        /// The bytes of the file obtained by decoding the embedded file stream.
+        /// </summary>
+        public IReadOnlyList<byte> Bytes { get; }
+
+        /// <summary>
+        /// The underlying embedded file stream.
+        /// </summary>
+        public StreamToken Stream { get; }
+
+        internal EmbeddedFile(string name, string fileSpecification, IReadOnlyList<byte> bytes, StreamToken stream)
+        {
+            Name = name ?? throw new ArgumentNullException(nameof(name));
+            FileSpecification = fileSpecification;
+            Bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
+            Stream = stream ?? throw new ArgumentNullException(nameof(stream));
+        }
+
+        /// <inheritdoc />
+        public override string ToString()
+        {
+            return $"{Name}: {Stream.StreamDictionary}.";
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Outline/BookmarksProvider.cs b/src/UglyToad.PdfPig/Outline/BookmarksProvider.cs
index 6a147bf2..893792c9 100644
--- a/src/UglyToad.PdfPig/Outline/BookmarksProvider.cs
+++ b/src/UglyToad.PdfPig/Outline/BookmarksProvider.cs
@@ -198,51 +198,20 @@
                  * The keys in the name tree may be treated as text strings for display purposes.
                  * The destination value associated with a key in the name tree may be either an array or a dictionary.
                  */
-                ExtractNameTree(dests, catalog, pdfScanner, isLenientParsing, log, result);
+                NameTreeParser.FlattenNameTree(dests, pdfScanner, isLenientParsing, value =>
+                {
+                    if (TryReadExplicitDestination(value, catalog, pdfScanner, log, out var destination))
+                    {
+                        return destination;
+                    }
+
+                    return null;
+                }, result);
             }
 
             return result;
         }
 
-        private static void ExtractNameTree(DictionaryToken nameTreeNodeDictionary, Catalog catalog, IPdfTokenScanner pdfScanner,
-            bool isLenientParsing,
-            ILog log,
-            Dictionary<string, ExplicitDestination> explicitDestinations)
-        {
-            if (nameTreeNodeDictionary.TryGet(NameToken.Names, pdfScanner, out ArrayToken nodeNames))
-            {
-                for (var i = 0; i < nodeNames.Length; i += 2)
-                {
-                    if (!(nodeNames[i] is IDataToken<string> key))
-                    {
-                        continue;
-                    }
-
-                    var value = nodeNames[i + 1];
-
-                    if (TryReadExplicitDestination(value, catalog, pdfScanner, log, out var destination))
-                    {
-                        explicitDestinations[key.Data] = destination;
-                    }
-                }
-            }
-
-            if (nameTreeNodeDictionary.TryGet(NameToken.Kids, pdfScanner, out ArrayToken kids))
-            {
-                foreach (var kid in kids.Data)
-                {
-                    if (DirectObjectFinder.TryGet(kid, pdfScanner, out DictionaryToken kidDictionary))
-                    {
-                        ExtractNameTree(kidDictionary, catalog, pdfScanner, isLenientParsing, log, explicitDestinations);
-                    }
-                    else if (!isLenientParsing)
-                    {
-                        throw new PdfDocumentFormatException($"Invalid kids entry in PDF name tree: {kid} in {kids}.");
-                    }
-                }
-            }
-        }
-
         private static bool TryReadExplicitDestination(IToken value, Catalog catalog, IPdfTokenScanner pdfScanner,
             ILog log, out ExplicitDestination destination)
         {
diff --git a/src/UglyToad.PdfPig/Parser/Parts/NameTreeParser.cs b/src/UglyToad.PdfPig/Parser/Parts/NameTreeParser.cs
new file mode 100644
index 00000000..83910b44
--- /dev/null
+++ b/src/UglyToad.PdfPig/Parser/Parts/NameTreeParser.cs
@@ -0,0 +1,94 @@
+namespace UglyToad.PdfPig.Parser.Parts
+{
+    using System;
+    using System.Collections.Generic;
+    using Exceptions;
+    using Tokenization.Scanner;
+    using Tokens;
+
+    /// <summary>
+    /// Flattens a PDF name tree (nodes with Names and Kids entries) into a single dictionary.
+    /// </summary>
+    internal static class NameTreeParser
+    {
+        /// <summary>
+        /// Flatten the name tree rooted at <paramref name="nameTreeNodeDictionary"/> into a new dictionary.
+        /// </summary>
+        /// <param name="nameTreeNodeDictionary">The root node of the name tree.</param>
+        /// <param name="pdfScanner">The scanner passed to every token lookup made while walking the tree.</param>
+        /// <param name="isLenientParsing">When <see langword="true"/> malformed entries are skipped instead of causing an exception.</param>
+        /// <param name="valuesFactory">Converts a value token to a result; entries for which it returns <see langword="null"/> are omitted.</param>
+        /// <returns>The converted values keyed by their name tree key.</returns>
+        public static IReadOnlyDictionary<string, TResult> FlattenNameTreeToDictionary<TResult>(DictionaryToken nameTreeNodeDictionary,
+            IPdfTokenScanner pdfScanner,
+            bool isLenientParsing,
+            Func<IToken, TResult> valuesFactory) where TResult : class
+        {
+            var result = new Dictionary<string, TResult>();
+
+            FlattenNameTree(nameTreeNodeDictionary, pdfScanner, isLenientParsing, valuesFactory, result);
+
+            return result;
+        }
+
+        /// <summary>
+        /// Add the entries of the name tree node <paramref name="nameTreeNodeDictionary"/> and all of its kids to <paramref name="result"/>.
+        /// A key which occurs more than once keeps the value read last.
+        /// </summary>
+        /// <exception cref="PdfDocumentFormatException">
+        /// The tree contains a key without a value or an invalid kid and <paramref name="isLenientParsing"/> is <see langword="false"/>.
+        /// </exception>
+        public static void FlattenNameTree<TResult>(DictionaryToken nameTreeNodeDictionary,
+            IPdfTokenScanner pdfScanner,
+            bool isLenientParsing,
+            Func<IToken, TResult> valuesFactory,
+            Dictionary<string, TResult> result) where TResult : class
+        {
+            if (nameTreeNodeDictionary.TryGet(NameToken.Names, pdfScanner, out ArrayToken nodeNames))
+            {
+                for (var i = 0; i < nodeNames.Length; i += 2)
+                {
+                    if (!(nodeNames[i] is IDataToken<string> key))
+                    {
+                        continue;
+                    }
+
+                    // Names holds alternating keys and values, a trailing key without a value cannot be read.
+                    if (i + 1 >= nodeNames.Length)
+                    {
+                        if (isLenientParsing)
+                        {
+                            break;
+                        }
+
+                        throw new PdfDocumentFormatException($"Invalid names entry in PDF name tree, key without a value: {key.Data} in {nodeNames}.");
+                    }
+
+                    var valueToken = nodeNames[i + 1];
+
+                    var value = valuesFactory(valueToken);
+
+                    if (value != null)
+                    {
+                        result[key.Data] = value;
+                    }
+                }
+            }
+
+            if (nameTreeNodeDictionary.TryGet(NameToken.Kids, pdfScanner, out ArrayToken kids))
+            {
+                foreach (var kid in kids.Data)
+                {
+                    if (DirectObjectFinder.TryGet(kid, pdfScanner, out DictionaryToken kidDictionary))
+                    {
+                        FlattenNameTree(kidDictionary, pdfScanner, isLenientParsing, valuesFactory, result);
+                    }
+                    else if (!isLenientParsing)
+                    {
+                        throw new PdfDocumentFormatException($"Invalid kids entry in PDF name tree: {kid} in {kids}.");
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/PdfDocument.cs b/src/UglyToad.PdfPig/PdfDocument.cs
index 5ad5d296..ecc08a8f 100644
--- a/src/UglyToad.PdfPig/PdfDocument.cs
+++ b/src/UglyToad.PdfPig/PdfDocument.cs
@@ -62,6 +62,11 @@
         [NotNull]
         public Structure Structure { get; }
 
+        /// <summary>
+        /// Access to rare or advanced features of the PDF specification.
+        /// </summary>
+        public AdvancedPdfDocumentAccess Advanced { get; }
+
         /// <summary>
         /// The version number of the PDF specification which this file conforms to, for example 1.4.
         /// </summary>
@@ -104,6 +109,7 @@
             Information = information ?? throw new ArgumentNullException(nameof(information));
             pages = new Pages(catalog, pageFactory, isLenientParsing, pdfScanner);
             Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
+            Advanced = new AdvancedPdfDocumentAccess(pdfScanner, filterProvider, catalog, isLenientParsing);
             documentForm = new Lazy<AcroForm>(() => acroFormFactory.GetAcroForm(catalog));
         }
 
@@ -242,6 +248,7 @@
         {
             try
             {
+                Advanced.Dispose();
                 pdfScanner.Dispose();
                 inputBytes.Dispose();
             }