diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderFromExistingTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderFromExistingTests.cs new file mode 100644 index 00000000..e5428c6f --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderFromExistingTests.cs @@ -0,0 +1,24 @@ +namespace UglyToad.PdfPig.Tests.Writer +{ + using System.IO; + using Integration; + using PdfPig.Writer; + using Xunit; + + public class PdfDocumentBuilderFromExistingTests + { + [Fact] + public void LoadAndSaveExistingNoModifications() + { + var path = IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf"); + + var bytes = File.ReadAllBytes(path); + + var builder = PdfDocumentBuilder.FromPdf(bytes); + + var output = builder.Build(); + + Assert.NotNull(output); + } + } +} diff --git a/src/UglyToad.PdfPig/Content/Pages.cs b/src/UglyToad.PdfPig/Content/Pages.cs index 06752c6c..c6375b94 100644 --- a/src/UglyToad.PdfPig/Content/Pages.cs +++ b/src/UglyToad.PdfPig/Content/Pages.cs @@ -22,16 +22,23 @@ Count = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count); } - + public Page GetPage(int pageNumber, bool clipPaths) { if (pageNumber <= 0 || pageNumber > Count) { - throw new ArgumentOutOfRangeException(nameof(pageNumber), + throw new ArgumentOutOfRangeException(nameof(pageNumber), $"Page number {pageNumber} invalid, must be between 1 and {Count}."); } var pageNode = catalog.GetPageNode(pageNumber); + return CreateFromPageTreeNode(pageNode, pdfScanner, pageFactory, pageNumber, clipPaths); + } + + public static Page CreateFromPageTreeNode(PageTreeNode pageNode, IPdfTokenScanner pdfScanner, + IPageFactory pageFactory, + int pageNumber, bool clipPaths) + { var pageStack = new Stack(); var currentNode = pageNode; @@ -42,7 +49,7 @@ } var pageTreeMembers = new PageTreeMembers(); - + while (pageStack.Count > 0) { currentNode = pageStack.Pop(); @@ -51,7 +58,7 @@ { pageTreeMembers.ParentResources.Enqueue(resourcesDictionary); } - + if 
(currentNode.NodeDictionary.TryGet(NameToken.MediaBox, pdfScanner, out ArrayToken mediaBox)) { pageTreeMembers.MediaBox = new MediaBox(mediaBox.ToRectangle(pdfScanner)); @@ -64,7 +71,7 @@ } var page = pageFactory.Create(pageNumber, pageNode.NodeDictionary, pageTreeMembers, clipPaths); - + return page; } } diff --git a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs index ef7c5c98..bdb16b1d 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs @@ -52,6 +52,11 @@ namespace UglyToad.PdfPig.Writer /// internal IReadOnlyDictionary Fonts => fonts.ToDictionary(x => x.Key, x => x.Value.FontProgram); + /// + /// Create a builder from an existing PDF file. + /// + public static PdfDocumentBuilder FromPdf(IReadOnlyList bytes) => PdfDocumentToPdfDocumentBuilderFactory.Convert(new ByteArrayInputBytes(bytes)); + /// /// Determines whether the bytes of the TrueType font file provided can be used in a PDF document. 
///
diff --git a/src/UglyToad.PdfPig/Writer/PdfDocumentToPdfDocumentBuilderFactory.cs b/src/UglyToad.PdfPig/Writer/PdfDocumentToPdfDocumentBuilderFactory.cs
new file mode 100644
index 00000000..60015f83
--- /dev/null
+++ b/src/UglyToad.PdfPig/Writer/PdfDocumentToPdfDocumentBuilderFactory.cs
@@ -0,0 +1,169 @@
+namespace UglyToad.PdfPig.Writer
+{
+    using System;
+    using System.Collections.Generic;
+    using Content;
+    using Core;
+    using CrossReference;
+    using Encryption;
+    using Exceptions;
+    using Filters;
+    using Graphics;
+    using Logging;
+    using Parser;
+    using Parser.FileStructure;
+    using Parser.Parts;
+    using PdfFonts;
+    using PdfFonts.Parser;
+    using PdfFonts.Parser.Handlers;
+    using PdfFonts.Parser.Parts;
+    using PdfPig.Fonts.SystemFonts;
+    using Tokenization.Scanner;
+    using Tokens;
+
+    /// <summary>
+    /// Reads an existing PDF and replays each page's content operations into a new <see cref="PdfDocumentBuilder"/>.
+    /// </summary>
+    internal static class PdfDocumentToPdfDocumentBuilderFactory
+    {
+        private static readonly ILog Log = new NoOpLog();
+        private static readonly IFilterProvider FilterProvider = DefaultFilterProvider.Instance;
+
+        /// <summary>
+        /// Parse the PDF in <paramref name="inputBytes"/> with lenient parsing disabled and create a builder
+        /// holding one page per page node in the page tree, sized from the source page and carrying its operations.
+        /// </summary>
+        /// <param name="inputBytes">The bytes of the PDF document to load.</param>
+        /// <returns>A builder with a page added for every page node found in the document.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="inputBytes"/> is null.</exception>
+        /// <exception cref="PdfDocumentEncryptedException">The trailer's encryption token yields an encryption dictionary.</exception>
+        public static PdfDocumentBuilder Convert(IInputBytes inputBytes)
+        {
+            if (inputBytes == null)
+            {
+                throw new ArgumentNullException(nameof(inputBytes));
+            }
+
+            var coreScanner = new CoreTokenScanner(inputBytes);
+
+            const bool isLenientParsing = false;
+
+            var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log);
+
+            var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log),
+                new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider));
+
+            CrossReferenceTable crossReference = null;
+
+            // The lambda captures the local, so any invocation after the assignment below sees the parsed table.
+            // ReSharper disable once AccessToModifiedClosure
+            var locationProvider = new ObjectLocationProvider(() => crossReference, inputBytes);
+
+            var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, FilterProvider, NoOpEncryptionHandler.Instance);
+
+            var crossReferenceOffset = FileTrailerParser.GetFirstCrossReferenceOffset(inputBytes, coreScanner, isLenientParsing);
+            crossReference = crossReferenceParser.Parse(inputBytes, isLenientParsing, crossReferenceOffset, version.OffsetInFile, pdfScanner, coreScanner);
+
+            var (rootReference, rootDictionary) = ParseTrailer(crossReference, isLenientParsing,
+                pdfScanner,
+                out var encryptionDictionary);
+
+            if (encryptionDictionary != null)
+            {
+                throw new PdfDocumentEncryptedException("Unable to edit document with password");
+            }
+
+            var cidFontFactory = new CidFontFactory(pdfScanner, FilterProvider);
+            var encodingReader = new EncodingReader(pdfScanner);
+
+            var type1Handler = new Type1FontHandler(pdfScanner, FilterProvider, encodingReader);
+
+            var fontFactory = new FontFactory(Log, new Type0FontHandler(cidFontFactory,
+                    FilterProvider, pdfScanner),
+                new TrueTypeFontHandler(Log, pdfScanner, FilterProvider, encodingReader, SystemFontFinder.Instance,
+                    type1Handler),
+                type1Handler,
+                new Type3FontHandler(pdfScanner, FilterProvider, encodingReader));
+
+            var resourceContainer = new ResourceStore(pdfScanner, fontFactory);
+
+            var catalog = CatalogFactory.Create(rootReference, rootDictionary, pdfScanner, isLenientParsing);
+
+            var pageFactory = new PageFactory(pdfScanner, resourceContainer, FilterProvider,
+                new PageContentParser(new ReflectionGraphicsStateOperationFactory()),
+                Log);
+
+            var builder = new PdfDocumentBuilder();
+
+            var number = 1;
+            foreach (var node in GetPages(catalog.PageTree))
+            {
+                // First, what resources can we define, fonts, etc.
+                // Second, we need to copy resource and dictionary keys we don't understand.
+                // Third, we need to re-use the inherited properties where possible to prevent double work.
+                var page = Pages.CreateFromPageTreeNode(node, pdfScanner, pageFactory, number++, false);
+                var pageBuilder = builder.AddPage(page.Width, page.Height);
+                pageBuilder.Advanced.Operations.AddRange(page.Operations);
+            }
+
+            return builder;
+        }
+
+        /// <summary>
+        /// Resolve the dictionary referenced by the trailer's Root entry and, when the trailer
+        /// has an encryption token, read the encryption dictionary it points to.
+        /// </summary>
+        /// <param name="crossReferenceTable">The parsed cross-reference table whose trailer is inspected.</param>
+        /// <param name="isLenientParsing">When true, a missing Type entry on the root dictionary is filled in as Catalog.</param>
+        /// <param name="pdfTokenScanner">The scanner handed to the object finder and the encryption dictionary reader.</param>
+        /// <param name="encryptionDictionary">The encryption dictionary, or null when the trailer has no encryption token.</param>
+        /// <returns>The trailer's root reference together with the resolved root dictionary.</returns>
+        /// <exception cref="PdfDocumentFormatException">The encryption token does not resolve to a dictionary.</exception>
+        private static (IndirectReference, DictionaryToken) ParseTrailer(CrossReferenceTable crossReferenceTable, bool isLenientParsing, IPdfTokenScanner pdfTokenScanner,
+            out EncryptionDictionary encryptionDictionary)
+        {
+            encryptionDictionary = null;
+
+            if (crossReferenceTable.Trailer.EncryptionToken != null)
+            {
+                if (!DirectObjectFinder.TryGet(crossReferenceTable.Trailer.EncryptionToken, pdfTokenScanner, out DictionaryToken encryptionDictionaryToken))
+                {
+                    throw new PdfDocumentFormatException($"Unrecognized encryption token in trailer: {crossReferenceTable.Trailer.EncryptionToken}.");
+                }
+
+                encryptionDictionary = EncryptionDictionaryFactory.Read(encryptionDictionaryToken, pdfTokenScanner);
+            }
+
+            var rootDictionary = DirectObjectFinder.Get<DictionaryToken>(crossReferenceTable.Trailer.Root, pdfTokenScanner);
+
+            if (!rootDictionary.ContainsKey(NameToken.Type) && isLenientParsing)
+            {
+                rootDictionary = rootDictionary.With(NameToken.Type, NameToken.Catalog);
+            }
+
+            return (crossReferenceTable.Trailer.Root, rootDictionary);
+        }
+
+        /// <summary>
+        /// Walk the page tree depth-first from <paramref name="root"/>, yielding each node flagged as a page in child order.
+        /// </summary>
+        /// <param name="root">The tree node to start from.</param>
+        /// <returns>A lazily evaluated sequence of the page nodes at or beneath <paramref name="root"/>.</returns>
+        private static IEnumerable<PageTreeNode> GetPages(PageTreeNode root)
+        {
+            if (root.IsPage)
+            {
+                yield return root;
+                yield break;
+            }
+
+            foreach (var child in root.Children)
+            {
+                foreach (var node in GetPages(child))
+                {
+                    yield return node;
+                }
+            }
+        }
+    }
+}