From 1fe54c5f494bb0ff420abaea3c225754db8f67c7 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Thu, 26 Apr 2018 22:22:29 +0100 Subject: [PATCH] add xobjects to pages, fix parsing truetype fonts where the glyphs use the repeat flag. --- .../Graphics/TestOperationContext.cs | 9 ++++ .../Integration/OldGutnishTests.cs | 11 ++++ src/UglyToad.PdfPig/Content/IResourceStore.cs | 2 + src/UglyToad.PdfPig/Content/PageContent.cs | 34 +++++++++++- .../Content/ResourceContainer.cs | 9 ++++ .../Filters/DctDecodeFilter.cs | 13 +++++ .../Fonts/TrueType/Tables/GlyphDataTable.cs | 12 +++++ .../Graphics/ContentStreamProcessor.cs | 53 ++++++++++++++++--- .../Graphics/IOperationContext.cs | 2 + .../Graphics/Operations/InvokeNamedXObject.cs | 3 ++ .../Graphics/XObjectContentRecord.cs | 25 +++++++++ src/UglyToad.PdfPig/Parser/PageFactory.cs | 8 ++- .../Parser/PdfDocumentFactory.cs | 3 +- .../Tokens/IndirectReferenceToken.cs | 6 ++- src/UglyToad.PdfPig/XObject/XObjectFactory.cs | 47 ++++++++++++++++ src/UglyToad.PdfPig/XObject/XObjectType.cs | 9 ++++ 16 files changed, 233 insertions(+), 13 deletions(-) create mode 100644 src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs create mode 100644 src/UglyToad.PdfPig/Graphics/XObjectContentRecord.cs create mode 100644 src/UglyToad.PdfPig/XObject/XObjectFactory.cs create mode 100644 src/UglyToad.PdfPig/XObject/XObjectType.cs diff --git a/src/UglyToad.PdfPig.Tests/Graphics/TestOperationContext.cs b/src/UglyToad.PdfPig.Tests/Graphics/TestOperationContext.cs index 53ce2b25..860d5ee7 100644 --- a/src/UglyToad.PdfPig.Tests/Graphics/TestOperationContext.cs +++ b/src/UglyToad.PdfPig.Tests/Graphics/TestOperationContext.cs @@ -44,6 +44,10 @@ public void ShowPositionedText(IReadOnlyList tokens) { } + + public void ApplyXObject(StreamToken xObjectStream) + { + } } internal class TestResourceStore : IResourceStore @@ -56,5 +60,10 @@ { return null; } + + public StreamToken GetXObject(NameToken name) + { + return null; + } } } diff --git a/src/UglyToad.PdfPig.Tests/Integration/OldGutnishTests.cs b/src/UglyToad.PdfPig.Tests/Integration/OldGutnishTests.cs index a9bec116..6cc169c3 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/OldGutnishTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/OldGutnishTests.cs @@ -37,5 +37,16 @@ } } + [Fact] + public void GetsImageOnPageOne() + { + using (var document = PdfDocument.Open(GetFilename())) + { + var page = document.GetPage(1); + + page.Content.GetImages(); + } + } + } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Content/IResourceStore.cs b/src/UglyToad.PdfPig/Content/IResourceStore.cs index 7ed745d4..57b8f936 100644 --- a/src/UglyToad.PdfPig/Content/IResourceStore.cs +++ b/src/UglyToad.PdfPig/Content/IResourceStore.cs @@ -8,5 +8,7 @@ void LoadResourceDictionary(DictionaryToken resourceDictionary, bool isLenientParsing); IFont GetFont(NameToken name); + + StreamToken GetXObject(NameToken name); } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Content/PageContent.cs b/src/UglyToad.PdfPig/Content/PageContent.cs index 1c7b832b..416d4c15 100644 --- a/src/UglyToad.PdfPig/Content/PageContent.cs +++ b/src/UglyToad.PdfPig/Content/PageContent.cs @@ -1,7 +1,10 @@ namespace UglyToad.PdfPig.Content { using System.Collections.Generic; + using Graphics; using Graphics.Operations; + using Tokenization.Scanner; + using XObject; /// /// @@ -12,8 +15,35 @@ /// internal class PageContent { - internal IReadOnlyList GraphicsStateOperations { get; set; } + private readonly IReadOnlyDictionary> xObjects; + private readonly IPdfTokenScanner pdfScanner; + private readonly XObjectFactory xObjectFactory; + private readonly bool isLenientParsing; - public IReadOnlyList Letters { get; set; } + internal IReadOnlyList GraphicsStateOperations { get; } + + public IReadOnlyList Letters { get; } + + internal PageContent(IReadOnlyList graphicsStateOperations, IReadOnlyList letters, + IReadOnlyDictionary> xObjects, + IPdfTokenScanner pdfScanner, + XObjectFactory xObjectFactory, + bool isLenientParsing) + { + GraphicsStateOperations = graphicsStateOperations; + Letters = letters; + this.xObjects = xObjects; + this.pdfScanner = pdfScanner; + this.xObjectFactory = xObjectFactory; + this.isLenientParsing = isLenientParsing; + } + + public void GetImages() + { + foreach (var contentRecord in xObjects[XObjectType.Image]) + { + xObjectFactory.CreateImage(contentRecord, pdfScanner, isLenientParsing); + } + } } } diff --git a/src/UglyToad.PdfPig/Content/ResourceContainer.cs b/src/UglyToad.PdfPig/Content/ResourceContainer.cs index 468fe814..b9a748ad 100644 --- a/src/UglyToad.PdfPig/Content/ResourceContainer.cs +++ b/src/UglyToad.PdfPig/Content/ResourceContainer.cs @@ -89,6 +89,15 @@ return font; } + + public StreamToken GetXObject(NameToken name) + { + var reference = currentResourceState[name]; + + var stream = DirectObjectFinder.Get(new IndirectReferenceToken(reference), scanner); + + return stream; + } } } diff --git a/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs new file mode 100644 index 00000000..0976a382 --- /dev/null +++ b/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs @@ -0,0 +1,13 @@ +namespace UglyToad.PdfPig.Filters +{ + using System; + using Tokenization.Tokens; + + internal class DctDecodeFilter : IFilter + { + public byte[] Decode(byte[] input, DictionaryToken streamDictionary, int filterIndex) + { + throw new NotImplementedException(); + } + } +} diff --git a/src/UglyToad.PdfPig/Fonts/TrueType/Tables/GlyphDataTable.cs b/src/UglyToad.PdfPig/Fonts/TrueType/Tables/GlyphDataTable.cs index 2a724445..ea401d6b 100644 --- a/src/UglyToad.PdfPig/Fonts/TrueType/Tables/GlyphDataTable.cs +++ b/src/UglyToad.PdfPig/Fonts/TrueType/Tables/GlyphDataTable.cs @@ -224,6 +224,18 @@ for (var i = 0; i < pointCount; i++) { result[i] = (SimpleGlyphFlags)data.ReadByte(); + + if (result[i].HasFlag(SimpleGlyphFlags.Repeat)) + { + var numberOfRepeats = data.ReadByte(); + + for (int j = 0; j < numberOfRepeats; j++) + { + result[i + j + 1] = result[i]; + } + + i += numberOfRepeats; + } } return result; diff --git a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs index 25c8aeae..2262e2bd 100644 --- a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs +++ b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs @@ -9,28 +9,43 @@ using IO; using Operations; using PdfPig.Core; + using Tokenization.Scanner; using Tokenization.Tokens; using Util; + using XObject; internal class ContentStreamProcessor : IOperationContext { private readonly IResourceStore resourceStore; private readonly UserSpaceUnit userSpaceUnit; private readonly bool isLenientParsing; + private readonly IPdfTokenScanner pdfScanner; + private readonly XObjectFactory xObjectFactory; private Stack graphicsStack = new Stack(); public TextMatrices TextMatrices { get; } = new TextMatrices(); public int StackSize => graphicsStack.Count; - + + private readonly Dictionary> xObjects = new Dictionary> + { + {XObjectType.Form, new List()}, + {XObjectType.Image, new List()}, + {XObjectType.PostScript, new List()} + }; + public List Letters = new List(); - public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore, UserSpaceUnit userSpaceUnit, bool isLenientParsing) + public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore, UserSpaceUnit userSpaceUnit, bool isLenientParsing, + IPdfTokenScanner pdfScanner, + XObjectFactory xObjectFactory) { this.resourceStore = resourceStore; this.userSpaceUnit = userSpaceUnit; this.isLenientParsing = isLenientParsing; + this.pdfScanner = pdfScanner; + this.xObjectFactory = xObjectFactory; graphicsStack.Push(new CurrentGraphicsState()); } @@ -40,11 +55,7 @@ ProcessOperations(operations); - return new PageContent - { - GraphicsStateOperations = operations, - Letters = Letters - }; + return new PageContent(operations, Letters, xObjects, pdfScanner, xObjectFactory, isLenientParsing); } private void ProcessOperations(IReadOnlyList operations) @@ -205,6 +216,34 @@ } } + public void ApplyXObject(StreamToken xObjectStream) + { + // For now we will determine the type and store the object with the graphics state information preceding it. + // Then consumers of the page can request the object/s to be retrieved by type. + var subType = (NameToken)xObjectStream.StreamDictionary.Data[NameToken.Subtype.Data]; + + var state = GetCurrentState(); + + var matrix = state.CurrentTransformationMatrix; + + if (subType.Equals(NameToken.Ps)) + { + xObjects[XObjectType.PostScript].Add(new XObjectContentRecord(XObjectType.PostScript, xObjectStream, matrix)); + } + else if (subType.Equals(NameToken.Image)) + { + xObjects[XObjectType.Image].Add(new XObjectContentRecord(XObjectType.Image, xObjectStream, matrix)); + } + else if (subType.Equals(NameToken.Form)) + { + xObjects[XObjectType.Form].Add(new XObjectContentRecord(XObjectType.Form, xObjectStream, matrix)); + } + else + { + throw new InvalidOperationException($"XObject encountered with unexpected SubType {subType}. {xObjectStream.StreamDictionary}."); + } + } + private void AdjustTextMatrix(decimal tx, decimal ty) { var matrix = TransformationMatrix.GetTranslationMatrix(tx, ty); diff --git a/src/UglyToad.PdfPig/Graphics/IOperationContext.cs b/src/UglyToad.PdfPig/Graphics/IOperationContext.cs index 80cf9a90..e75575ff 100644 --- a/src/UglyToad.PdfPig/Graphics/IOperationContext.cs +++ b/src/UglyToad.PdfPig/Graphics/IOperationContext.cs @@ -19,5 +19,7 @@ void ShowText(IInputBytes bytes); void ShowPositionedText(IReadOnlyList tokens); + + void ApplyXObject(StreamToken xObjectStream); } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Graphics/Operations/InvokeNamedXObject.cs b/src/UglyToad.PdfPig/Graphics/Operations/InvokeNamedXObject.cs index 3f877ce8..4e3f70f2 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/InvokeNamedXObject.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/InvokeNamedXObject.cs @@ -18,6 +18,9 @@ public void Run(IOperationContext operationContext, IResourceStore resourceStore) { + var xobject = resourceStore.GetXObject(Name); + + operationContext.ApplyXObject(xobject); } public override string ToString() diff --git a/src/UglyToad.PdfPig/Graphics/XObjectContentRecord.cs b/src/UglyToad.PdfPig/Graphics/XObjectContentRecord.cs new file mode 100644 index 00000000..faf4d9ad --- /dev/null +++ b/src/UglyToad.PdfPig/Graphics/XObjectContentRecord.cs @@ -0,0 +1,25 @@ +namespace UglyToad.PdfPig.Graphics +{ + using System; + using PdfPig.Core; + using Tokenization.Tokens; + using Util.JetBrains.Annotations; + using XObject; + + internal class XObjectContentRecord + { + public XObjectType Type { get; } + + [NotNull] + public StreamToken Stream { get; } + + public TransformationMatrix AppliedTransformation { get; } + + public XObjectContentRecord(XObjectType type, StreamToken stream, TransformationMatrix appliedTransformation) + { + Type = type; + Stream = stream ?? throw new ArgumentNullException(nameof(stream)); + AppliedTransformation = appliedTransformation; + } + } +} diff --git a/src/UglyToad.PdfPig/Parser/PageFactory.cs b/src/UglyToad.PdfPig/Parser/PageFactory.cs index 0beab45b..c34f3190 100644 --- a/src/UglyToad.PdfPig/Parser/PageFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PageFactory.cs @@ -14,20 +14,24 @@ using Tokenization.Scanner; using Tokenization.Tokens; using Util; + using XObject; internal class PageFactory : IPageFactory { private readonly IResourceStore resourceStore; private readonly IFilterProvider filterProvider; private readonly IPageContentParser pageContentParser; + private readonly XObjectFactory xObjectFactory; private readonly IPdfTokenScanner pdfScanner; public PageFactory(IPdfTokenScanner pdfScanner, IResourceStore resourceStore, IFilterProvider filterProvider, - IPageContentParser pageContentParser) + IPageContentParser pageContentParser, + XObjectFactory xObjectFactory) { this.resourceStore = resourceStore; this.filterProvider = filterProvider; this.pageContentParser = pageContentParser; + this.xObjectFactory = xObjectFactory; this.pdfScanner = pdfScanner; } @@ -111,7 +115,7 @@ var operations = pageContentParser.Parse(new ByteArrayInputBytes(contentBytes)); - var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, isLenientParsing); + var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, isLenientParsing, pdfScanner, xObjectFactory); return context.Process(operations); } diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs index 7b79ab9e..6b735c1d 100644 --- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs @@ -21,6 +21,7 @@ using Tokenization.Scanner; using Tokenization.Tokens; using Util; + using XObject; internal static class PdfDocumentFactory { @@ -109,7 +110,7 @@ var resourceContainer = new ResourceContainer(pdfScanner, fontFactory); - var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory())); + var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()), new XObjectFactory()); var informationFactory = new DocumentInformationFactory(); diff --git a/src/UglyToad.PdfPig/Tokenization/Tokens/IndirectReferenceToken.cs b/src/UglyToad.PdfPig/Tokenization/Tokens/IndirectReferenceToken.cs index d957d44c..71f19971 100644 --- a/src/UglyToad.PdfPig/Tokenization/Tokens/IndirectReferenceToken.cs +++ b/src/UglyToad.PdfPig/Tokenization/Tokens/IndirectReferenceToken.cs @@ -10,6 +10,10 @@ { Data = data; } - } + public override string ToString() + { + return $"{Data}"; + } + } } diff --git a/src/UglyToad.PdfPig/XObject/XObjectFactory.cs b/src/UglyToad.PdfPig/XObject/XObjectFactory.cs new file mode 100644 index 00000000..d13358ee --- /dev/null +++ b/src/UglyToad.PdfPig/XObject/XObjectFactory.cs @@ -0,0 +1,47 @@ +namespace UglyToad.PdfPig.XObject +{ + using System; + using Graphics; + using Tokenization.Scanner; + using Tokenization.Tokens; + + internal class XObjectFactory + { + public void CreateImage(XObjectContentRecord xObject, IPdfTokenScanner pdfScanner, bool isLenientParsing) + { + if (xObject == null) + { + throw new ArgumentNullException(nameof(xObject)); + } + + if (xObject.Type != XObjectType.Image) + { + throw new InvalidOperationException($"Cannot create an image from an XObject with type: {xObject.Type}."); + } + + var width = xObject.Stream.StreamDictionary.Get(NameToken.Width, pdfScanner).Int; + var height = xObject.Stream.StreamDictionary.Get(NameToken.Height, pdfScanner).Int; + + var isJpxDecode = xObject.Stream.StreamDictionary.TryGet(NameToken.Filter, out var token) + && token is NameToken filterName + && filterName.Equals(NameToken.JpxDecode); + + if (isJpxDecode) + { + return; + } + + var isImageMask = xObject.Stream.StreamDictionary.TryGet(NameToken.ImageMask, out var maskToken) + && maskToken is BooleanToken maskBoolean + && maskBoolean.Data; + + if (isImageMask) + { + return; + } + + var bitsPerComponents = xObject.Stream.StreamDictionary.Get(NameToken.BitsPerComponent, pdfScanner).Int; + + } + } +} diff --git a/src/UglyToad.PdfPig/XObject/XObjectType.cs b/src/UglyToad.PdfPig/XObject/XObjectType.cs new file mode 100644 index 00000000..7c285624 --- /dev/null +++ b/src/UglyToad.PdfPig/XObject/XObjectType.cs @@ -0,0 +1,9 @@ +namespace UglyToad.PdfPig.XObject +{ + internal enum XObjectType + { + Image, + Form, + PostScript + } +}