diff --git a/src/UglyToad.PdfPig.Tests/Graphics/Operations/GraphicsStateOperationTests.cs b/src/UglyToad.PdfPig.Tests/Graphics/Operations/GraphicsStateOperationTests.cs index 1aacdee0..b7c8c97c 100644 --- a/src/UglyToad.PdfPig.Tests/Graphics/Operations/GraphicsStateOperationTests.cs +++ b/src/UglyToad.PdfPig.Tests/Graphics/Operations/GraphicsStateOperationTests.cs @@ -44,7 +44,11 @@ } else if (operationType == typeof(EndInlineImage)) { - operation = new EndInlineImage(new List(), new List()); + operation = new EndInlineImage(new List()); + } + else if (operationType == typeof(BeginInlineImageData)) + { + operation = new BeginInlineImageData(new Dictionary()); } else { diff --git a/src/UglyToad.PdfPig.Tests/Graphics/TestOperationContext.cs b/src/UglyToad.PdfPig.Tests/Graphics/TestOperationContext.cs index 508e8c20..9aceb9d9 100644 --- a/src/UglyToad.PdfPig.Tests/Graphics/TestOperationContext.cs +++ b/src/UglyToad.PdfPig.Tests/Graphics/TestOperationContext.cs @@ -77,5 +77,17 @@ public void SetNamedGraphicsState(NameToken stateName) { } + + public void BeginInlineImage() + { + } + + public void SetInlineImageProperties(IReadOnlyDictionary properties) + { + } + + public void EndInlineImage(IReadOnlyList bytes) + { + } } } diff --git a/src/UglyToad.PdfPig.Tests/Graphics/TestResourceStore.cs b/src/UglyToad.PdfPig.Tests/Graphics/TestResourceStore.cs deleted file mode 100644 index 66778454..00000000 --- a/src/UglyToad.PdfPig.Tests/Graphics/TestResourceStore.cs +++ /dev/null @@ -1,33 +0,0 @@ -namespace UglyToad.PdfPig.Tests.Graphics -{ - using Content; - using PdfPig.Fonts; - using PdfPig.Tokens; - - internal class TestResourceStore : IResourceStore - { - public void LoadResourceDictionary(DictionaryToken dictionary, bool isLenientParsing) - { - } - - public IFont GetFont(NameToken name) - { - return null; - } - - public StreamToken GetXObject(NameToken name) - { - return null; - } - - public DictionaryToken GetExtendedGraphicsStateDictionary(NameToken name) - { - return 
null; - } - - public IFont GetFontDirectly(IndirectReferenceToken fontReferenceToken, bool isLenientParsing) - { - return null; - } - } -} \ No newline at end of file diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/Single Page Images - from libre office.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/Single Page Images - from libre office.pdf new file mode 100644 index 00000000..98ebd288 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/Single Page Images - from libre office.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Integration/IntegrationDocumentTests.cs b/src/UglyToad.PdfPig.Tests/Integration/IntegrationDocumentTests.cs index b4847cb9..6af240a3 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/IntegrationDocumentTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/IntegrationDocumentTests.cs @@ -81,14 +81,14 @@ { var page = document.GetPage(i + 1); - var images = page.ExperimentalAccess.GetRawImages(); + var images = page.GetImages(); Assert.NotNull(images); foreach (var image in images) { - Assert.True(image.Width > 0, $"Image had width of zero on page {i + 1}."); - Assert.True(image.Height > 0, $"Image had height of zero on page {i + 1}."); + Assert.True(image.WidthInSamples > 0, $"Image had width of zero on page {i + 1}."); + Assert.True(image.HeightInSamples > 0, $"Image had height of zero on page {i + 1}."); } } } diff --git a/src/UglyToad.PdfPig.Tests/Integration/OldGutnishTests.cs b/src/UglyToad.PdfPig.Tests/Integration/OldGutnishTests.cs index fa501848..ffbbffbf 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/OldGutnishTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/OldGutnishTests.cs @@ -41,7 +41,7 @@ { var page = document.GetPage(1); - var images = page.ExperimentalAccess.GetRawImages().ToList(); + var images = page.GetImages().ToList(); Assert.Single(images); } } diff --git a/src/UglyToad.PdfPig.Tests/Integration/SinglePageLibreOfficeImages.cs 
b/src/UglyToad.PdfPig.Tests/Integration/SinglePageLibreOfficeImages.cs new file mode 100644 index 00000000..daf173d5 --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/SinglePageLibreOfficeImages.cs @@ -0,0 +1,87 @@ +namespace UglyToad.PdfPig.Tests.Integration +{ + using System; + using System.Linq; + using Xunit; + + public class SinglePageLibreOfficeImages + { + private static string GetFilePath() => IntegrationHelpers.GetDocumentPath(@"Single Page Images - from libre office.pdf"); + + [Fact] + public void Has3Images() + { + using (var document = PdfDocument.Open(GetFilePath(), ParsingOptions.LenientParsingOff)) + { + var page = document.GetPage(1); + + var images = page.GetImages().ToList(); + + Assert.Equal(3, images.Count); + } + } + + [Fact] + public void ImagesHaveCorrectDimensionsAndLocations() + { + using (var document = PdfDocument.Open(GetFilePath(), ParsingOptions.LenientParsingOff)) + { + var page = document.GetPage(1); + + var images = page.GetImages().OrderBy(x => x.Bounds.Width).ToList(); + + var pdfPigSquare = images[0]; + + Assert.Equal(148.3m, pdfPigSquare.Bounds.Width); + Assert.Equal(148.3m, pdfPigSquare.Bounds.Height); + Assert.Equal(60.1m, pdfPigSquare.Bounds.Left); + Assert.Equal(765.8m, pdfPigSquare.Bounds.Top); + + var pdfPigSquished = images[1]; + + Assert.Equal(206.8m, pdfPigSquished.Bounds.Width); + Assert.Equal(83.2m, pdfPigSquished.Bounds.Height); + Assert.Equal(309.8m, pdfPigSquished.Bounds.Left); + Assert.Equal(552.1m, pdfPigSquished.Bounds.Top); + + var birthdayPigs = images[2]; + + Assert.Equal(391m, birthdayPigs.Bounds.Width); + Assert.Equal(267.1m, birthdayPigs.Bounds.Height); + Assert.Equal(102.2m, birthdayPigs.Bounds.Left); + Assert.Equal(426.3m, birthdayPigs.Bounds.Top); + } + } + + [Fact] + public void HasCorrectText() + { + using (var document = PdfDocument.Open(GetFilePath(), ParsingOptions.LenientParsingOff)) + { + var page = document.GetPage(1); + Assert.Equal("Oink oink", page.Text); + } + } + + [Fact] + public 
void CanAccessImageBytesExceptUnsupported() + { + using (var document = PdfDocument.Open(GetFilePath(), ParsingOptions.LenientParsingOff)) + { + var page = document.GetPage(1); + foreach (var image in page.GetImages()) + { + try + { + Assert.NotNull(image.Bytes); + } + catch (NotSupportedException ) + { + // Should allow access to raw bytes. + Assert.NotNull(image.RawBytes); + } + } + } + } + } +} diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 1e8488e6..a8a28fe7 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -53,6 +53,8 @@ "UglyToad.PdfPig.Annotations.AnnotationType", "UglyToad.PdfPig.Content.Catalog", "UglyToad.PdfPig.Content.DocumentInformation", + "UglyToad.PdfPig.Content.InlineImage", + "UglyToad.PdfPig.Content.IPdfImage", "UglyToad.PdfPig.Content.Letter", "UglyToad.PdfPig.Content.Page", "UglyToad.PdfPig.Content.PageRotationDegrees", diff --git a/src/UglyToad.PdfPig.Tests/TestFilterProvider.cs b/src/UglyToad.PdfPig.Tests/TestFilterProvider.cs index 49b1113f..f024771b 100644 --- a/src/UglyToad.PdfPig.Tests/TestFilterProvider.cs +++ b/src/UglyToad.PdfPig.Tests/TestFilterProvider.cs @@ -11,6 +11,11 @@ return new List(); } + public IReadOnlyList GetNamedFilters(IReadOnlyList names) + { + return new List(); + } + public IReadOnlyList GetAllFilters() { return new List(); diff --git a/src/UglyToad.PdfPig.Tests/Tokens/TestPdfTokenScanner.cs b/src/UglyToad.PdfPig.Tests/Tokens/TestPdfTokenScanner.cs index f1913b7a..d1e42d2d 100644 --- a/src/UglyToad.PdfPig.Tests/Tokens/TestPdfTokenScanner.cs +++ b/src/UglyToad.PdfPig.Tests/Tokens/TestPdfTokenScanner.cs @@ -42,5 +42,9 @@ namespace UglyToad.PdfPig.Tests.Tokens { return Objects[reference]; } + + public void Dispose() + { + } } } diff --git a/src/UglyToad.PdfPig/Content/IPdfImage.cs b/src/UglyToad.PdfPig/Content/IPdfImage.cs new file mode 100644 index 00000000..2aeeff39 --- 
/dev/null +++ b/src/UglyToad.PdfPig/Content/IPdfImage.cs @@ -0,0 +1,94 @@ +namespace UglyToad.PdfPig.Content +{ + using System.Collections.Generic; + using Geometry; + using Graphics.Colors; + using Graphics.Core; + using XObjects; + + /// + /// An image in a PDF document, may be an or a PostScript image XObject (). + /// + public interface IPdfImage + { + /// + /// The placement rectangle of the image in PDF coordinates. + /// + PdfRectangle Bounds { get; } + + /// + /// The width of the image in samples. + /// + int WidthInSamples { get; } + + /// + /// The height of the image in samples. + /// + int HeightInSamples { get; } + + /// + /// The used to interpret the image. + /// This defines the number of color components per sample, e.g. + /// 1 component for , + /// 3 components for , + /// 4 components for , + /// etc. + /// This is not defined where is and is optional where the image is JPXEncoded for . + /// + ColorSpace? ColorSpace { get; } + + /// + /// The number of bits used to represent each color component. + /// + int BitsPerComponent { get; } + + /// + /// The bytes of the image with any filters decoded. + /// If the filter used to encode the bytes is not supported accessing this property will throw, access the + /// instead. + /// + IReadOnlyList Bytes { get; } + + /// + /// The encoded bytes of the image with all filters still applied. + /// + IReadOnlyList RawBytes { get; } + + /// + /// The color rendering intent to be used when rendering the image. + /// + RenderingIntent RenderingIntent { get; } + + /// + /// Indicates whether the image is to be treated as an image mask. + /// If the image is a monochrome image in which each sample + /// is specified by a single bit ( is 1). + /// The image represents a stencil where sample values represent places on the page + /// that should be marked with the current color or masked (not marked). 
+ /// + bool IsImageMask { get; } + + /// + /// Describes how to map image samples into the values appropriate for the + /// . + /// The image data is initially composed of values in the range 0 to 2^n - 1 + /// where n is . + /// The decode array contains a pair of numbers for each component in the . + /// The value from the image data is then interpolated into the values relevant to the + /// using the corresponding values of the decode array. + /// + IReadOnlyList Decode { get; } + + /// + /// Specifies whether interpolation is to be performed. Interpolation smooths images where a single component in the image + /// as defined may correspond to many pixels on the output device. The interpolation algorithm is implementation + /// dependent and is not defined by the specification. + /// + bool Interpolate { get; } + + /// + /// Whether this image is an or a . + /// + bool IsInlineImage { get; } + } +} diff --git a/src/UglyToad.PdfPig/Content/IResourceStore.cs b/src/UglyToad.PdfPig/Content/IResourceStore.cs index 112f6624..66b03752 100644 --- a/src/UglyToad.PdfPig/Content/IResourceStore.cs +++ b/src/UglyToad.PdfPig/Content/IResourceStore.cs @@ -14,5 +14,7 @@ DictionaryToken GetExtendedGraphicsStateDictionary(NameToken name); IFont GetFontDirectly(IndirectReferenceToken fontReferenceToken, bool isLenientParsing); + + bool TryGetNamedColorSpace(NameToken name, out IToken namedColorSpace); } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Content/InlineImage.cs b/src/UglyToad.PdfPig/Content/InlineImage.cs new file mode 100644 index 00000000..817aa2d8 --- /dev/null +++ b/src/UglyToad.PdfPig/Content/InlineImage.cs @@ -0,0 +1,98 @@ +namespace UglyToad.PdfPig.Content +{ + using System; + using System.Collections.Generic; + using System.Linq; + using Filters; + using Geometry; + using Graphics.Colors; + using Graphics.Core; + using Tokens; + + /// + /// + /// A small image that is completely defined directly inline within a 's content stream. 
+ /// + public class InlineImage : IPdfImage + { + private readonly Lazy> bytesFactory; + + /// + public PdfRectangle Bounds { get; } + + /// + public int WidthInSamples { get; } + + /// + public int HeightInSamples { get; } + + /// + public ColorSpace? ColorSpace { get; } + + /// + public int BitsPerComponent { get; } + + /// + public bool IsImageMask { get; } + + /// + public IReadOnlyList Decode { get; } + + /// + public bool IsInlineImage { get; } = true; + + /// + public RenderingIntent RenderingIntent { get; } + + /// + public bool Interpolate { get; } + + /// + public IReadOnlyList Bytes => bytesFactory.Value; + + /// + public IReadOnlyList RawBytes { get; } + + /// + /// Create a new . + /// + internal InlineImage(PdfRectangle bounds, int widthInSamples, int heightInSamples, int bitsPerComponent, bool isImageMask, + RenderingIntent renderingIntent, + bool interpolate, + ColorSpace? colorSpace, + IReadOnlyList decode, + IReadOnlyList bytes, + IReadOnlyList filters, + DictionaryToken streamDictionary) + { + Bounds = bounds; + WidthInSamples = widthInSamples; + HeightInSamples = heightInSamples; + ColorSpace = colorSpace; + Decode = decode; + BitsPerComponent = bitsPerComponent; + IsImageMask = isImageMask; + RenderingIntent = renderingIntent; + Interpolate = interpolate; + + RawBytes = bytes; + bytesFactory = new Lazy>(() => + { + var b = bytes.ToArray(); + for (var i = 0; i < filters.Count; i++) + { + var filter = filters[i]; + b = filter.Decode(b, streamDictionary, i); + } + + return b; + }); + } + + /// + public override string ToString() + { + return $"Inline Image (w {Bounds.Width}, h {Bounds.Height})"; + } + } +} diff --git a/src/UglyToad.PdfPig/Content/Page.cs b/src/UglyToad.PdfPig/Content/Page.cs index e18c0bf9..be633031 100644 --- a/src/UglyToad.PdfPig/Content/Page.cs +++ b/src/UglyToad.PdfPig/Content/Page.cs @@ -8,7 +8,6 @@ using Tokens; using Util; using Util.JetBrains.Annotations; - using XObjects; using Geometry; /// @@ -60,7 +59,7 @@ public 
decimal Height { get; } /// - /// The size of the page according to the standard page sizes or Custom if no matching standard size found. + /// The size of the page according to the standard page sizes or if no matching standard size found. /// public PageSize Size { get; } @@ -68,7 +67,7 @@ /// The parsed graphics state operations in the content stream for this page. /// public IReadOnlyList Operations => Content.GraphicsStateOperations; - + /// /// Access to members whose future locations within the API will change without warning. /// @@ -131,6 +130,11 @@ return (wordExtractor ?? DefaultWordExtractor.Instance).GetWords(Letters); } + /// + /// Gets any images on the page. + /// + public IEnumerable GetImages() => Content.GetImages(); + /// /// Provides access to useful members which will change in future releases. /// @@ -150,16 +154,6 @@ this.annotationProvider = annotationProvider; } - /// - /// Retrieve any images referenced in this page's content. - /// These are returned as s which are - /// raw data from the PDF's content rather than images. - /// - public IEnumerable GetRawImages() - { - return page.Content.GetImages(); - } - /// /// Get the annotation objects from the page. /// diff --git a/src/UglyToad.PdfPig/Content/PageContent.cs b/src/UglyToad.PdfPig/Content/PageContent.cs index 0ff1b699..54f3f9f7 100644 --- a/src/UglyToad.PdfPig/Content/PageContent.cs +++ b/src/UglyToad.PdfPig/Content/PageContent.cs @@ -1,14 +1,17 @@ namespace UglyToad.PdfPig.Content { + using System; using System.Collections.Generic; + using Filters; using Graphics; using Graphics.Operations; using Tokenization.Scanner; using XObjects; - using UglyToad.PdfPig.Geometry; + using Geometry; + using Util; /// - /// + /// Wraps content parsed from a page content stream for access. 
/// /// /// This should contain a replayable stack of drawing instructions for page content @@ -16,36 +19,46 @@ /// internal class PageContent { - private readonly IReadOnlyDictionary> xObjects; + private readonly IReadOnlyList> images; private readonly IPdfTokenScanner pdfScanner; - private readonly XObjectFactory xObjectFactory; + private readonly IFilterProvider filterProvider; + private readonly IResourceStore resourceStore; private readonly bool isLenientParsing; internal IReadOnlyList GraphicsStateOperations { get; } public IReadOnlyList Letters { get; } + public IReadOnlyList Paths { get; } - internal PageContent(IReadOnlyList graphicsStateOperations, IReadOnlyList letters, List paths, - IReadOnlyDictionary> xObjects, + internal PageContent(IReadOnlyList graphicsStateOperations, IReadOnlyList letters, + IReadOnlyList paths, + IReadOnlyList> images, IPdfTokenScanner pdfScanner, - XObjectFactory xObjectFactory, + IFilterProvider filterProvider, + IResourceStore resourceStore, bool isLenientParsing) { GraphicsStateOperations = graphicsStateOperations; Letters = letters; Paths = paths; - this.xObjects = xObjects; - this.pdfScanner = pdfScanner; - this.xObjectFactory = xObjectFactory; + this.images = images; + this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner)); + this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider)); + this.resourceStore = resourceStore ?? 
throw new ArgumentNullException(nameof(resourceStore)); this.isLenientParsing = isLenientParsing; } - public IEnumerable GetImages() + public IEnumerable GetImages() { - foreach (var contentRecord in xObjects[XObjectType.Image]) + foreach (var image in images) { - yield return xObjectFactory.CreateImage(contentRecord, pdfScanner, isLenientParsing); + + IPdfImage result = null; + image.Match(x => { result = XObjectFactory.ReadImage(x, pdfScanner, filterProvider, resourceStore, isLenientParsing); }, + x => { result = x; }); + + yield return result; } } } diff --git a/src/UglyToad.PdfPig/Content/ResourceContainer.cs b/src/UglyToad.PdfPig/Content/ResourceStore.cs similarity index 67% rename from src/UglyToad.PdfPig/Content/ResourceContainer.cs rename to src/UglyToad.PdfPig/Content/ResourceStore.cs index 15969430..e8a70ce9 100644 --- a/src/UglyToad.PdfPig/Content/ResourceContainer.cs +++ b/src/UglyToad.PdfPig/Content/ResourceStore.cs @@ -8,7 +8,7 @@ using Tokenization.Scanner; using Tokens; - internal class ResourceContainer : IResourceStore + internal class ResourceStore : IResourceStore { private readonly IPdfTokenScanner scanner; private readonly IFontFactory fontFactory; @@ -18,7 +18,9 @@ private readonly Dictionary extendedGraphicsStates = new Dictionary(); - public ResourceContainer(IPdfTokenScanner scanner, IFontFactory fontFactory) + private readonly Dictionary colorSpaceNames = new Dictionary(); + + public ResourceStore(IPdfTokenScanner scanner, IFontFactory fontFactory) { this.scanner = scanner; this.fontFactory = fontFactory; @@ -58,6 +60,39 @@ extendedGraphicsStates[name] = state; } } + + if (resourceDictionary.TryGet(NameToken.ColorSpace, scanner, out DictionaryToken colorSpaceDictionary)) + { + foreach (var nameColorSpacePair in colorSpaceDictionary.Data) + { + var name = NameToken.Create(nameColorSpacePair.Key); + + if (DirectObjectFinder.TryGet(nameColorSpacePair.Value, scanner, out NameToken colorSpaceName)) + { + colorSpaceNames[name] = colorSpaceName; 
+ } + else if (DirectObjectFinder.TryGet(nameColorSpacePair.Value, scanner, out ArrayToken colorSpaceArray)) + { + if (colorSpaceArray.Length == 0) + { + throw new PdfDocumentFormatException($"Empty ColorSpace array encountered in page resource dictionary: {resourceDictionary}."); + } + + var first = colorSpaceArray.Data[0]; + + if (!(first is NameToken arrayNamedColorSpace)) + { + throw new PdfDocumentFormatException($"Invalid ColorSpace array encountered in page resource dictionary: {colorSpaceArray}."); + } + + colorSpaceNames[name] = arrayNamedColorSpace; + } + else + { + throw new PdfDocumentFormatException($"Invalid ColorSpace token encountered in page resource dictionary: {nameColorSpacePair.Value}."); + } + } + } } private void LoadFontDictionary(DictionaryToken fontDictionary, bool isLenientParsing) @@ -115,6 +150,25 @@ return font; } + public bool TryGetNamedColorSpace(NameToken name, out IToken namedToken) + { + namedToken = null; + + if (name == null) + { + throw new ArgumentNullException(nameof(name)); + } + + if (!colorSpaceNames.TryGetValue(name, out var colorSpaceName)) + { + return false; + } + + namedToken = colorSpaceName; + + return true; + } + public StreamToken GetXObject(NameToken name) { var reference = currentResourceState[name]; diff --git a/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs b/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs index a4729ac2..f115de88 100644 --- a/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs +++ b/src/UglyToad.PdfPig/Filters/Ascii85Filter.cs @@ -33,8 +33,6 @@ using (var stream = new MemoryStream()) using (var writer = new BinaryWriter(stream)) { - - for (var i = 0; i < input.Count; i++) { var value = input[i]; diff --git a/src/UglyToad.PdfPig/Filters/CcittFaxDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/CcittFaxDecodeFilter.cs new file mode 100644 index 00000000..98707a0e --- /dev/null +++ b/src/UglyToad.PdfPig/Filters/CcittFaxDecodeFilter.cs @@ -0,0 +1,15 @@ +namespace UglyToad.PdfPig.Filters +{ + using System; + 
using System.Collections.Generic; + using Tokens; + + internal class CcittFaxDecodeFilter : IFilter + { + public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) + { + throw new NotSupportedException("The CCITT Fax Filter for image data is not currently supported. " + + "Try accessing the raw compressed data directly."); + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs index 1a850da4..ced83f76 100644 --- a/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs +++ b/src/UglyToad.PdfPig/Filters/DctDecodeFilter.cs @@ -8,7 +8,8 @@ { public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) { - throw new NotImplementedException(); + throw new NotSupportedException("The DCT (Discrete Cosine Transform) Filter indicates data is encoded in JPEG format. " + + "This filter is not currently supported but the raw data can be supplied to JPEG supporting libraries."); } } } diff --git a/src/UglyToad.PdfPig/Filters/IFilterProvider.cs b/src/UglyToad.PdfPig/Filters/IFilterProvider.cs index 4a702172..0460174c 100644 --- a/src/UglyToad.PdfPig/Filters/IFilterProvider.cs +++ b/src/UglyToad.PdfPig/Filters/IFilterProvider.cs @@ -7,6 +7,8 @@ { IReadOnlyList GetFilters(DictionaryToken dictionary); + IReadOnlyList GetNamedFilters(IReadOnlyList names); + IReadOnlyList GetAllFilters(); } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs b/src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs new file mode 100644 index 00000000..d51eb3fb --- /dev/null +++ b/src/UglyToad.PdfPig/Filters/Jbig2DecodeFilter.cs @@ -0,0 +1,15 @@ +namespace UglyToad.PdfPig.Filters +{ + using System; + using System.Collections.Generic; + using Tokens; + + internal class Jbig2DecodeFilter : IFilter + { + public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) + { + throw new 
NotSupportedException("The JBIG2 Filter for monochrome image data is not currently supported. " + + "Try accessing the raw compressed data directly."); + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs b/src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs new file mode 100644 index 00000000..9531b15e --- /dev/null +++ b/src/UglyToad.PdfPig/Filters/JpxDecodeFilter.cs @@ -0,0 +1,15 @@ +namespace UglyToad.PdfPig.Filters +{ + using System; + using System.Collections.Generic; + using Tokens; + + internal class JpxDecodeFilter : IFilter + { + public byte[] Decode(IReadOnlyList input, DictionaryToken streamDictionary, int filterIndex) + { + throw new NotSupportedException("The JPX Filter (JPEG2000) for image data is not currently supported. " + + "Try accessing the raw compressed data directly."); + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Filters/MemoryFilterProvider.cs b/src/UglyToad.PdfPig/Filters/MemoryFilterProvider.cs index fab84ca0..44d5b5d1 100644 --- a/src/UglyToad.PdfPig/Filters/MemoryFilterProvider.cs +++ b/src/UglyToad.PdfPig/Filters/MemoryFilterProvider.cs @@ -2,6 +2,7 @@ { using System; using System.Collections.Generic; + using System.Linq; using Exceptions; using Logging; using Tokens; @@ -15,7 +16,11 @@ { var ascii85 = new Ascii85Filter(); var asciiHex = new AsciiHexDecodeFilter(); + var ccitt = new CcittFaxDecodeFilter(); + var dct = new DctDecodeFilter(); var flate = new FlateFilter(decodeParameterResolver, pngPredictor, log); + var jbig2 = new Jbig2DecodeFilter(); + var jpx = new JpxDecodeFilter(); var runLength = new RunLengthFilter(); var lzw = new LzwFilter(decodeParameterResolver, pngPredictor); @@ -25,8 +30,14 @@ {NameToken.Ascii85DecodeAbbreviation.Data, ascii85}, {NameToken.AsciiHexDecode.Data, asciiHex}, {NameToken.AsciiHexDecodeAbbreviation.Data, asciiHex}, + {NameToken.CcittfaxDecode.Data, ccitt}, + {NameToken.CcittfaxDecodeAbbreviation.Data, ccitt}, + 
{NameToken.DctDecode.Data, dct}, + {NameToken.DctDecodeAbbreviation.Data, dct}, {NameToken.FlateDecode.Data, flate}, {NameToken.FlateDecodeAbbreviation.Data, flate}, + {NameToken.Jbig2Decode.Data, jbig2}, + {NameToken.JpxDecode.Data, jpx}, {NameToken.RunLengthDecode.Data, runLength}, {NameToken.RunLengthDecodeAbbreviation.Data, runLength}, {NameToken.LzwDecode, lzw}, @@ -64,6 +75,23 @@ throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name or array, instead got: {token}."); } } + + public IReadOnlyList GetNamedFilters(IReadOnlyList names) + { + if (names == null) + { + throw new ArgumentNullException(nameof(names)); + } + + var result = new List(); + + foreach (var name in names) + { + result.Add(GetFilterStrict(name)); + } + + return result; + } private IFilter GetFilterStrict(string name) { @@ -77,7 +105,7 @@ public IReadOnlyList GetAllFilters() { - throw new System.NotImplementedException(); + return filterInstances.Values.Distinct().ToList(); } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs index 337e6f10..2f07cd85 100644 --- a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs +++ b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs @@ -6,6 +6,8 @@ using Colors; using Content; using Core; + using Exceptions; + using Filters; using Fonts; using Geometry; using IO; @@ -19,27 +21,43 @@ internal class ContentStreamProcessor : IOperationContext { + /// + /// Stores each letter as it is encountered in the content stream. + /// + private readonly List letters = new List(); + + /// + /// Stores each path as it is encountered in the content stream. + /// private readonly List paths = new List(); + + /// + /// Stores a link to each image (either inline or XObject) as it is encountered in the content stream. 
+ /// + private readonly List> images = new List>(); + private readonly IResourceStore resourceStore; private readonly UserSpaceUnit userSpaceUnit; private readonly PageRotationDegrees rotation; private readonly bool isLenientParsing; private readonly IPdfTokenScanner pdfScanner; - private readonly XObjectFactory xObjectFactory; + private readonly IFilterProvider filterProvider; private readonly ILog log; private Stack graphicsStack = new Stack(); - private IFont activeExtendedGraphicsStateFont = null; + private IFont activeExtendedGraphicsStateFont; + private InlineImageBuilder inlineImageBuilder; - //a sequence number of ShowText operation to determine whether letters belong to same operation or not (letters that belong to different operations have less changes to belong to same word) - private int textSequence = 0; + /// + /// A counter to track individual calls to operations used to determine if letters are likely to be + /// in the same word/group. This exposes internal grouping of letters used by the PDF creator which may correspond to the + /// intended grouping of letters into words. 
+ /// + private int textSequence; public TextMatrices TextMatrices { get; } = new TextMatrices(); - public TransformationMatrix CurrentTransformationMatrix - { - get { return GetCurrentState().CurrentTransformationMatrix; } - } + public TransformationMatrix CurrentTransformationMatrix => GetCurrentState().CurrentTransformationMatrix; public PdfPath CurrentPath { get; private set; } @@ -56,18 +74,18 @@ {XObjectType.PostScript, new List()} }; - public List Letters = new List(); - public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore, UserSpaceUnit userSpaceUnit, PageRotationDegrees rotation, bool isLenientParsing, + public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore, UserSpaceUnit userSpaceUnit, PageRotationDegrees rotation, + bool isLenientParsing, IPdfTokenScanner pdfScanner, - XObjectFactory xObjectFactory, + IFilterProvider filterProvider, ILog log) { this.resourceStore = resourceStore; this.userSpaceUnit = userSpaceUnit; this.rotation = rotation; this.isLenientParsing = isLenientParsing; - this.pdfScanner = pdfScanner; - this.xObjectFactory = xObjectFactory; + this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner)); + this.filterProvider = filterProvider ?? 
throw new ArgumentNullException(nameof(filterProvider)); this.log = log; graphicsStack.Push(new CurrentGraphicsState()); ColorSpaceContext = new ColorSpaceContext(GetCurrentState); @@ -75,11 +93,11 @@ public PageContent Process(IReadOnlyList operations) { - var currentState = CloneAllStates(); + CloneAllStates(); ProcessOperations(operations); - return new PageContent(operations, Letters, paths, xObjects, pdfScanner, xObjectFactory, isLenientParsing); + return new PageContent(operations, letters, paths, images, pdfScanner, filterProvider, resourceStore, isLenientParsing); } private void ProcessOperations(IReadOnlyList operations) @@ -265,7 +283,7 @@ var xObjectStream = resourceStore.GetXObject(xObjectName); // For now we will determine the type and store the object with the graphics state information preceding it. - // Then consumers of the page can request the object/s to be retrieved by type. + // Then consumers of the page can request the object(s) to be retrieved by type. var subType = (NameToken)xObjectStream.StreamDictionary.Data[NameToken.Subtype.Data]; var state = GetCurrentState(); @@ -274,15 +292,15 @@ if (subType.Equals(NameToken.Ps)) { - xObjects[XObjectType.PostScript].Add(new XObjectContentRecord(XObjectType.PostScript, xObjectStream, matrix)); + xObjects[XObjectType.PostScript].Add(new XObjectContentRecord(XObjectType.PostScript, xObjectStream, matrix, state.RenderingIntent)); } else if (subType.Equals(NameToken.Image)) { - xObjects[XObjectType.Image].Add(new XObjectContentRecord(XObjectType.Image, xObjectStream, matrix)); + images.Add(Union.One(new XObjectContentRecord(XObjectType.Image, xObjectStream, matrix, state.RenderingIntent))); } else if (subType.Equals(NameToken.Form)) { - xObjects[XObjectType.Form].Add(new XObjectContentRecord(XObjectType.Form, xObjectStream, matrix)); + xObjects[XObjectType.Form].Add(new XObjectContentRecord(XObjectType.Form, xObjectStream, matrix, state.RenderingIntent)); } else { @@ -361,6 +379,52 @@ } } + public void 
BeginInlineImage() + { + if (inlineImageBuilder != null && !isLenientParsing) + { + throw new PdfDocumentFormatException("Begin inline image (BI) command encountered while another inline image was active."); + } + + inlineImageBuilder = new InlineImageBuilder(); + } + + public void SetInlineImageProperties(IReadOnlyDictionary properties) + { + if (inlineImageBuilder == null) + { + if (isLenientParsing) + { + return; + } + + throw new PdfDocumentFormatException("Begin inline image data (ID) command encountered without a corresponding begin inline image (BI) command."); + } + + inlineImageBuilder.Properties = properties; + } + + public void EndInlineImage(IReadOnlyList bytes) + { + if (inlineImageBuilder == null) + { + if (isLenientParsing) + { + return; + } + + throw new PdfDocumentFormatException("End inline image (EI) command encountered without a corresponding begin inline image (BI) command."); + } + + inlineImageBuilder.Bytes = bytes; + + var image = inlineImageBuilder.CreateInlineImage(CurrentTransformationMatrix, filterProvider, pdfScanner, GetCurrentState().RenderingIntent, resourceStore); + + images.Add(Union.Two(image)); + + inlineImageBuilder = null; + } + private void AdjustTextMatrix(decimal tx, decimal ty) { var matrix = TransformationMatrix.GetTranslationMatrix(tx, ty); @@ -390,7 +454,7 @@ pointSize, textSequence); - Letters.Add(letter); + letters.Add(letter); } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Graphics/IOperationContext.cs b/src/UglyToad.PdfPig/Graphics/IOperationContext.cs index b77a415a..24c55d8c 100644 --- a/src/UglyToad.PdfPig/Graphics/IOperationContext.cs +++ b/src/UglyToad.PdfPig/Graphics/IOperationContext.cs @@ -4,7 +4,7 @@ using Geometry; using IO; using Tokens; - using UglyToad.PdfPig.Core; + using PdfPig.Core; using Util.JetBrains.Annotations; /// @@ -104,5 +104,20 @@ /// /// The name of the state to apply. 
void SetNamedGraphicsState(NameToken stateName); + + /// + /// Indicate that an inline image is being defined. + /// + void BeginInlineImage(); + + /// + /// Define the properties of the inline image currently being drawn. + /// + void SetInlineImageProperties(IReadOnlyDictionary properties); + + /// + /// Indicates that the current inline image is complete. + /// + void EndInlineImage(IReadOnlyList bytes); } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs b/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs new file mode 100644 index 00000000..0eee843e --- /dev/null +++ b/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs @@ -0,0 +1,213 @@ +namespace UglyToad.PdfPig.Graphics +{ + using System; + using System.Collections.Generic; + using System.Linq; + using Colors; + using Content; + using Core; + using Exceptions; + using Filters; + using Geometry; + using PdfPig.Core; + using Tokenization.Scanner; + using Tokens; + using Util; + + internal class InlineImageBuilder + { + public IReadOnlyDictionary Properties { get; set; } + + public IReadOnlyList Bytes { get; set; } + + public InlineImage CreateInlineImage(TransformationMatrix transformationMatrix, IFilterProvider filterProvider, + IPdfTokenScanner tokenScanner, + RenderingIntent defaultRenderingIntent, + IResourceStore resourceStore) + { + if (Properties == null || Bytes == null) + { + throw new InvalidOperationException($"Inline image builder not completely defined before calling {nameof(CreateInlineImage)}."); + } + + bool TryMapColorSpace(NameToken name, out ColorSpace colorSpaceResult) + { + if (name.TryMapToColorSpace(out colorSpaceResult)) + { + return true; + } + + if (TryExtendedColorSpaceNameMapping(name, out colorSpaceResult)) + { + return true; + } + + if (!resourceStore.TryGetNamedColorSpace(name, out var colorSpaceNamedToken) || !(colorSpaceNamedToken is NameToken newName)) + { + return false; + } + + if (newName.TryMapToColorSpace(out colorSpaceResult)) 
+ { + return true; + } + + if (TryExtendedColorSpaceNameMapping(newName, out colorSpaceResult)) + { + return true; + } + + return false; + } + + var bounds = transformationMatrix.Transform(new PdfRectangle(new PdfPoint(1, 1), + new PdfPoint(0, 0))); + + var width = GetByKeys(NameToken.Width, NameToken.W, true).Int; + + var height = GetByKeys(NameToken.Height, NameToken.H, true).Int; + + var maskToken = GetByKeys(NameToken.ImageMask, NameToken.Im, false); + + var isMask = maskToken?.Data == true; + + var bitsPerComponent = GetByKeys(NameToken.BitsPerComponent, NameToken.Bpc, !isMask)?.Int ?? 1; + + var colorSpace = default(ColorSpace?); + + if (!isMask) + { + var colorSpaceName = GetByKeys(NameToken.ColorSpace, NameToken.Cs, false); + + if (colorSpaceName == null) + { + var colorSpaceArray = GetByKeys(NameToken.ColorSpace, NameToken.Cs, true); + + if (colorSpaceArray.Length == 0) + { + throw new PdfDocumentFormatException("Empty ColorSpace array defined for inline image."); + } + + if (!(colorSpaceArray.Data[0] is NameToken firstColorSpaceName)) + { + throw new PdfDocumentFormatException($"Invalid ColorSpace array defined for inline image: {colorSpaceArray}."); + } + + if (!TryMapColorSpace(firstColorSpaceName, out var colorSpaceMapped)) + { + throw new PdfDocumentFormatException($"Invalid ColorSpace defined for inline image: {firstColorSpaceName}."); + } + + colorSpace = colorSpaceMapped; + } + else + { + if (!TryMapColorSpace(colorSpaceName, out var colorSpaceMapped)) + { + throw new PdfDocumentFormatException($"Invalid ColorSpace defined for inline image: {colorSpaceName}."); + } + + colorSpace = colorSpaceMapped; + } + } + + var renderingIntent = GetByKeys(NameToken.Intent, null, false)?.Data?.ToRenderingIntent() ?? 
defaultRenderingIntent; + + var filterNames = new List(); + + var filterName = GetByKeys(NameToken.Filter, NameToken.F, false); + + if (filterName == null) + { + var filterArray = GetByKeys(NameToken.Filter, NameToken.F, false); + + if (filterArray != null) + { + filterNames.AddRange(filterArray.Data.OfType()); + } + } + else + { + filterNames.Add(filterName); + } + + var filters = filterProvider.GetNamedFilters(filterNames); + + var decodeRaw = GetByKeys(NameToken.Decode, NameToken.D, false) ?? new ArrayToken(EmptyArray.Instance); + + var decode = decodeRaw.Data.OfType().Select(x => x.Data).ToArray(); + + var filterDictionaryEntries = new Dictionary(); + var decodeParamsDict = GetByKeys(NameToken.DecodeParms, NameToken.Dp, false); + + if (decodeParamsDict == null) + { + var decodeParamsArray = GetByKeys(NameToken.DecodeParms, NameToken.Dp, false); + + if (decodeParamsArray != null) + { + filterDictionaryEntries[NameToken.DecodeParms] = decodeParamsArray; + } + } + else + { + filterDictionaryEntries[NameToken.DecodeParms] = decodeParamsDict; + } + + var streamDictionary = new DictionaryToken(filterDictionaryEntries); + + var interpolate = GetByKeys(NameToken.Interpolate, NameToken.I, false)?.Data ?? 
false; + + return new InlineImage(bounds, width, height, bitsPerComponent, isMask, renderingIntent, interpolate, colorSpace, decode, Bytes, + filters, + streamDictionary); + } + + private static bool TryExtendedColorSpaceNameMapping(NameToken name, out ColorSpace result) + { + result = ColorSpace.DeviceGray; + + switch (name.Data) + { + case "G": + result = ColorSpace.DeviceGray; + return true; + case "RGB": + result = ColorSpace.DeviceRGB; + return true; + case "CMYK": + result = ColorSpace.DeviceCMYK; + return true; + case "I": + result = ColorSpace.Indexed; + return true; + } + + return false; + } + + // ReSharper disable once ParameterOnlyUsedForPreconditionCheck.Local + private T GetByKeys(NameToken name1, NameToken name2, bool required) where T : IToken + { + if (Properties.TryGetValue(name1, out var val) && val is T result) + { + return result; + } + + if (name2 != null) + { + if (Properties.TryGetValue(name2, out val) && val is T result2) + { + return result2; + } + } + + if (required) + { + throw new PdfDocumentFormatException($"Inline image dictionary missing required entry {name1}/{name2}."); + } + + return default(T); + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/BeginInlineImage.cs b/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/BeginInlineImage.cs index d46f917c..c30667e8 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/BeginInlineImage.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/BeginInlineImage.cs @@ -28,6 +28,7 @@ /// public void Run(IOperationContext operationContext) { + operationContext.BeginInlineImage(); } /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/BeginInlineImageData.cs b/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/BeginInlineImageData.cs index e21e1031..06a4bfaa 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/BeginInlineImageData.cs +++ 
b/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/BeginInlineImageData.cs @@ -1,6 +1,9 @@ namespace UglyToad.PdfPig.Graphics.Operations.InlineImages { + using System; + using System.Collections.Generic; using System.IO; + using Tokens; /// /// @@ -12,22 +15,27 @@ /// The symbol for this operation in a stream. /// public const string Symbol = "ID"; - - /// - /// The instance of the operation. - /// - public static readonly BeginInlineImageData Value = new BeginInlineImageData(); - + /// public string Operator => Symbol; - private BeginInlineImageData() + /// + /// The key-value pairs which specify attributes of the following image. + /// + public IReadOnlyDictionary Dictionary { get; } + + /// + /// Create a new . + /// + public BeginInlineImageData(IReadOnlyDictionary dictionary) { + Dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary)); } /// public void Run(IOperationContext operationContext) { + operationContext.SetInlineImageProperties(Dictionary); } /// diff --git a/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/EndInlineImage.cs b/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/EndInlineImage.cs index f6b7ceb3..206af990 100644 --- a/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/EndInlineImage.cs +++ b/src/UglyToad.PdfPig/Graphics/Operations/InlineImages/EndInlineImage.cs @@ -3,7 +3,6 @@ using System; using System.Collections.Generic; using System.IO; - using Tokens; /// /// @@ -15,14 +14,9 @@ /// The symbol for this operation in a stream. /// public const string Symbol = "EI"; - + /// - /// The tokens declared in order for this inline image object. - /// - public IReadOnlyList ImageTokens { get; } - - /// - /// The raw data for the inline image which should be interpreted according to the . + /// The raw data for the inline image which should be interpreted according to the corresponding . /// public IReadOnlyList ImageData { get; } @@ -32,17 +26,16 @@ /// /// Create a new operation. 
/// - /// The tokens which were set during the declaration of this image. /// The raw byte data of this image. - public EndInlineImage(IReadOnlyList imageTokens, IReadOnlyList imageData) + public EndInlineImage(IReadOnlyList imageData) { - ImageTokens = imageTokens ?? throw new ArgumentNullException(nameof(imageTokens)); ImageData = imageData ?? throw new ArgumentNullException(nameof(imageData)); } /// public void Run(IOperationContext operationContext) { + operationContext.EndInlineImage(ImageData); } /// diff --git a/src/UglyToad.PdfPig/Graphics/XObjectContentRecord.cs b/src/UglyToad.PdfPig/Graphics/XObjectContentRecord.cs index af132c31..1d3c820f 100644 --- a/src/UglyToad.PdfPig/Graphics/XObjectContentRecord.cs +++ b/src/UglyToad.PdfPig/Graphics/XObjectContentRecord.cs @@ -1,6 +1,7 @@ namespace UglyToad.PdfPig.Graphics { using System; + using Core; using PdfPig.Core; using Tokens; using Util.JetBrains.Annotations; @@ -15,11 +16,15 @@ public TransformationMatrix AppliedTransformation { get; } - public XObjectContentRecord(XObjectType type, StreamToken stream, TransformationMatrix appliedTransformation) + public RenderingIntent DefaultRenderingIntent { get; } + + public XObjectContentRecord(XObjectType type, StreamToken stream, TransformationMatrix appliedTransformation, + RenderingIntent defaultRenderingIntent) { Type = type; Stream = stream ?? 
throw new ArgumentNullException(nameof(stream)); AppliedTransformation = appliedTransformation; + DefaultRenderingIntent = defaultRenderingIntent; } } } diff --git a/src/UglyToad.PdfPig/Parser/PageContentParser.cs b/src/UglyToad.PdfPig/Parser/PageContentParser.cs index a777a0be..dfe2b0eb 100644 --- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs +++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs @@ -30,8 +30,23 @@ if (token is InlineImageDataToken inlineImageData) { - graphicsStateOperations.Add(BeginInlineImageData.Value); - graphicsStateOperations.Add(new EndInlineImage(precedingTokens, inlineImageData.Data)); + var dictionary = new Dictionary(); + + for (var i = 0; i < precedingTokens.Count - 1; i++) + { + var t = precedingTokens[i]; + if (!(t is NameToken n)) + { + continue; + } + + i++; + + dictionary[n] = precedingTokens[i]; + } + + graphicsStateOperations.Add(new BeginInlineImageData(dictionary)); + graphicsStateOperations.Add(new EndInlineImage(inlineImageData.Data)); precedingTokens.Clear(); } else if (token is OperatorToken op) diff --git a/src/UglyToad.PdfPig/Parser/PageFactory.cs b/src/UglyToad.PdfPig/Parser/PageFactory.cs index 05c8e6e1..7d717f24 100644 --- a/src/UglyToad.PdfPig/Parser/PageFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PageFactory.cs @@ -14,7 +14,6 @@ using Tokenization.Scanner; using Tokens; using Util; - using XObjects; internal class PageFactory : IPageFactory { @@ -22,18 +21,15 @@ private readonly IResourceStore resourceStore; private readonly IFilterProvider filterProvider; private readonly IPageContentParser pageContentParser; - private readonly XObjectFactory xObjectFactory; private readonly ILog log; public PageFactory(IPdfTokenScanner pdfScanner, IResourceStore resourceStore, IFilterProvider filterProvider, IPageContentParser pageContentParser, - XObjectFactory xObjectFactory, ILog log) { this.resourceStore = resourceStore; this.filterProvider = filterProvider; this.pageContentParser = pageContentParser; - 
this.xObjectFactory = xObjectFactory; this.log = log; this.pdfScanner = pdfScanner; } @@ -125,7 +121,7 @@ { var operations = pageContentParser.Parse(new ByteArrayInputBytes(contentBytes)); - var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, isLenientParsing, pdfScanner, xObjectFactory, log); + var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, isLenientParsing, pdfScanner, filterProvider, log); return context.Process(operations); } diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs index e5f7dbaa..79d8ed0c 100644 --- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs @@ -26,7 +26,6 @@ using Tokenization.Scanner; using Tokens; using Util; - using XObjects; internal static class PdfDocumentFactory { @@ -123,11 +122,11 @@ new Type1FontParser(new Type1EncryptedPortionParser()), compactFontFormatParser), new Type3FontHandler(pdfScanner, cMapCache, filterProvider, encodingReader)); - var resourceContainer = new ResourceContainer(pdfScanner, fontFactory); + var resourceContainer = new ResourceStore(pdfScanner, fontFactory); var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()), - new XObjectFactory(), log); + log); var informationFactory = new DocumentInformationFactory(); var information = informationFactory.Create(pdfScanner, crossReferenceTable.Trailer); diff --git a/src/UglyToad.PdfPig/PdfDocument.cs b/src/UglyToad.PdfPig/PdfDocument.cs index 146387aa..bc6b9ea6 100644 --- a/src/UglyToad.PdfPig/PdfDocument.cs +++ b/src/UglyToad.PdfPig/PdfDocument.cs @@ -220,6 +220,7 @@ { try { + pdfScanner.Dispose(); inputBytes.Dispose(); } catch (Exception ex) diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/IPdfTokenScanner.cs 
b/src/UglyToad.PdfPig/Tokenization/Scanner/IPdfTokenScanner.cs new file mode 100644 index 00000000..79ba1d08 --- /dev/null +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/IPdfTokenScanner.cs @@ -0,0 +1,18 @@ +namespace UglyToad.PdfPig.Tokenization.Scanner +{ + using System; + using Tokens; + + /// + /// Tokenizes objects from bytes in a PDF file. + /// + internal interface IPdfTokenScanner : ISeekableTokenScanner, IDisposable + { + /// + /// Tokenize the object with a given object number. + /// + /// The object number for the object to tokenize. + /// The tokenized object. + ObjectToken Get(IndirectReference reference); + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index 993186fa..28e8f575 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -13,19 +13,6 @@ using Parser.Parts; using Tokens; - /// - /// Tokenizes objects from bytes in a PDF file. - /// - internal interface IPdfTokenScanner : ISeekableTokenScanner - { - /// - /// Tokenize the object with a given object number. - /// - /// The object number for the object to tokenize. - /// The tokenized object. - ObjectToken Get(IndirectReference reference); - } - internal class PdfTokenScanner : IPdfTokenScanner { private static readonly byte[] EndstreamBytes = @@ -41,6 +28,7 @@ private readonly CoreTokenScanner coreTokenScanner; private IEncryptionHandler encryptionHandler; + private bool isDisposed; /// /// Stores tokens encountered between obj - endobj markers for each call. @@ -75,6 +63,11 @@ public bool MoveNext() { + if (isDisposed) + { + throw new ObjectDisposedException(nameof(PdfTokenScanner)); + } + // Read until we find object-number generation obj, e.g. "69 420 obj". 
int tokensRead = 0; while (coreTokenScanner.MoveNext() && !Equals(coreTokenScanner.CurrentToken, OperatorToken.StartObject)) @@ -576,26 +569,51 @@ public bool TryReadToken(out T token) where T : class, IToken { + if (isDisposed) + { + throw new ObjectDisposedException(nameof(PdfTokenScanner)); + } + return coreTokenScanner.TryReadToken(out token); } public void Seek(long position) { + if (isDisposed) + { + throw new ObjectDisposedException(nameof(PdfTokenScanner)); + } + coreTokenScanner.Seek(position); } public void RegisterCustomTokenizer(byte firstByte, ITokenizer tokenizer) { + if (isDisposed) + { + throw new ObjectDisposedException(nameof(PdfTokenScanner)); + } + coreTokenScanner.RegisterCustomTokenizer(firstByte, tokenizer); } public void DeregisterCustomTokenizer(ITokenizer tokenizer) { + if (isDisposed) + { + throw new ObjectDisposedException(nameof(PdfTokenScanner)); + } + coreTokenScanner.DeregisterCustomTokenizer(tokenizer); } public ObjectToken Get(IndirectReference reference) { + if (isDisposed) + { + throw new ObjectDisposedException(nameof(PdfTokenScanner)); + } + if (objectLocationProvider.TryGetCached(reference, out var objectToken)) { return objectToken; @@ -717,5 +735,11 @@ return results; } + + public void Dispose() + { + inputBytes?.Dispose(); + isDisposed = true; + } } } diff --git a/src/UglyToad.PdfPig/Tokens/NameToken.Constants.cs b/src/UglyToad.PdfPig/Tokens/NameToken.Constants.cs index 2054f372..31497d7b 100644 --- a/src/UglyToad.PdfPig/Tokens/NameToken.Constants.cs +++ b/src/UglyToad.PdfPig/Tokens/NameToken.Constants.cs @@ -114,7 +114,7 @@ public static readonly NameToken ColorDodge = new NameToken("ColorDodge"); public static readonly NameToken Colorants = new NameToken("Colorants"); public static readonly NameToken Colors = new NameToken("Colors"); - public static readonly NameToken Colorspace = new NameToken("ColorSpace"); + public static readonly NameToken ColorSpace = new NameToken("ColorSpace"); public static readonly NameToken 
Columns = new NameToken("Columns"); public static readonly NameToken Compatible = new NameToken("Compatible"); public static readonly NameToken Components = new NameToken("Components"); @@ -272,6 +272,7 @@ public static readonly NameToken Info = new NameToken("Info"); public static readonly NameToken Ink = new NameToken("Ink"); public static readonly NameToken Inklist = new NameToken("InkList"); + public static readonly NameToken Intent = new NameToken("Intent"); public static readonly NameToken Interpolate = new NameToken("Interpolate"); public static readonly NameToken It = new NameToken("IT"); public static readonly NameToken ItalicAngle = new NameToken("ItalicAngle"); diff --git a/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj b/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj index 8c3452a3..77bc61af 100644 --- a/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj +++ b/src/UglyToad.PdfPig/UglyToad.PdfPig.csproj @@ -23,6 +23,11 @@ $(AllowedOutputExtensionsInPackageBuildOutputFolder);.pdb + + true + + + diff --git a/src/UglyToad.PdfPig/Util/OtherEncodings.cs b/src/UglyToad.PdfPig/Util/OtherEncodings.cs index 8100e5bd..7ca578ad 100644 --- a/src/UglyToad.PdfPig/Util/OtherEncodings.cs +++ b/src/UglyToad.PdfPig/Util/OtherEncodings.cs @@ -1,5 +1,7 @@ namespace UglyToad.PdfPig.Util { + using System.Collections.Generic; + using System.Linq; using System.Text; internal static class OtherEncodings @@ -19,6 +21,16 @@ return Iso88591.GetBytes(s); } + public static string BytesAsLatin1String(IReadOnlyList bytes) + { + if (bytes is byte[] arr) + { + return BytesAsLatin1String(arr); + } + + return BytesAsLatin1String(bytes.ToArray()); + } + public static string BytesAsLatin1String(byte[] bytes) { if (bytes == null) diff --git a/src/UglyToad.PdfPig/XObjects/XObjectFactory.cs b/src/UglyToad.PdfPig/XObjects/XObjectFactory.cs index f07924ca..c8ef0120 100644 --- a/src/UglyToad.PdfPig/XObjects/XObjectFactory.cs +++ b/src/UglyToad.PdfPig/XObjects/XObjectFactory.cs @@ -1,13 +1,25 @@ namespace 
UglyToad.PdfPig.XObjects { using System; + using System.Collections.Generic; + using System.Linq; + using Content; + using Exceptions; + using Filters; + using Geometry; using Graphics; + using Graphics.Colors; + using Graphics.Core; using Tokenization.Scanner; using Tokens; + using Util; - internal class XObjectFactory + internal static class XObjectFactory { - public XObjectImage CreateImage(XObjectContentRecord xObject, IPdfTokenScanner pdfScanner, bool isLenientParsing) + public static XObjectImage ReadImage(XObjectContentRecord xObject, IPdfTokenScanner pdfScanner, + IFilterProvider filterProvider, + IResourceStore resourceStore, + bool isLenientParsing) { if (xObject == null) { @@ -19,19 +31,103 @@ throw new InvalidOperationException($"Cannot create an image from an XObject with type: {xObject.Type}."); } - var width = xObject.Stream.StreamDictionary.Get(NameToken.Width, pdfScanner).Int; - var height = xObject.Stream.StreamDictionary.Get(NameToken.Height, pdfScanner).Int; + var dictionary = xObject.Stream.StreamDictionary; - var isJpxDecode = xObject.Stream.StreamDictionary.TryGet(NameToken.Filter, out var token) + var bounds = xObject.AppliedTransformation.Transform(new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(1, 1))); + + var width = dictionary.Get(NameToken.Width, pdfScanner).Int; + var height = dictionary.Get(NameToken.Height, pdfScanner).Int; + + var isImageMask = dictionary.TryGet(NameToken.ImageMask, pdfScanner, out BooleanToken isMaskToken) + && isMaskToken.Data; + + var isJpxDecode = dictionary.TryGet(NameToken.Filter, out var token) && token is NameToken filterName && filterName.Equals(NameToken.JpxDecode); - - var isImageMask = xObject.Stream.StreamDictionary.TryGet(NameToken.ImageMask, out var maskToken) - && maskToken is BooleanToken maskBoolean - && maskBoolean.Data; - return new XObjectImage(width, height, isJpxDecode, isImageMask, xObject.Stream.StreamDictionary, - xObject.Stream.Data); + int bitsPerComponent = 0; + if (!isImageMask && 
!isJpxDecode) + { + if (!dictionary.TryGet(NameToken.BitsPerComponent, pdfScanner, out NumericToken bitsPerComponentToken)) + { + throw new PdfDocumentFormatException($"No bits per component defined for image: {dictionary}."); + } + + bitsPerComponent = bitsPerComponentToken.Int; + } + else if (isImageMask) + { + bitsPerComponent = 1; + } + + var intent = xObject.DefaultRenderingIntent; + if (dictionary.TryGet(NameToken.Intent, out NameToken renderingIntentToken)) + { + intent = renderingIntentToken.Data.ToRenderingIntent(); + } + + var interpolate = dictionary.TryGet(NameToken.Interpolate, pdfScanner, out BooleanToken interpolateToken) + && interpolateToken.Data; + + var decodedBytes = new Lazy>(() => xObject.Stream.Decode(filterProvider)); + + var decode = EmptyArray.Instance; + + if (dictionary.TryGet(NameToken.Decode, pdfScanner, out ArrayToken decodeArrayToken)) + { + decode = decodeArrayToken.Data.OfType() + .Select(x => x.Data) + .ToArray(); + } + + var colorSpace = default(ColorSpace?); + + if (!isImageMask) + { + if (dictionary.TryGet(NameToken.ColorSpace, pdfScanner, out NameToken colorSpaceNameToken) + && TryMapColorSpace(colorSpaceNameToken, resourceStore, out var colorSpaceResult)) + { + colorSpace = colorSpaceResult; + } + else if (dictionary.TryGet(NameToken.ColorSpace, pdfScanner, out ArrayToken colorSpaceArrayToken)) + { + if (colorSpaceArrayToken.Length == 0) + { + throw new PdfDocumentFormatException($"Empty ColorSpace array defined for image XObject: {dictionary}."); + } + + var first = colorSpaceArrayToken.Data[0]; + + if (!(first is NameToken firstColorSpaceName) || !TryMapColorSpace(firstColorSpaceName, resourceStore, out colorSpaceResult)) + { + throw new PdfDocumentFormatException($"Invalid ColorSpace array defined for image XObject: {colorSpaceArrayToken}."); + } + + colorSpace = colorSpaceResult; + } + else if (!isJpxDecode) + { + throw new PdfDocumentFormatException($"No ColorSpace defined for image XObject: {dictionary}."); + } + } + + 
return new XObjectImage(bounds, width, height, bitsPerComponent, colorSpace, isJpxDecode, isImageMask, intent, interpolate, decode, + dictionary, xObject.Stream.Data, decodedBytes); + } + + private static bool TryMapColorSpace(NameToken name, IResourceStore resourceStore, out ColorSpace colorSpaceResult) + { + if (name.TryMapToColorSpace(out colorSpaceResult)) + { + return true; + } + + if (!resourceStore.TryGetNamedColorSpace(name, out var colorSpaceNamedToken) || !(colorSpaceNamedToken is NameToken newName)) + { + return false; + } + + return newName.TryMapToColorSpace(out colorSpaceResult); } } } diff --git a/src/UglyToad.PdfPig/XObjects/XObjectImage.cs b/src/UglyToad.PdfPig/XObjects/XObjectImage.cs index 37aea041..625abf8a 100644 --- a/src/UglyToad.PdfPig/XObjects/XObjectImage.cs +++ b/src/UglyToad.PdfPig/XObjects/XObjectImage.cs @@ -2,23 +2,35 @@ { using System; using System.Collections.Generic; + using Content; + using Geometry; + using Graphics.Colors; + using Graphics.Core; using Tokens; using Util.JetBrains.Annotations; + /// /// - /// The raw stream from a PDF document representing an image XObject. + /// A PostScript image XObject. /// - public class XObjectImage + public class XObjectImage : IPdfImage { - /// - /// The width of the image in samples. - /// - public int Width { get; } + private readonly Lazy> bytes; - /// - /// The height of the image in samples. - /// - public int Height { get; } + /// + public PdfRectangle Bounds { get; } + + /// + public int WidthInSamples { get; } + + /// + public int HeightInSamples { get; } + + /// + public ColorSpace? ColorSpace { get; } + + /// + public int BitsPerComponent { get; } /// /// The JPX filter encodes data using the JPEG2000 compression method. @@ -27,41 +39,67 @@ /// public bool IsJpxEncoded { get; } - /// - /// Whether this image should be treated as an image maske. 
- /// + /// + public RenderingIntent RenderingIntent { get; } + + /// public bool IsImageMask { get; } + /// + public IReadOnlyList Decode { get; } + + /// + public bool Interpolate { get; } + + /// + public bool IsInlineImage { get; } = false; + /// /// The full dictionary for this Image XObject. /// [NotNull] public DictionaryToken ImageDictionary { get; } - /// - /// The encoded bytes of this image, must be decoded via any - /// filters defined in the prior to consumption. - /// - [NotNull] - public IReadOnlyList Bytes { get; } + /// + public IReadOnlyList RawBytes { get; } + /// + [NotNull] + public IReadOnlyList Bytes => bytes.Value; + /// /// Creates a new . /// - internal XObjectImage(int width, int height, bool isJpxEncoded, bool isImageMask, DictionaryToken imageDictionary, IReadOnlyList bytes) + internal XObjectImage(PdfRectangle bounds, int widthInSamples, int heightInSamples, int bitsPerComponent, + ColorSpace? colorSpace, + bool isJpxEncoded, + bool isImageMask, + RenderingIntent renderingIntent, + bool interpolate, + IReadOnlyList decode, + DictionaryToken imageDictionary, + IReadOnlyList rawBytes, + Lazy> bytes) { - Width = width; - Height = height; + Bounds = bounds; + WidthInSamples = widthInSamples; + HeightInSamples = heightInSamples; + BitsPerComponent = bitsPerComponent; + ColorSpace = colorSpace; IsJpxEncoded = isJpxEncoded; IsImageMask = isImageMask; + RenderingIntent = renderingIntent; + Interpolate = interpolate; + Decode = decode; ImageDictionary = imageDictionary ?? throw new ArgumentNullException(nameof(imageDictionary)); - Bytes = bytes ?? throw new ArgumentNullException(nameof(bytes)); + RawBytes = rawBytes; + this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes)); } /// public override string ToString() { - return ImageDictionary.ToString(); + return $"XObject Image (w {Bounds.Width}, h {Bounds.Height}): {ImageDictionary}"; } } }