diff --git a/font-notes.md b/font-notes.md new file mode 100644 index 00000000..a3a32c4d --- /dev/null +++ b/font-notes.md @@ -0,0 +1,118 @@ +# Fonts # + +## Types of Font ## + +

+
+------	Composite Fonts -------
+
+	Type0 (Composed of glyphs from a CIDFont)
+
+		Children:
+		
+		CIDFont 	CIDFontType0	(Type 1 font glyph descriptions)
+				CIDFontType2	(TrueType font glyph descriptions)
+	
+------	Simple Fonts Below -------
+
+	Type 1	Type 1 (defines glyphs using type 1 font technology)
+		MMType1 (multiple master font - extends type 1 fonts to support many typefaces for a single font)
+
+	Type 3	(defines glyphs with streams of PDF graphics operations)
+	
+	TrueType (from the TrueType font format)
+
+
+ + +## Terminology ## + ++ Font dictionary: PDF dictionary with information about the font ++ Font program: Glyph information in specialized font format + +## Composite Fonts ## + ++ Glyphs are selected from a font-like CIDFont. ++ Has a single CIDFont descendant. ++ Multiple-byte sequences select a single glyph. + +Used for multiple-byte character encodings and large numbers of glyphs. + +Well suited to Chinese, Japanese and Korean (CJK). + +CID stands for character identifier. This is a number used to access glyph descriptions. + +The CMap maps between character codes and CID numbers for the glyphs. + +A CIDFont file provides the glyph descriptions for a character collection. The glyph descriptions are +identified by CIDs. + +CID keyed font combines a CMap with a CIDFont. + +The **Encoding** contains the CMap. +The **DescendantFonts** contains the CIDFont to use with the CMap. + +### CIDFont ### + +A Type0 font descendant (CIDFont) must be either a CIDFontType0 (Adobe Type 1) or CIDFontType2 (TrueType). + +For Type 2 CIDFonts (TrueType) the glyphs are identified by a glyph index (GID). + ++ If the font program is embedded as a stream the CIDFont dictionary must contain a CIDToGIDMap which maps +from CIDs to Glyph Indexes. + ++ If the font program is a predefined external font the CIDFont must not contain a CIDToGIDMap. It +may only use a predefined CMap. + +Though a CID may not be used to select the glyph as in the predefined case, it is always used to select glyph +metrics. Every CIDFont must describe CID 0 which is the ```.notdef``` character for missing characters. + +### Glyph Metrics in CIDFonts ### + +Widths for CIDFonts are defined in the DW and W entries in the CIDFont dictionary. + ++ DW provides the default width for glyphs which are not specified individually. ++ W defines widths for individual CIDs. + +Vertical writing has other stuff, see the spec. + +### CMap ### + +The CMap maps from character codes to character selectors (CIDs). 
+ +The CMap defines the writing mode horizontal or vertical. + +### Type 0 Fonts ### + +The **Font dictionary** has the following entries: + ++ Type (name): /Font ++ Subtype (name): /Type0 ++ BaseFont (name): The PostScript name of the font. ++ Encoding (name/stream R): Name of a predefined CMap or a stream for an embedded CMap. ++ DescendantFonts (array): Single element pointing to the CIDFont. ++ ToUnicode (stream R)?: Stream containing a CMap file to map codes to Unicode. + +## Simple Fonts ## + ++ Glyphs are selected by single-byte character codes. Index into a 256 entry glyph table. ++ Only supports horizontal writing mode. + +## Further Description ## + +### Type 1 Fonts ### + +The **Font program** is a PostScript program describing glyph shape. See the Adobe Type 1 Font Format specification. + +The **Font dictionary** has the following entries: + ++ Type (name): /Font ++ Subtype (name): /Type1 ++ Name (name?): Font name ++ BaseFont (name): The PostScript name of the font. Equivalent to the FontName value in the **Font program**. ++ FirstChar (int): The first character code in the Widths array. ++ LastChar (int) The last character code in the Widths array. ++ Widths (numeric[] R): An array defining the glyph width in units of 1000 == 1 text space unit. ++ FontDescriptor (Dict<> R): Describes font metrics other than widths. ++ Encoding (name/Dict<> R): Specifies the character encoding if different from default. ++ ToUnicode (stream R): CMap mapping character code to Unicode. 
\ No newline at end of file diff --git a/src/UglyToad.Pdf.Tests/Graphics/TestOperationContext.cs b/src/UglyToad.Pdf.Tests/Graphics/TestOperationContext.cs index 0f514d6d..386e17e0 100644 --- a/src/UglyToad.Pdf.Tests/Graphics/TestOperationContext.cs +++ b/src/UglyToad.Pdf.Tests/Graphics/TestOperationContext.cs @@ -2,6 +2,7 @@ { using System.Collections.Generic; using Content; + using ContentStream; using IO; using Pdf.Cos; using Pdf.Fonts; @@ -44,6 +45,10 @@ internal class TestResourceStore : IResourceStore { + public void LoadResourceDictionary(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing) + { + } + public IFont GetFont(CosName name) { return null; diff --git a/src/UglyToad.Pdf.Tests/Parser/PageContentParserTests.cs b/src/UglyToad.Pdf.Tests/Parser/PageContentParserTests.cs index 89726869..d447f0ab 100644 --- a/src/UglyToad.Pdf.Tests/Parser/PageContentParserTests.cs +++ b/src/UglyToad.Pdf.Tests/Parser/PageContentParserTests.cs @@ -13,15 +13,14 @@ public class PageContentParserTests { - private readonly PageContentParser parser = new PageContentParser(); - private readonly IGraphicsStateOperationFactory operationFactory = new ReflectionGraphicsStateOperationFactory(); + private readonly PageContentParser parser = new PageContentParser(new ReflectionGraphicsStateOperationFactory()); [Fact] public void CorrectlyExtractsOperations() { var input = StringBytesTestConverter.Convert(SimpleGoogleDocPageContent, false); - var result = parser.Parse(new ReflectionGraphicsStateOperationFactory(), input.Bytes); + var result = parser.Parse(input.Bytes); } [Fact] @@ -36,7 +35,7 @@ ET"; var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(operationFactory, input.Bytes); + var result = parser.Parse(input.Bytes); Assert.Equal(7, result.Count); @@ -72,7 +71,7 @@ ET"; var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(operationFactory, input.Bytes); + var result = parser.Parse(input.Bytes); 
Assert.Equal(4, result.Count); diff --git a/src/UglyToad.Pdf.Tests/Tokenization/DictionaryTokenizerTests.cs b/src/UglyToad.Pdf.Tests/Tokenization/DictionaryTokenizerTests.cs index 1e10b5fe..def8876b 100644 --- a/src/UglyToad.Pdf.Tests/Tokenization/DictionaryTokenizerTests.cs +++ b/src/UglyToad.Pdf.Tests/Tokenization/DictionaryTokenizerTests.cs @@ -3,6 +3,7 @@ namespace UglyToad.Pdf.Tests.Tokenization { using System; using System.Collections.Generic; + using ContentStream; using Pdf.Cos; using Pdf.Tokenization; using Pdf.Tokenization.Tokens; diff --git a/src/UglyToad.Pdf/Content/IPageFactory.cs b/src/UglyToad.Pdf/Content/IPageFactory.cs new file mode 100644 index 00000000..5663d145 --- /dev/null +++ b/src/UglyToad.Pdf/Content/IPageFactory.cs @@ -0,0 +1,11 @@ +namespace UglyToad.Pdf.Content +{ + using ContentStream; + using IO; + + internal interface IPageFactory + { + Page Create(int number, PdfDictionary dictionary, PageTreeMembers pageTreeMembers, IRandomAccessRead reader, + bool isLenientParsing); + } +} \ No newline at end of file diff --git a/src/UglyToad.Pdf/Content/IResourceStore.cs b/src/UglyToad.Pdf/Content/IResourceStore.cs new file mode 100644 index 00000000..fc48790b --- /dev/null +++ b/src/UglyToad.Pdf/Content/IResourceStore.cs @@ -0,0 +1,14 @@ +namespace UglyToad.Pdf.Content +{ + using ContentStream; + using Cos; + using Fonts; + using IO; + + internal interface IResourceStore + { + void LoadResourceDictionary(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing); + + IFont GetFont(CosName name); + } +} \ No newline at end of file diff --git a/src/UglyToad.Pdf/Content/Page.cs b/src/UglyToad.Pdf/Content/Page.cs index 527fd589..5b2536b0 100644 --- a/src/UglyToad.Pdf/Content/Page.cs +++ b/src/UglyToad.Pdf/Content/Page.cs @@ -2,21 +2,9 @@ { using System; using System.Collections.Generic; - using System.Diagnostics; - using ContentStream; - using Cos; - using Filters; - using Geometry; - using Graphics; - using IO; - using Parser; - 
using Util; public class Page { - private readonly ParsingArguments parsingArguments; - private readonly PdfDictionary dictionary; - /// /// The 1 indexed page number. /// @@ -28,78 +16,16 @@ public IReadOnlyList Text => Content?.Text ?? new string[0]; - internal Page(int number, PdfDictionary dictionary, PageTreeMembers pageTreeMembers, ParsingArguments parsingArguments) + internal Page(int number, MediaBox mediaBox, PageContent content) { if (number <= 0) { throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative."); } - - this.dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary)); - this.parsingArguments = parsingArguments ?? throw new ArgumentNullException(nameof(parsingArguments)); Number = number; - - var type = dictionary.GetName(CosName.TYPE); - - if (type != null && !type.Equals(CosName.PAGE) && !parsingArguments.IsLenientParsing) - { - throw new InvalidOperationException($"Created page number {number} but its type was specified as {type} rather than 'Page'."); - } - - if (dictionary.TryGetItemOfType(CosName.MEDIA_BOX, out COSArray mediaboxArray)) - { - var x1 = mediaboxArray.getInt(0); - var y1 = mediaboxArray.getInt(1); - var x2 = mediaboxArray.getInt(2); - var y2 = mediaboxArray.getInt(3); - - MediaBox = new MediaBox(new PdfRectangle(x1, y1, x2, y2)); - } - else - { - MediaBox = pageTreeMembers.GetMediaBox(); - - if (MediaBox == null) - { - if (parsingArguments.IsLenientParsing) - { - MediaBox = MediaBox.A4; - } - else - { - throw new InvalidOperationException("No mediabox was present for page: " + number); - } - } - } - - if (dictionary.GetItemOrDefault(CosName.RESOURCES) is PdfDictionary resource) - { - parsingArguments.CachingProviders.ResourceContainer.LoadResourceDictionary(resource, parsingArguments); - } - - var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject; - if (contentObject != null) - { - var contentStream = parsingArguments.Container.Get() - 
.Parse(parsingArguments, contentObject, false) as RawCosStream; - - if (contentStream == null) - { - throw new InvalidOperationException("Failed to parse the content for the page: " + number); - } - - var contents = contentStream.Decode(parsingArguments.Container.Get()); - - var operations = parsingArguments.Container.Get() - .Parse(parsingArguments.Container.Get(), new ByteArrayInputBytes(contents)); - - var context = new ContentStreamProcessor(MediaBox.Bounds, parsingArguments.CachingProviders.ResourceContainer); - - var content = context.Process(operations); - - Content = content; - } + MediaBox = mediaBox; + Content = content; } } } \ No newline at end of file diff --git a/src/UglyToad.Pdf/Content/PageFactory.cs b/src/UglyToad.Pdf/Content/PageFactory.cs new file mode 100644 index 00000000..fee4739a --- /dev/null +++ b/src/UglyToad.Pdf/Content/PageFactory.cs @@ -0,0 +1,99 @@ +namespace UglyToad.Pdf.Content +{ + using System; + using ContentStream; + using Cos; + using Filters; + using Geometry; + using Graphics; + using IO; + using Parser; + + internal class PageFactory : IPageFactory + { + private readonly IResourceStore resourceStore; + private readonly IPdfObjectParser pdfObjectParser; + private readonly IFilterProvider filterProvider; + private readonly IPageContentParser pageContentParser; + + public PageFactory(IResourceStore resourceStore, IPdfObjectParser pdfObjectParser, IFilterProvider filterProvider, + IPageContentParser pageContentParser) + { + this.resourceStore = resourceStore; + this.pdfObjectParser = pdfObjectParser; + this.filterProvider = filterProvider; + this.pageContentParser = pageContentParser; + } + + public Page Create(int number, PdfDictionary dictionary, PageTreeMembers pageTreeMembers, IRandomAccessRead reader, + bool isLenientParsing) + { + if (dictionary == null) + { + throw new ArgumentNullException(nameof(dictionary)); + } + + var type = dictionary.GetName(CosName.TYPE); + + if (type != null && !type.Equals(CosName.PAGE) && 
!isLenientParsing) + { + throw new InvalidOperationException($"Page {number} had its type was specified as {type} rather than 'Page'."); + } + + MediaBox mediaBox; + if (dictionary.TryGetItemOfType(CosName.MEDIA_BOX, out COSArray mediaboxArray)) + { + var x1 = mediaboxArray.getInt(0); + var y1 = mediaboxArray.getInt(1); + var x2 = mediaboxArray.getInt(2); + var y2 = mediaboxArray.getInt(3); + + mediaBox = new MediaBox(new PdfRectangle(x1, y1, x2, y2)); + } + else + { + mediaBox = pageTreeMembers.GetMediaBox(); + + if (mediaBox == null) + { + if (isLenientParsing) + { + mediaBox = MediaBox.A4; + } + else + { + throw new InvalidOperationException("No mediabox was present for page: " + number); + } + } + } + + if (dictionary.GetItemOrDefault(CosName.RESOURCES) is PdfDictionary resource) + { + resourceStore.LoadResourceDictionary(resource, reader, isLenientParsing); + } + + PageContent content = default(PageContent); + + var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject; + if (contentObject != null) + { + var contentStream = pdfObjectParser.Parse(contentObject.ToIndirectReference(), reader, false) as RawCosStream; + + if (contentStream == null) + { + throw new InvalidOperationException("Failed to parse the content for the page: " + number); + } + + var contents = contentStream.Decode(filterProvider); + + var operations = pageContentParser.Parse(new ByteArrayInputBytes(contents)); + + var context = new ContentStreamProcessor(mediaBox.Bounds, resourceStore); + + content = context.Process(operations); + } + + return new Page(number, mediaBox, content); + } + } +} diff --git a/src/UglyToad.Pdf/Content/Pages.cs b/src/UglyToad.Pdf/Content/Pages.cs index 61e3c715..f61939a2 100644 --- a/src/UglyToad.Pdf/Content/Pages.cs +++ b/src/UglyToad.Pdf/Content/Pages.cs @@ -6,26 +6,26 @@ using ContentStream; using ContentStream.TypedAccessors; using Cos; + using IO; using Logging; using Parser; - using Parser.PageTree; public class Pages { + private readonly 
ILog log; private readonly Catalog catalog; - private readonly ParsingArguments arguments; + private readonly IPdfObjectParser pdfObjectParser; + private readonly IPageFactory pageFactory; + private readonly IRandomAccessRead reader; + private readonly bool isLenientParsing; private readonly PdfDictionary rootPageDictionary; private readonly Dictionary locatedPages = new Dictionary(); public int Count { get; } - internal Pages(Catalog catalog, ParsingArguments arguments) + internal Pages(ILog log, Catalog catalog, IPdfObjectParser pdfObjectParser, IPageFactory pageFactory, + IRandomAccessRead reader, bool isLenientParsing) { - if (arguments == null) - { - throw new ArgumentNullException(nameof(arguments)); - } - if (catalog == null) { throw new ArgumentNullException(nameof(catalog)); @@ -38,9 +38,9 @@ throw new InvalidOperationException("No pages were present in the catalog for this PDF document"); } - var pageObject = arguments.Container.Get().Parse(arguments, pages, false); + var pagesObject = pdfObjectParser.Parse(pages.ToIndirectReference(), reader, isLenientParsing); - if (!(pageObject is PdfDictionary catalogPageDictionary)) + if (!(pagesObject is PdfDictionary catalogPageDictionary)) { throw new InvalidOperationException("Could not find the root pages object: " + pages); } @@ -51,8 +51,12 @@ Count = count; + this.log = log; this.catalog = catalog; - this.arguments = arguments; + this.pdfObjectParser = pdfObjectParser; + this.pageFactory = pageFactory; + this.reader = reader; + this.isLenientParsing = isLenientParsing; } @@ -60,7 +64,8 @@ { if (locatedPages.TryGetValue(pageNumber, out PdfDictionary targetPageDictionary)) { - return new Page(pageNumber, targetPageDictionary, new PageTreeMembers(), arguments); + return pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, + isLenientParsing); } var observed = new List(); @@ -73,8 +78,7 @@ throw new InvalidOperationException("Could not find the page with number: " + pageNumber); } - 
var page = arguments.Container.Get() - .Parse(pageNumber, targetPageDictionary, arguments); + var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, isLenientParsing); locatedPages[pageNumber] = targetPageDictionary; @@ -108,8 +112,7 @@ if (!type.Equals(CosName.PAGES)) { - arguments.Container.Get() - .Warn("Did not find the expected type (Page or Pages) in dictionary: " + currentPageDictionary); + log.Warn("Did not find the expected type (Page or Pages) in dictionary: " + currentPageDictionary); return false; } @@ -120,7 +123,7 @@ foreach (var kid in kids.OfType()) { // todo: exit early - var child = arguments.Container.Get().Parse(arguments, kid, false) as PdfDictionary; + var child = pdfObjectParser.Parse(kid.ToIndirectReference(), reader, isLenientParsing) as PdfDictionary; var thisPageMatches = FindPage(child, soughtPageNumber, pageNumbersObserved); diff --git a/src/UglyToad.Pdf/Content/ResourceContainer.cs b/src/UglyToad.Pdf/Content/ResourceContainer.cs index 33577e8c..1dc61471 100644 --- a/src/UglyToad.Pdf/Content/ResourceContainer.cs +++ b/src/UglyToad.Pdf/Content/ResourceContainer.cs @@ -4,31 +4,32 @@ using System.Collections.Generic; using ContentStream; using Cos; - using Filters; using Fonts; - using Fonts.Cmap; - using Fonts.Parser; using IO; using Parser; - internal interface IResourceStore - { - IFont GetFont(CosName name); - } - internal class ResourceContainer : IResourceStore { + private readonly IPdfObjectParser pdfObjectParser; + private readonly IFontFactory fontFactory; + private readonly Dictionary loadedFonts = new Dictionary(); - internal void LoadResourceDictionary(PdfDictionary dictionary, ParsingArguments arguments) + public ResourceContainer(IPdfObjectParser pdfObjectParser, IFontFactory fontFactory) + { + this.pdfObjectParser = pdfObjectParser; + this.fontFactory = fontFactory; + } + + public void LoadResourceDictionary(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing) { 
if (dictionary.TryGetValue(CosName.FONT, out var fontBase) && fontBase is PdfDictionary fontDictionary) { - LoadFontDictionary(fontDictionary, arguments); + LoadFontDictionary(fontDictionary, reader, isLenientParsing); } } - private void LoadFontDictionary(PdfDictionary fontDictionary, ParsingArguments arguments) + private void LoadFontDictionary(PdfDictionary fontDictionary, IRandomAccessRead reader, bool isLenientParsing) { foreach (var pair in fontDictionary) { @@ -39,24 +40,22 @@ if (!(pair.Value is CosObject objectKey)) { - if (arguments.IsLenientParsing) + if (isLenientParsing) { continue; } throw new InvalidOperationException($"The font with name {pair.Key} did not link to an object key. Value was: {pair.Value}."); } - - var dynamicParser = arguments.Get(); - - var fontObject = dynamicParser.Parse(arguments, objectKey, false) as PdfDictionary; + + var fontObject = pdfObjectParser.Parse(objectKey.ToIndirectReference(), reader, false) as PdfDictionary; if (fontObject == null) { throw new InvalidOperationException($"Could not retrieve the font with name: {pair.Key} which should have been object {objectKey.GetObjectNumber()}"); } - loadedFonts[pair.Key] = arguments.Get().GetFont(fontObject, arguments); + loadedFonts[pair.Key] = fontFactory.Get(fontObject, reader, isLenientParsing); } } diff --git a/src/UglyToad.Pdf/ContentStream/IndirectReference.cs b/src/UglyToad.Pdf/ContentStream/IndirectReference.cs new file mode 100644 index 00000000..f4c501c2 --- /dev/null +++ b/src/UglyToad.Pdf/ContentStream/IndirectReference.cs @@ -0,0 +1,15 @@ +namespace UglyToad.Pdf.ContentStream +{ + public struct IndirectReference + { + public long ObjectNumber { get; } + + public int Generation { get; } + + public IndirectReference(long objectNumber, int generation) + { + ObjectNumber = objectNumber; + Generation = generation; + } + } +} \ No newline at end of file diff --git a/src/UglyToad.Pdf/Cos/CosObject.cs b/src/UglyToad.Pdf/Cos/CosObject.cs index 1a34e7a6..d18ca721 100644 --- 
a/src/UglyToad.Pdf/Cos/CosObject.cs +++ b/src/UglyToad.Pdf/Cos/CosObject.cs @@ -1,5 +1,7 @@ namespace UglyToad.Pdf.Cos { + using ContentStream; + public class CosObject : CosBase, ICosUpdateInfo { private CosBase baseObject; @@ -124,5 +126,10 @@ } public bool NeedsToBeUpdated { get; set; } + + public IndirectReference ToIndirectReference() + { + return new IndirectReference(objectNumber, generationNumber); + } } } diff --git a/src/UglyToad.Pdf/Fonts/CidFonts/ICidFont.cs b/src/UglyToad.Pdf/Fonts/CidFonts/ICidFont.cs index 5aa31a5a..872c0200 100644 --- a/src/UglyToad.Pdf/Fonts/CidFonts/ICidFont.cs +++ b/src/UglyToad.Pdf/Fonts/CidFonts/ICidFont.cs @@ -32,5 +32,7 @@ /// The definition of the character collection for the font. /// CharacterIdentifierSystemInfo SystemInfo { get; } + + CidFontType CidFontType { get; } } } \ No newline at end of file diff --git a/src/UglyToad.Pdf/Fonts/CidFonts/Type0CidFont.cs b/src/UglyToad.Pdf/Fonts/CidFonts/Type0CidFont.cs index d95e6bac..ccd549e1 100644 --- a/src/UglyToad.Pdf/Fonts/CidFonts/Type0CidFont.cs +++ b/src/UglyToad.Pdf/Fonts/CidFonts/Type0CidFont.cs @@ -1,6 +1,5 @@ namespace UglyToad.Pdf.Fonts.CidFonts { - using Cmap; using Cos; /// @@ -14,5 +13,6 @@ public CosName SubType { get; } public CosName BaseFont { get; } public CharacterIdentifierSystemInfo SystemInfo { get; } + public CidFontType CidFontType => CidFontType.Type0; } } diff --git a/src/UglyToad.Pdf/Fonts/CidFonts/Type2CidFont.cs b/src/UglyToad.Pdf/Fonts/CidFonts/Type2CidFont.cs index a7bc1475..2869e242 100644 --- a/src/UglyToad.Pdf/Fonts/CidFonts/Type2CidFont.cs +++ b/src/UglyToad.Pdf/Fonts/CidFonts/Type2CidFont.cs @@ -1,6 +1,5 @@ namespace UglyToad.Pdf.Fonts.CidFonts { - using Cmap; using Cos; /// @@ -14,5 +13,6 @@ public CosName SubType { get; } public CosName BaseFont { get; } public CharacterIdentifierSystemInfo SystemInfo { get; } + public CidFontType CidFontType => CidFontType.Type2; } } \ No newline at end of file diff --git 
a/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs b/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs index 785c8faa..b4f10929 100644 --- a/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs +++ b/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs @@ -6,7 +6,7 @@ using IO; using Util.JetBrains.Annotations; - public class CMap + internal class CMap { public CharacterIdentifierSystemInfo Info { get; } @@ -30,6 +30,8 @@ [NotNull] public IReadOnlyList CidCharacterMappings { get; } + public WritingMode WritingMode { get; } + public bool HasCidMappings => CidCharacterMappings.Count > 0 || CidRanges.Count > 0; public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0; @@ -41,7 +43,7 @@ { Info = info; Type = type; - WMode = wMode; + WritingMode = (WritingMode)wMode; Name = name; Version = version; BaseFontCharacterMap = baseFontCharacterMap ?? throw new ArgumentNullException(nameof(baseFontCharacterMap)); @@ -51,8 +53,7 @@ maxCodeLength = CodespaceRanges.Max(x => x.CodeLength); minCodeLength = CodespaceRanges.Min(x => x.CodeLength); } - - private int wmode = 0; + private string cmapName = null; private string cmapVersion = null; private int cmapType = -1; diff --git a/src/UglyToad.Pdf/Fonts/Cmap/WritingMode.cs b/src/UglyToad.Pdf/Fonts/Cmap/WritingMode.cs new file mode 100644 index 00000000..fbee355c --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Cmap/WritingMode.cs @@ -0,0 +1,8 @@ +namespace UglyToad.Pdf.Fonts.Cmap +{ + internal enum WritingMode + { + Horizontal = 0, + Vertical = 1 + } +} \ No newline at end of file diff --git a/src/UglyToad.Pdf/Fonts/Composite/ToUnicodeCMap.cs b/src/UglyToad.Pdf/Fonts/Composite/ToUnicodeCMap.cs new file mode 100644 index 00000000..8ab1e7eb --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Composite/ToUnicodeCMap.cs @@ -0,0 +1,55 @@ +namespace UglyToad.Pdf.Fonts.Composite +{ + using System; + using Cmap; + using IO; + using Util.JetBrains.Annotations; + + /// + /// Defines the information content (actual text) of the font + /// as opposed to the display format. 
+ /// + internal class ToUnicodeCMap + { + [CanBeNull] + private readonly CMap cMap; + + /// + /// Does the font provide a CMap to map CIDs to Unicode values? + /// + public bool CanMapToUnicode => cMap != null; + + /// + /// Is this document (unexpectedly) using a predefined Identity-H/V CMap as its ToUnicode CMap? + /// + public bool IsUsingIdentityAsUnicodeMap { get; } + + public ToUnicodeCMap([CanBeNull]CMap cMap) + { + this.cMap = cMap; + + if (CanMapToUnicode) + { + IsUsingIdentityAsUnicodeMap = + cMap.Name.StartsWith("Identity-", StringComparison.InvariantCultureIgnoreCase); + } + } + + public bool TryGet(int code, out string value) + { + value = null; + + if (!CanMapToUnicode) + { + return false; + } + + return cMap.TryConvertToUnicode(code, out value); + } + + public int ReadCode(IInputBytes inputBytes) + { + return cMap.ReadCode(inputBytes); + } + } +} \ No newline at end of file diff --git a/src/UglyToad.Pdf/Fonts/Composite/Type0Font.cs b/src/UglyToad.Pdf/Fonts/Composite/Type0Font.cs new file mode 100644 index 00000000..2aab9bb2 --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Composite/Type0Font.cs @@ -0,0 +1,76 @@ +namespace UglyToad.Pdf.Fonts.Composite +{ + using System; + using CidFonts; + using Cmap; + using Cos; + using Geometry; + using IO; + using Util.JetBrains.Annotations; + + /// + /// Defines glyphs using a CIDFont + /// + internal class Type0Font : IFont + { + public CosName Name => BaseFont; + + [NotNull] + public CosName BaseFont { get; } + + [NotNull] + public ICidFont CidFont { get; } + + [NotNull] + public CMap CMap { get; } + + [NotNull] + public ToUnicodeCMap ToUnicode { get; } + + public bool IsVertical => CMap.WritingMode == WritingMode.Vertical; + + public Type0Font(CosName baseFont, ICidFont cidFont, CMap cmap, CMap toUnicodeCMap) + { + BaseFont = baseFont ?? throw new ArgumentNullException(nameof(baseFont)); + CidFont = cidFont ?? throw new ArgumentNullException(nameof(cidFont)); + CMap = cmap ?? 
throw new ArgumentNullException(nameof(cmap)); + ToUnicode = new ToUnicodeCMap(toUnicodeCMap); + } + + public int ReadCharacterCode(IInputBytes bytes, out int codeLength) + { + var current = bytes.CurrentOffset; + + var code = ToUnicode.ReadCode(bytes); + + codeLength = bytes.CurrentOffset - current; + + return code; + } + + public bool TryGetUnicode(int characterCode, out string value) + { + value = null; + + if (!ToUnicode.CanMapToUnicode) + { + return false; + } + + // According to PdfBox certain providers incorrectly using Identity CMaps as ToUnicode. + if (ToUnicode.IsUsingIdentityAsUnicodeMap) + { + value = new string((char)characterCode, 1); + + return true; + } + + return ToUnicode.TryGet(characterCode, out value); + } + + public PdfVector GetDisplacement(int characterCode) + { + return new PdfVector(0.333m, 0); + } + } +} diff --git a/src/UglyToad.Pdf/Fonts/DescriptorFontFile.cs b/src/UglyToad.Pdf/Fonts/DescriptorFontFile.cs index 9ca877cb..d7386a8c 100644 --- a/src/UglyToad.Pdf/Fonts/DescriptorFontFile.cs +++ b/src/UglyToad.Pdf/Fonts/DescriptorFontFile.cs @@ -1,5 +1,6 @@ namespace UglyToad.Pdf.Fonts { + using ContentStream; using Cos; /// @@ -13,13 +14,13 @@ /// internal class DescriptorFontFile { - public CosObjectKey ObjectKey { get; } + public IndirectReference ObjectKey { get; } public byte[] FileBytes { get; } public FontFileType FileType { get; } - public DescriptorFontFile(CosObjectKey key, FontFileType fileType) + public DescriptorFontFile(IndirectReference key, FontFileType fileType) { ObjectKey = key; FileBytes = new byte[0]; diff --git a/src/UglyToad.Pdf/Fonts/Exceptions/InvalidFontFormatException.cs b/src/UglyToad.Pdf/Fonts/Exceptions/InvalidFontFormatException.cs new file mode 100644 index 00000000..44586428 --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/Exceptions/InvalidFontFormatException.cs @@ -0,0 +1,32 @@ +namespace UglyToad.Pdf.Fonts.Exceptions +{ + using System; + using System.Runtime.Serialization; + + /// + /// The exception thrown when 
an error is encountered parsing a font from the PDF document. + /// This occurs where the format of the font program or dictionary does not meet the specification. + /// + /// + [Serializable] + public class InvalidFontFormatException : Exception + { + public InvalidFontFormatException() + { + } + + public InvalidFontFormatException(string message) : base(message) + { + } + + public InvalidFontFormatException(string message, Exception inner) : base(message, inner) + { + } + + protected InvalidFontFormatException( + SerializationInfo info, + StreamingContext context) : base(info, context) + { + } + } +} diff --git a/src/UglyToad.Pdf/Fonts/FontFactory.cs b/src/UglyToad.Pdf/Fonts/FontFactory.cs index 471fcf07..80489d54 100644 --- a/src/UglyToad.Pdf/Fonts/FontFactory.cs +++ b/src/UglyToad.Pdf/Fonts/FontFactory.cs @@ -4,22 +4,26 @@ using System.Collections.Generic; using ContentStream; using Cos; + using Exceptions; + using IO; + using Logging; using Parser.Handlers; - using Pdf.Parser; - internal class FontFactory + internal class FontFactory : IFontFactory { + private readonly ILog log; private readonly IReadOnlyDictionary handlers; - public FontFactory(Type0FontHandler type0FontHandler) + public FontFactory(ILog log, Type0FontHandler type0FontHandler) { + this.log = log; handlers = new Dictionary { {CosName.TYPE0, type0FontHandler} }; } - public IFont GetFont(PdfDictionary dictionary, ParsingArguments arguments) + public IFont Get(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing) { var type = dictionary.GetName(CosName.TYPE); @@ -27,13 +31,13 @@ { var message = "The font dictionary did not have type 'Font'. 
" + dictionary; - if (arguments.IsLenientParsing) + if (isLenientParsing) { - arguments.Log.Error(message); + log?.Error(message); } else { - throw new InvalidOperationException(message); + throw new InvalidFontFormatException(message); } } @@ -41,7 +45,7 @@ if (handlers.TryGetValue(subtype, out var handler)) { - return handler.Generate(dictionary, arguments); + return handler.Generate(dictionary, reader, isLenientParsing); } throw new NotImplementedException($"Parsing not implemented for fonts of type: {subtype}, please submit a pull request or an issue."); @@ -49,3 +53,4 @@ } } + diff --git a/src/UglyToad.Pdf/Fonts/IFont.cs b/src/UglyToad.Pdf/Fonts/IFont.cs index f22f0e19..e304c167 100644 --- a/src/UglyToad.Pdf/Fonts/IFont.cs +++ b/src/UglyToad.Pdf/Fonts/IFont.cs @@ -10,16 +10,12 @@ internal interface IFont { CosName Name { get; } - - CosName SubType { get; } - - string BaseFontType { get; } - + bool IsVertical { get; } int ReadCharacterCode(IInputBytes bytes, out int codeLength); - string GetUnicode(int characterCode); + bool TryGetUnicode(int characterCode, out string value); PdfVector GetDisplacement(int characterCode); } @@ -51,6 +47,11 @@ return code; } + public bool TryGetUnicode(int characterCode, out string value) + { + throw new NotImplementedException(); + } + public string GetUnicode(int characterCode) { if (ToUnicode != null) diff --git a/src/UglyToad.Pdf/Fonts/IFontFactory.cs b/src/UglyToad.Pdf/Fonts/IFontFactory.cs new file mode 100644 index 00000000..61e4b350 --- /dev/null +++ b/src/UglyToad.Pdf/Fonts/IFontFactory.cs @@ -0,0 +1,10 @@ +namespace UglyToad.Pdf.Fonts +{ + using ContentStream; + using IO; + + internal interface IFontFactory + { + IFont Get(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing); + } +} \ No newline at end of file diff --git a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs index f9a0a7d9..80b82ecb 100644 --- a/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs +++ 
b/src/UglyToad.Pdf/Fonts/Parser/CMapParser.cs @@ -9,7 +9,7 @@ using Tokenization.Scanner; using Tokenization.Tokens; - public class CMapParser + internal class CMapParser { private static readonly BaseFontRangeParser BaseFontRangeParser = new BaseFontRangeParser(); private static readonly BaseFontCharacterParser BaseFontCharacterParser = new BaseFontCharacterParser(); diff --git a/src/UglyToad.Pdf/Fonts/Parser/Handlers/IFontHandler.cs b/src/UglyToad.Pdf/Fonts/Parser/Handlers/IFontHandler.cs index 6539af04..79476f45 100644 --- a/src/UglyToad.Pdf/Fonts/Parser/Handlers/IFontHandler.cs +++ b/src/UglyToad.Pdf/Fonts/Parser/Handlers/IFontHandler.cs @@ -1,10 +1,10 @@ namespace UglyToad.Pdf.Fonts.Parser.Handlers { using ContentStream; - using Pdf.Parser; + using IO; internal interface IFontHandler { - IFont Generate(PdfDictionary dictionary, ParsingArguments parsingArguments); + IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing); } } \ No newline at end of file diff --git a/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type0FontHandler.cs b/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type0FontHandler.cs index e91442c2..54892164 100644 --- a/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type0FontHandler.cs +++ b/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type0FontHandler.cs @@ -1,9 +1,12 @@ namespace UglyToad.Pdf.Fonts.Parser.Handlers { using System; + using CidFonts; using Cmap; + using Composite; using ContentStream; using Cos; + using Exceptions; using Filters; using IO; using Parts; @@ -14,31 +17,35 @@ private readonly CidFontFactory cidFontFactory; private readonly CMapCache cMapCache; private readonly IFilterProvider filterProvider; + private readonly IPdfObjectParser pdfObjectParser; - public Type0FontHandler(CidFontFactory cidFontFactory, CMapCache cMapCache, IFilterProvider filterProvider) + public Type0FontHandler(CidFontFactory cidFontFactory, CMapCache cMapCache, IFilterProvider filterProvider, IPdfObjectParser pdfObjectParser) { 
this.cidFontFactory = cidFontFactory; this.cMapCache = cMapCache; this.filterProvider = filterProvider; + this.pdfObjectParser = pdfObjectParser; } - public IFont Generate(PdfDictionary dictionary, ParsingArguments arguments) + public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing) { - var dynamicParser = arguments.Get(); - var baseFont = dictionary.GetName(CosName.BASE_FONT); var cMap = ReadEncoding(dictionary, out var isCMapPredefined); if (TryGetFirstDescendant(dictionary, out var descendantObject)) { - var parsed = dynamicParser.Parse(arguments, descendantObject, false); + var parsed = pdfObjectParser.Parse(descendantObject.ToIndirectReference(), reader, isLenientParsing); if (parsed is PdfDictionary descendantFontDictionary) { - ParseDescendant(descendantFontDictionary, arguments); + ParseDescendant(descendantFontDictionary, reader, isLenientParsing); } } + else + { + throw new InvalidFontFormatException("No descendant font dictionary was declared for this Type 0 font. This dictionary should contain the CIDFont for the Type 0 font. 
" + dictionary); + } var ucs2CMap = GetUcs2CMap(dictionary, isCMapPredefined, false); @@ -47,22 +54,17 @@ { var toUnicodeValue = dictionary[CosName.TO_UNICODE]; - var toUnicode = dynamicParser.Parse(arguments, toUnicodeValue as CosObject, false) as RawCosStream; + var toUnicode = pdfObjectParser.Parse(((CosObject)toUnicodeValue).ToIndirectReference(), reader, isLenientParsing) as RawCosStream; var decodedUnicodeCMap = toUnicode?.Decode(filterProvider); if (decodedUnicodeCMap != null) { - toUnicodeCMap = cMapCache.Parse(new ByteArrayInputBytes(decodedUnicodeCMap), arguments.IsLenientParsing); + toUnicodeCMap = cMapCache.Parse(new ByteArrayInputBytes(decodedUnicodeCMap), isLenientParsing); } } - var font = new CompositeFont - { - SubType = CosName.TYPE0, - ToUnicode = toUnicodeCMap, - BaseFont = baseFont - }; + var font = new Type0Font(baseFont, new Type0CidFont(), cMap, toUnicodeCMap); return font; } @@ -91,7 +93,7 @@ return false; } - private void ParseDescendant(PdfDictionary dictionary, ParsingArguments arguments) + private void ParseDescendant(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing) { var type = dictionary.GetName(CosName.TYPE); if (!CosName.FONT.Equals(type)) @@ -99,7 +101,7 @@ throw new InvalidOperationException($"Expected \'Font\' dictionary but found \'{type.Name}\'"); } - cidFontFactory.Generate(dictionary, arguments, arguments.IsLenientParsing); + cidFontFactory.Generate(dictionary, reader, isLenientParsing); } private CMap ReadEncoding(PdfDictionary dictionary, out bool isCMapPredefined) diff --git a/src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontFactory.cs b/src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontFactory.cs index ecbca45a..fefc1bf1 100644 --- a/src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontFactory.cs +++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/CidFontFactory.cs @@ -2,7 +2,6 @@ { using System; using System.Collections.Generic; - using System.IO; using CidFonts; using ContentStream; using Cos; @@ -12,20 +11,25 @@ using 
Pdf.Parser; using TrueType; using TrueType.Parser; - using Util; internal class CidFontFactory { private readonly FontDescriptorFactory descriptorFactory; private readonly TrueTypeFontParser trueTypeFontParser; + private readonly IPdfObjectParser pdfObjectParser; + private readonly IFilterProvider filterProvider; - public CidFontFactory(FontDescriptorFactory descriptorFactory, TrueTypeFontParser trueTypeFontParser) + public CidFontFactory(FontDescriptorFactory descriptorFactory, TrueTypeFontParser trueTypeFontParser, + IPdfObjectParser pdfObjectParser, + IFilterProvider filterProvider) { this.descriptorFactory = descriptorFactory; this.trueTypeFontParser = trueTypeFontParser; + this.pdfObjectParser = pdfObjectParser; + this.filterProvider = filterProvider; } - public ICidFont Generate(PdfDictionary dictionary, ParsingArguments arguments, bool isLenientParsing) + public ICidFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing) { var type = dictionary.GetName(CosName.TYPE); if (!CosName.FONT.Equals(type)) @@ -37,12 +41,12 @@ var verticalWritingMetrics = ReadVerticalDisplacements(dictionary); FontDescriptor descriptor = null; - if (TryGetFontDescriptor(dictionary, arguments, out var descriptorDictionary)) + if (TryGetFontDescriptor(dictionary, reader, out var descriptorDictionary)) { - descriptor = descriptorFactory.Generate(descriptorDictionary, arguments.IsLenientParsing); + descriptor = descriptorFactory.Generate(descriptorDictionary, isLenientParsing); } - ReadDescriptorFile(descriptor, arguments); + ReadDescriptorFile(descriptor, reader, isLenientParsing); var subType = dictionary.GetName(CosName.SUBTYPE); if (CosName.CID_FONT_TYPE0.Equals(subType)) @@ -58,8 +62,7 @@ return null; } - private static bool TryGetFontDescriptor(PdfDictionary dictionary, ParsingArguments arguments, - out PdfDictionary descriptorDictionary) + private bool TryGetFontDescriptor(PdfDictionary dictionary, IRandomAccessRead reader, out PdfDictionary 
descriptorDictionary) { descriptorDictionary = null; @@ -68,7 +71,7 @@ return false; } - var descriptorObj = arguments.Get().Parse(arguments, obj, false); + var descriptorObj = pdfObjectParser.Parse(obj.ToIndirectReference(), reader, false); if (!(descriptorObj is PdfDictionary descriptor)) { @@ -80,21 +83,21 @@ return true; } - private void ReadDescriptorFile(FontDescriptor descriptor, ParsingArguments arguments) + private void ReadDescriptorFile(FontDescriptor descriptor, IRandomAccessRead reader, bool isLenientParsing) { if (descriptor?.FontFile == null) { return; } - var fontFileStream = arguments.Get().Parse(arguments, descriptor.FontFile.ObjectKey, false) as RawCosStream; + var fontFileStream = pdfObjectParser.Parse(descriptor.FontFile.ObjectKey, reader, isLenientParsing) as RawCosStream; if (fontFileStream == null) { return; } - var fontFile = fontFileStream.Decode(arguments.Get()); + var fontFile = fontFileStream.Decode(filterProvider); switch (descriptor.FontFile.FileType) { diff --git a/src/UglyToad.Pdf/Fonts/Parser/Parts/FontDescriptorFactory.cs b/src/UglyToad.Pdf/Fonts/Parser/Parts/FontDescriptorFactory.cs index 8abf6601..d9d26e1c 100644 --- a/src/UglyToad.Pdf/Fonts/Parser/Parts/FontDescriptorFactory.cs +++ b/src/UglyToad.Pdf/Fonts/Parser/Parts/FontDescriptorFactory.cs @@ -141,7 +141,7 @@ throw new NotSupportedException("We currently expect the FontFile to be an object reference."); } - return new DescriptorFontFile(obj.GetObjectKey(), DescriptorFontFile.FontFileType.Type1); + return new DescriptorFontFile(obj.ToIndirectReference(), DescriptorFontFile.FontFileType.Type1); } if (dictionary.TryGetValue(CosName.FONT_FILE2, out value)) @@ -151,7 +151,7 @@ throw new NotSupportedException("We currently expect the FontFile2 to be an object reference."); } - return new DescriptorFontFile(obj.GetObjectKey(), DescriptorFontFile.FontFileType.TrueType); + return new DescriptorFontFile(obj.ToIndirectReference(), DescriptorFontFile.FontFileType.TrueType); } if 
(dictionary.TryGetValue(CosName.FONT_FILE3, out value)) @@ -161,7 +161,7 @@ throw new NotSupportedException("We currently expect the FontFile3 to be an object reference."); } - return new DescriptorFontFile(obj.GetObjectKey(), DescriptorFontFile.FontFileType.FromSubtype); + return new DescriptorFontFile(obj.ToIndirectReference(), DescriptorFontFile.FontFileType.FromSubtype); } return null; diff --git a/src/UglyToad.Pdf/Geometry/Paths/GeneralPath.cs b/src/UglyToad.Pdf/Geometry/Paths/GeneralPath.cs new file mode 100644 index 00000000..0b0ec6e0 --- /dev/null +++ b/src/UglyToad.Pdf/Geometry/Paths/GeneralPath.cs @@ -0,0 +1,7 @@ +namespace UglyToad.Pdf.Geometry.Paths +{ + internal class GeneralPath + { + // TODO: provide an implementation + } +} diff --git a/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs b/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs index a05ca91a..1475aa07 100644 --- a/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs +++ b/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs @@ -85,7 +85,7 @@ { var code = font.ReadCharacterCode(bytes, out int codeLength); - var unicode = font.GetUnicode(code); + font.TryGetUnicode(code, out var unicode); var wordSpacing = 0m; if (code == ' ' && codeLength == 1) diff --git a/src/UglyToad.Pdf/Parser/IPageContentParser.cs b/src/UglyToad.Pdf/Parser/IPageContentParser.cs index ceafb76b..e6948d32 100644 --- a/src/UglyToad.Pdf/Parser/IPageContentParser.cs +++ b/src/UglyToad.Pdf/Parser/IPageContentParser.cs @@ -1,12 +1,11 @@ namespace UglyToad.Pdf.Parser { using System.Collections.Generic; - using Graphics; using Graphics.Operations; using IO; internal interface IPageContentParser { - IReadOnlyList Parse(IGraphicsStateOperationFactory operationFactory, IInputBytes inputBytes); + IReadOnlyList Parse(IInputBytes inputBytes); } } \ No newline at end of file diff --git a/src/UglyToad.Pdf/Parser/IPdfObjectParser.cs b/src/UglyToad.Pdf/Parser/IPdfObjectParser.cs new file mode 100644 index 00000000..3445dad7 --- 
/dev/null +++ b/src/UglyToad.Pdf/Parser/IPdfObjectParser.cs @@ -0,0 +1,218 @@ +namespace UglyToad.Pdf.Parser +{ + using System; + using System.Collections.Generic; + using System.Linq; + using ContentStream; + using Cos; + using IO; + using Logging; + using Parts; + using Util; + + internal interface IPdfObjectParser + { + CosBase Parse(IndirectReference indirectReference, IRandomAccessRead reader, bool isLenientParsing = true, bool requireExistingObject = false); + } + + internal class PdfObjectParser : IPdfObjectParser + { + private readonly ILog log; + private readonly CosBaseParser baseParser; + private readonly CosStreamParser streamParser; + private readonly CrossReferenceTable crossReferenceTable; + private readonly BruteForceSearcher bruteForceSearcher; + private readonly CosObjectPool objectPool; + private readonly ObjectStreamParser objectStreamParser; + + public PdfObjectParser(ILog log, CosBaseParser baseParser, CosStreamParser streamParser, CrossReferenceTable crossReferenceTable, + BruteForceSearcher bruteForceSearcher, + CosObjectPool objectPool, + ObjectStreamParser objectStreamParser) + { + this.log = log ?? new NoOpLog(); + this.baseParser = baseParser ?? throw new ArgumentNullException(nameof(baseParser)); + this.streamParser = streamParser ?? throw new ArgumentNullException(nameof(streamParser)); + this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable)); + this.bruteForceSearcher = bruteForceSearcher ?? throw new ArgumentNullException(nameof(bruteForceSearcher)); + this.objectPool = objectPool ?? throw new ArgumentNullException(nameof(objectPool)); + this.objectStreamParser = objectStreamParser ?? 
throw new ArgumentNullException(nameof(objectStreamParser)); + } + + public CosBase Parse(IndirectReference indirectReference, IRandomAccessRead reader, bool isLenientParsing = true, bool requireExistingObject = false) + { + var key = new CosObjectKey(indirectReference.ObjectNumber, indirectReference.Generation); + + var pdfObject = objectPool.GetOrCreateDefault(key); + + if (pdfObject.GetObject() != null) + { + return pdfObject.GetObject(); + } + + var offsetOrStreamNumber = TryGet(key, crossReferenceTable.ObjectOffsets); + + if (requireExistingObject && (offsetOrStreamNumber == null || offsetOrStreamNumber <= 0)) + { + throw new InvalidOperationException("Object must be defined and not compressed: " + key); + } + + if (isLenientParsing && offsetOrStreamNumber == null) + { + var locations = bruteForceSearcher.GetObjectLocations(); + + offsetOrStreamNumber = TryGet(key, locations); + + if (offsetOrStreamNumber != null) + { + crossReferenceTable.UpdateOffset(key, offsetOrStreamNumber.Value); + } + } + + if (offsetOrStreamNumber == null) + { + return CosNull.Null; + } + + var isCompressedStreamObject = offsetOrStreamNumber <= 0; + + if (!isCompressedStreamObject) + { + return ParseObjectFromFile(offsetOrStreamNumber.Value, reader, key, objectPool, isLenientParsing); + } + + return ParseCompressedStreamObject(reader, -offsetOrStreamNumber.Value, indirectReference.ObjectNumber, isLenientParsing); + } + + private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader, + CosObjectKey key, + CosObjectPool pool, + bool isLenientParsing) + { + reader.Seek(offset); + + var objectNumber = ObjectHelper.ReadObjectNumber(reader); + var objectGeneration = ObjectHelper.ReadGenerationNumber(reader); + + ReadHelper.ReadExpectedString(reader, "obj", true); + + if (objectNumber != key.Number || objectGeneration != key.Generation) + { + throw new InvalidOperationException($"Xref for {key} points to object {objectNumber} {objectGeneration} at {offset}"); + } + + 
ReadHelper.SkipSpaces(reader); + + var baseObject = baseParser.Parse(reader, pool); + + var endObjectKey = ReadHelper.ReadString(reader); + + var atStreamStart = string.Equals(endObjectKey, "stream"); + + if (atStreamStart) + { + var streamStartBytes = OtherEncodings.StringAsLatin1Bytes(endObjectKey); + + reader.Rewind(streamStartBytes.Length); + + baseObject = ReadNormalObjectStream(reader, baseObject, offset, isLenientParsing, out endObjectKey); + } + + if (!string.Equals(endObjectKey, "endobj")) + { + var message = + $"Object ({objectNumber}:{objectGeneration}) at offset {offset} does not end with \'endobj\' but with \'{endObjectKey}\'"; + + if (isLenientParsing) + { + log.Warn(message); + } + else + { + throw new InvalidOperationException(message); + } + } + + return baseObject; + } + + private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset, + bool isLenientParsing, + out string endObjectKey) + { + if (currentBase is PdfDictionary dictionary) + { + RawCosStream stream = streamParser.Parse(reader, dictionary, isLenientParsing); + + currentBase = stream; + } + else + { + // this is not legal + // the combination of a dict and the stream/endstream + // forms a complete stream object + throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset})."); + } + + ReadHelper.SkipSpaces(reader); + endObjectKey = ReadHelper.ReadLine(reader); + + // we have case with a second 'endstream' before endobj + if (!endObjectKey.StartsWith("endobj") && endObjectKey.StartsWith("endstream")) + { + endObjectKey = endObjectKey.Substring(9).Trim(); + if (endObjectKey.Length == 0) + { + // no other characters in extra endstream line + // read next line + endObjectKey = ReadHelper.ReadLine(reader); + } + } + + return currentBase; + } + + private CosBase ParseCompressedStreamObject(IRandomAccessRead reader, long streamObjectNumber, long requestedNumber, bool isLenientParsing) + { + var baseStream = Parse(new 
IndirectReference(streamObjectNumber, 0), reader, isLenientParsing, true); + + if (!(baseStream is RawCosStream stream)) + { + log.Warn($"Could not find a stream for the object number, defaults to returning CosNull: {streamObjectNumber}"); + + return CosNull.Null; + } + + var objects = objectStreamParser.Parse(stream, objectPool); + + // register all objects which are referenced to be contained in object stream + foreach (var next in objects) + { + var streamKey = new CosObjectKey(next); + var offset = TryGet(streamKey, crossReferenceTable.ObjectOffsets); + + if (offset != null && offset == -streamObjectNumber) + { + var streamObject = objectPool.Get(streamKey); + streamObject.SetObject(next.GetObject()); + } + } + + var matchingStreamObject = objects.FirstOrDefault(x => x.GetObjectNumber() == requestedNumber); + + if (matchingStreamObject != null) + { + return matchingStreamObject; + } + + log.Error($"Could not find the object {requestedNumber} in the stream for object {streamObjectNumber}. Returning CosNull."); + + return CosNull.Null; + } + + private static T? TryGet(TKey key, IReadOnlyDictionary dictionary) where T : struct + { + return dictionary.TryGetValue(key, out var value) ? 
value : default(T?); + } + } +} diff --git a/src/UglyToad.Pdf/Parser/PageContentParser.cs b/src/UglyToad.Pdf/Parser/PageContentParser.cs index dc7a4316..5c391711 100644 --- a/src/UglyToad.Pdf/Parser/PageContentParser.cs +++ b/src/UglyToad.Pdf/Parser/PageContentParser.cs @@ -9,7 +9,14 @@ internal class PageContentParser : IPageContentParser { - public IReadOnlyList Parse(IGraphicsStateOperationFactory operationFactory, IInputBytes inputBytes) + private readonly IGraphicsStateOperationFactory operationFactory; + + public PageContentParser(IGraphicsStateOperationFactory operationFactory) + { + this.operationFactory = operationFactory; + } + + public IReadOnlyList Parse(IInputBytes inputBytes) { var scanner = new CoreTokenScanner(inputBytes); diff --git a/src/UglyToad.Pdf/Parser/PageTree/PageParser.cs b/src/UglyToad.Pdf/Parser/PageTree/PageParser.cs index b7c999eb..e2aa4a97 100644 --- a/src/UglyToad.Pdf/Parser/PageTree/PageParser.cs +++ b/src/UglyToad.Pdf/Parser/PageTree/PageParser.cs @@ -1,57 +1,12 @@ namespace UglyToad.Pdf.Parser.PageTree { using System; - using Content; using ContentStream; using ContentStream.TypedAccessors; using Cos; using Filters; using Fonts; - - internal class PageParser - { - public Page Parse(int number, PdfDictionary dictionary, ParsingArguments arguments) - { - if (dictionary == null) - { - throw new ArgumentNullException(nameof(dictionary)); - } - - if (arguments == null) - { - throw new ArgumentNullException(nameof(arguments)); - } - - if (!dictionary.IsType(CosName.PAGE)) - { - throw new InvalidOperationException("Expected a Dictionary of Type Page, instead got this: " + dictionary); - } - - return new Page(number, dictionary, new PageTreeMembers(), arguments); - } - } - - internal class FontParser - { - public Font Parse(PdfDictionary dictionary, ParsingArguments arguments) - { - var type = dictionary.GetName(CosName.SUBTYPE); - - if (CosName.Equals(type, CosName.TYPE0)) - { - var compositeFont = arguments.Container.Get() - 
.Parse(dictionary, arguments); - } - else - { - var simpleFont = arguments.Container.Get() - .Parse(dictionary, arguments); - } - - return new Font(); - } - } - + internal class CompositeFontParser { public CompositeFont Parse(PdfDictionary dictionary, ParsingArguments arguments) diff --git a/src/UglyToad.Pdf/Parser/PageTree/ResourceDictionary.cs b/src/UglyToad.Pdf/Parser/PageTree/ResourceDictionary.cs index 6b65ad46..cd931a3e 100644 --- a/src/UglyToad.Pdf/Parser/PageTree/ResourceDictionary.cs +++ b/src/UglyToad.Pdf/Parser/PageTree/ResourceDictionary.cs @@ -2,7 +2,6 @@ { using System.Collections.Generic; using Content; - using ContentStream; using Cos; /// @@ -37,34 +36,5 @@ { return fonts.ContainsKey(name); } - - internal bool GetFont(CosName name, ParsingArguments arguments, out Font value) - { - if (fontObjects.TryGetValue(name, out value)) - { - return true; - } - - if (!fonts.TryGetValue(name, out var key)) - { - return false; - } - - var dictionary = arguments.Container.Get() - .Parse(arguments, key, false) as PdfDictionary; - - if (dictionary == null) - { - return false; - } - - var font = arguments.Container.Get() - .Parse(dictionary, arguments); - - fontObjects[name] = font; - - // retrieve and cache - return false; - } } } \ No newline at end of file diff --git a/src/UglyToad.Pdf/Parser/ParsingCachingProviders.cs b/src/UglyToad.Pdf/Parser/ParsingCachingProviders.cs index 5d2aaab5..a7cc0bf8 100644 --- a/src/UglyToad.Pdf/Parser/ParsingCachingProviders.cs +++ b/src/UglyToad.Pdf/Parser/ParsingCachingProviders.cs @@ -14,9 +14,9 @@ public BruteForceSearcher BruteForceSearcher { get; } - public ResourceContainer ResourceContainer { get; } + public IResourceStore ResourceContainer { get; } - public ParsingCachingProviders(CosObjectPool objectPool, BruteForceSearcher bruteForceSearcher, ResourceContainer resourceContainer) + public ParsingCachingProviders(CosObjectPool objectPool, BruteForceSearcher bruteForceSearcher, IResourceStore resourceContainer) { 
ObjectPool = objectPool ?? throw new ArgumentNullException(nameof(objectPool)); BruteForceSearcher = bruteForceSearcher ?? throw new ArgumentNullException(nameof(bruteForceSearcher)); diff --git a/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs b/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs index 5cdcacec..d7ae7ccd 100644 --- a/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs @@ -5,7 +5,15 @@ using Content; using ContentStream; using Cos; + using Filters; + using Fonts; + using Fonts.Parser; + using Fonts.Parser.Handlers; + using Fonts.Parser.Parts; + using Fonts.TrueType.Parser; + using Graphics; using IO; + using Logging; using Parts; using Parts.CrossReference; using Util; @@ -37,6 +45,8 @@ private static PdfDocument OpenDocument(IRandomAccessRead reader, IContainer container, bool isLenientParsing) { + var log = container.Get(); + var version = container.Get().ReadHeader(reader, isLenientParsing); var crossReferenceOffset = container.Get().GetXrefOffset(reader, isLenientParsing); @@ -46,9 +56,27 @@ var crossReferenceTable = container.Get() .Parse(reader, isLenientParsing, crossReferenceOffset, pool); - var dynamicParser = container.Get(); + var filterProvider = container.Get(); var bruteForceSearcher = new BruteForceSearcher(reader); - var resourceContainer = new ResourceContainer(); + var pdfObjectParser = new PdfObjectParser(container.Get(), container.Get(), + container.Get(), crossReferenceTable, bruteForceSearcher, pool, container.Get()); + + var trueTypeFontParser = new TrueTypeFontParser(); + var fontDescriptorFactory = new FontDescriptorFactory(); + + var cidFontFactory = new CidFontFactory(fontDescriptorFactory, trueTypeFontParser, pdfObjectParser, filterProvider); + + var cMapCache = new CMapCache(new CMapParser()); + + var fontFactory = new FontFactory(container.Get(), new Type0FontHandler(cidFontFactory, + cMapCache, + filterProvider, + pdfObjectParser)); + + var dynamicParser = container.Get(); + var 
resourceContainer = new ResourceContainer(pdfObjectParser, fontFactory); + + var pageFactory = new PageFactory(resourceContainer, pdfObjectParser, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory())); var root = ParseTrailer(reader, crossReferenceTable, dynamicParser, bruteForceSearcher, pool, isLenientParsing); @@ -66,7 +94,7 @@ var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer); - return new PdfDocument(reader, version, crossReferenceTable, container, isLenientParsing, caching, new Catalog(rootDictionary)); + return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, new Catalog(rootDictionary)); } private static CosBase ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable, diff --git a/src/UglyToad.Pdf/PdfDocument.cs b/src/UglyToad.Pdf/PdfDocument.cs index 544cd5f9..b320d8ba 100644 --- a/src/UglyToad.Pdf/PdfDocument.cs +++ b/src/UglyToad.Pdf/PdfDocument.cs @@ -4,6 +4,7 @@ using Content; using Cos; using IO; + using Logging; using Parser; using Parser.Parts; using Util; @@ -17,8 +18,8 @@ private readonly HeaderVersion version; [NotNull] private readonly CrossReferenceTable crossReferenceTable; - [NotNull] - private readonly IContainer container; + + private readonly ILog log; private readonly bool isLenientParsing; [NotNull] private readonly ParsingCachingProviders cachingProviders; @@ -29,21 +30,21 @@ [NotNull] public Pages Pages { get; } - internal PdfDocument(IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable, - IContainer container, + internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable, bool isLenientParsing, ParsingCachingProviders cachingProviders, + IPageFactory pageFactory, + IPdfObjectParser pdfObjectParser, Catalog catalog) { + this.log = log; this.reader = reader ?? 
throw new ArgumentNullException(nameof(reader)); this.version = version ?? throw new ArgumentNullException(nameof(version)); this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable)); - this.container = container ?? throw new ArgumentNullException(nameof(container)); this.isLenientParsing = isLenientParsing; this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders)); Catalog = catalog ?? throw new ArgumentNullException(nameof(catalog)); - var arguments = new ParsingArguments(reader, crossReferenceTable, cachingProviders, container, isLenientParsing); - Pages = new Pages(Catalog, arguments); + Pages = new Pages(log, Catalog, pdfObjectParser, pageFactory, reader, isLenientParsing); } public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options); diff --git a/src/UglyToad.Pdf/Tokenization/DictionaryTokenizer.cs b/src/UglyToad.Pdf/Tokenization/DictionaryTokenizer.cs index 9bdd0649..8aaea9a1 100644 --- a/src/UglyToad.Pdf/Tokenization/DictionaryTokenizer.cs +++ b/src/UglyToad.Pdf/Tokenization/DictionaryTokenizer.cs @@ -1,6 +1,7 @@ namespace UglyToad.Pdf.Tokenization { using System.Collections.Generic; + using ContentStream; using IO; using Parser.Parts; using Scanner; @@ -79,7 +80,7 @@ if (r == OperatorToken.R) { - result[key] = new IndirectReferenceToken(new IndirectReference(num.Long, gen.Long)); + result[key] = new IndirectReferenceToken(new IndirectReference(num.Long, gen.Int)); i = i + 2; } } diff --git a/src/UglyToad.Pdf/Tokenization/Tokens/IndirectReferenceToken.cs b/src/UglyToad.Pdf/Tokenization/Tokens/IndirectReferenceToken.cs index 1e7c55fc..90e14257 100644 --- a/src/UglyToad.Pdf/Tokenization/Tokens/IndirectReferenceToken.cs +++ b/src/UglyToad.Pdf/Tokenization/Tokens/IndirectReferenceToken.cs @@ -1,5 +1,7 @@ namespace UglyToad.Pdf.Tokenization.Tokens { + using ContentStream; + public class 
IndirectReferenceToken : IDataToken { public IndirectReference Data { get; } @@ -10,16 +12,4 @@ } } - public struct IndirectReference - { - public long ObjectNumber { get; } - - public long Generation { get; } - - public IndirectReference(long objectNumber, long generation) - { - ObjectNumber = objectNumber; - Generation = generation; - } - } } diff --git a/src/UglyToad.Pdf/UglyToad.Pdf.csproj b/src/UglyToad.Pdf/UglyToad.Pdf.csproj index 0cbc5832..3b03c0f7 100644 --- a/src/UglyToad.Pdf/UglyToad.Pdf.csproj +++ b/src/UglyToad.Pdf/UglyToad.Pdf.csproj @@ -227,5 +227,9 @@ + + + + diff --git a/src/UglyToad.Pdf/Util/Bootstrapper.cs b/src/UglyToad.Pdf/Util/Bootstrapper.cs index c26c94a5..492b818a 100644 --- a/src/UglyToad.Pdf/Util/Bootstrapper.cs +++ b/src/UglyToad.Pdf/Util/Bootstrapper.cs @@ -1,12 +1,7 @@ namespace UglyToad.Pdf.Util { using Filters; - using Fonts; using Fonts.Parser; - using Fonts.Parser.Handlers; - using Fonts.Parser.Parts; - using Fonts.TrueType.Parser; - using Graphics; using Logging; using Parser; using Parser.PageTree; @@ -49,19 +44,12 @@ new CrossReferenceTableParser(logger, dictionaryParser, baseParser)); var resourceDictionaryParser = new ResourceDictionaryParser(); - var pageParser = new PageParser(); var simpleFontParser = new SimpleFontParser(); var compositeFontParser = new CompositeFontParser(); - var fontParser = new FontParser(); - var pageContentParser = new PageContentParser(); - var operationFactory = new ReflectionGraphicsStateOperationFactory(); var cmapParser = new CMapParser(); var afmParser = new AdobeFontMetricsParser(); - - var type0FontFactory = new Type0FontHandler(new CidFontFactory(new FontDescriptorFactory(), new TrueTypeFontParser()), new CMapCache(cmapParser), filterProvider); - var fontFactory = new FontFactory(type0FontFactory); - + var container = new Container(); container.Register(headerParser); container.Register(trailerParser); @@ -75,15 +63,11 @@ container.Register(objectStreamParser); 
container.Register(filterProvider); container.Register(resourceDictionaryParser); - container.Register(pageParser); container.Register(simpleFontParser); container.Register(compositeFontParser); - container.Register(fontParser); - container.Register(pageContentParser); - container.Register(operationFactory); container.Register(cmapParser); container.Register(afmParser); - container.Register(fontFactory); + container.Register(logger); return container; }