create Type0 font, notes about font format, heavy duty refactoring to inject dependencies rather than god object

This commit is contained in:
Eliot Jones 2017-12-22 23:54:54 +00:00
parent 206eb91ff1
commit f4d58e8aa9
43 changed files with 853 additions and 296 deletions

118
font-notes.md Normal file
View File

@ -0,0 +1,118 @@
# Fonts #
## Types of Font ##
<pre><code>
------ Composite Fonts -------
Type0 (Composed of glyphs from a CIDFont)
Children:
CIDFont CIDFontType0 (Type 1 font glyph descriptions)
CIDFontType2 (TrueType font glyph descriptions)
------ Simple Fonts Below -------
Type 1 Type 1 (defines glyphs using type 1 font technology)
MMType1 (multiple master font - extends type 1 fonts to support many typefaces for a single font)
Type 3 (defines glyphs with streams of PDF graphics operations)
TrueType (from the TrueType font format)
</code></pre>
## Terminology ##
+ Font dictionary: PDF dictionary with information about the font
+ Font program: Glyph information in specialized font format
## Composite Fonts ##
+ Glyphs are selected from a font-like CIDFont.
+ Has a single CIDFont descendant.
+ Multiple-byte sequences select a single glyph.
Used for multiple-byte character encodings and large numbers of glyphs.
Well suited to Chinese, Japanese and Korean (CJK).
CID stands for character identifier. This is a number used to access glyph descriptions.
The CMap maps between character codes and CID numbers for the glyphs.
A CIDFont file provides the glyph descriptions for a character collection. The glyph descriptions are
identified by CIDs.
A CID-keyed font combines a CMap with a CIDFont.
The **Encoding** contains the CMap.
The **DescendantFonts** contains the CIDFont to use with the CMap.
### CIDFont ###
A Type0 font descendant (CIDFont) must be either a CIDFontType0 (Adobe Type 1) or CIDFontType2 (TrueType).
For Type 2 CIDFonts (TrueType) the glyphs are identified by a glyph index (GID).
+ If the font program is embedded as a stream the CIDFont dictionary must contain a CIDToGIDMap which maps
from CIDs to Glyph Indexes.
+ If the font program is a predefined external font the CIDFont must not contain a CIDToGIDMap. It
may only use a predefined CMap.
Although a CID may not be used to select the glyph (as in the predefined case), it is always used to select the glyph
metrics. Every CIDFont must describe CID 0 which is the ```.notdef``` character for missing characters.
### Glyph Metrics in CIDFonts ###
Widths for CIDFonts are defined in the DW and W entries in the CIDFont dictionary.
+ DW provides the default width for glyphs which are not specified individually.
+ W defines widths for individual CIDs.
Vertical writing uses additional metrics entries (DW2 and W2); see the PDF specification for details.
### CMap ###
The CMap maps from character codes to character selectors (CIDs).
The CMap also defines the writing mode: horizontal or vertical.
### Type 0 Fonts ###
The **Font dictionary** has the following entries:
+ Type (name): /Font
+ Subtype (name): /Type0
+ BaseFont (name): The PostScript name of the font.
+ Encoding (name/stream R): Name of a predefined CMap or a stream for an embedded CMap.
+ DescendantFonts (array): Single element pointing to the CIDFont.
+ ToUnicode (stream R)?: Stream containing a CMap file to map codes to Unicode.
## Simple Fonts ##
+ Glyphs are selected by single-byte character codes. Index into a 256 entry glyph table.
+ Only supports horizontal writing mode.
## Further Description ##
### Type 1 Fonts ###
The **Font program** is a PostScript program describing glyph shape. See the Adobe Type 1 Font Format specification.
The **Font dictionary** has the following entries:
+ Type (name): /Font
+ Subtype (name): /Type1
+ Name (name?): Font name
+ BaseFont (name): The PostScript name of the font. Equivalent to the FontName value in the **Font program**.
+ FirstChar (int): The first character code in the Widths array.
+ LastChar (int): The last character code in the Widths array.
+ Widths (numeric[] R): An array defining the glyph width in units of 1000 == 1 text space unit.
+ FontDescriptor (Dict<> R): Describes font metrics other than widths.
+ Encoding (name/Dict<> R): Specifies the character encoding if different from default.
+ ToUnicode (stream R): CMap mapping character code to Unicode.

View File

@ -2,6 +2,7 @@
{ {
using System.Collections.Generic; using System.Collections.Generic;
using Content; using Content;
using ContentStream;
using IO; using IO;
using Pdf.Cos; using Pdf.Cos;
using Pdf.Fonts; using Pdf.Fonts;
@ -44,6 +45,10 @@
internal class TestResourceStore : IResourceStore internal class TestResourceStore : IResourceStore
{ {
// No-op in this test store: the resource dictionary is ignored and nothing is loaded.
public void LoadResourceDictionary(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
{
}
public IFont GetFont(CosName name) public IFont GetFont(CosName name)
{ {
return null; return null;

View File

@ -13,15 +13,14 @@
public class PageContentParserTests public class PageContentParserTests
{ {
private readonly PageContentParser parser = new PageContentParser(); private readonly PageContentParser parser = new PageContentParser(new ReflectionGraphicsStateOperationFactory());
private readonly IGraphicsStateOperationFactory operationFactory = new ReflectionGraphicsStateOperationFactory();
[Fact] [Fact]
public void CorrectlyExtractsOperations() public void CorrectlyExtractsOperations()
{ {
var input = StringBytesTestConverter.Convert(SimpleGoogleDocPageContent, false); var input = StringBytesTestConverter.Convert(SimpleGoogleDocPageContent, false);
var result = parser.Parse(new ReflectionGraphicsStateOperationFactory(), input.Bytes); var result = parser.Parse(input.Bytes);
} }
[Fact] [Fact]
@ -36,7 +35,7 @@
ET"; ET";
var input = StringBytesTestConverter.Convert(s, false); var input = StringBytesTestConverter.Convert(s, false);
var result = parser.Parse(operationFactory, input.Bytes); var result = parser.Parse(input.Bytes);
Assert.Equal(7, result.Count); Assert.Equal(7, result.Count);
@ -72,7 +71,7 @@ ET";
var input = StringBytesTestConverter.Convert(s, false); var input = StringBytesTestConverter.Convert(s, false);
var result = parser.Parse(operationFactory, input.Bytes); var result = parser.Parse(input.Bytes);
Assert.Equal(4, result.Count); Assert.Equal(4, result.Count);

View File

@ -3,6 +3,7 @@ namespace UglyToad.Pdf.Tests.Tokenization
{ {
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using ContentStream;
using Pdf.Cos; using Pdf.Cos;
using Pdf.Tokenization; using Pdf.Tokenization;
using Pdf.Tokenization.Tokens; using Pdf.Tokenization.Tokens;

View File

@ -0,0 +1,11 @@
namespace UglyToad.Pdf.Content
{
using ContentStream;
using IO;
/// <summary>
/// Creates <see cref="Page"/> instances from their page dictionaries.
/// </summary>
internal interface IPageFactory
{
/// <summary>
/// Creates the page with the given number from its dictionary.
/// </summary>
/// <param name="number">The number of the page being created.</param>
/// <param name="dictionary">The dictionary describing the page.</param>
/// <param name="pageTreeMembers">Values from the page tree which the page may fall back to.</param>
/// <param name="reader">The reader passed on to any parsing the implementation performs.</param>
/// <param name="isLenientParsing">Whether parsing should tolerate content which does not meet the specification.</param>
/// <returns>The created <see cref="Page"/>.</returns>
Page Create(int number, PdfDictionary dictionary, PageTreeMembers pageTreeMembers, IRandomAccessRead reader,
bool isLenientParsing);
}
}

View File

@ -0,0 +1,14 @@
namespace UglyToad.Pdf.Content
{
using ContentStream;
using Cos;
using Fonts;
using IO;
/// <summary>
/// Loads resources from resource dictionaries and provides access to the fonts by name.
/// </summary>
internal interface IResourceStore
{
/// <summary>
/// Loads the resources described by the given resource dictionary.
/// </summary>
/// <param name="dictionary">The resource dictionary to load from.</param>
/// <param name="reader">The reader passed on to any parsing the implementation performs.</param>
/// <param name="isLenientParsing">Whether parsing should tolerate content which does not meet the specification.</param>
void LoadResourceDictionary(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing);
/// <summary>
/// Gets the font for the given resource name.
/// </summary>
/// <param name="name">The name of the font resource.</param>
IFont GetFont(CosName name);
}
}

View File

@ -2,21 +2,9 @@
{ {
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Diagnostics;
using ContentStream;
using Cos;
using Filters;
using Geometry;
using Graphics;
using IO;
using Parser;
using Util;
public class Page public class Page
{ {
private readonly ParsingArguments parsingArguments;
private readonly PdfDictionary dictionary;
/// <summary> /// <summary>
/// The 1 indexed page number. /// The 1 indexed page number.
/// </summary> /// </summary>
@ -28,78 +16,16 @@
public IReadOnlyList<string> Text => Content?.Text ?? new string[0]; public IReadOnlyList<string> Text => Content?.Text ?? new string[0];
internal Page(int number, PdfDictionary dictionary, PageTreeMembers pageTreeMembers, ParsingArguments parsingArguments) internal Page(int number, MediaBox mediaBox, PageContent content)
{ {
if (number <= 0) if (number <= 0)
{ {
throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative."); throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative.");
} }
this.dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary));
this.parsingArguments = parsingArguments ?? throw new ArgumentNullException(nameof(parsingArguments));
Number = number; Number = number;
MediaBox = mediaBox;
var type = dictionary.GetName(CosName.TYPE); Content = content;
if (type != null && !type.Equals(CosName.PAGE) && !parsingArguments.IsLenientParsing)
{
throw new InvalidOperationException($"Created page number {number} but its type was specified as {type} rather than 'Page'.");
}
if (dictionary.TryGetItemOfType(CosName.MEDIA_BOX, out COSArray mediaboxArray))
{
var x1 = mediaboxArray.getInt(0);
var y1 = mediaboxArray.getInt(1);
var x2 = mediaboxArray.getInt(2);
var y2 = mediaboxArray.getInt(3);
MediaBox = new MediaBox(new PdfRectangle(x1, y1, x2, y2));
}
else
{
MediaBox = pageTreeMembers.GetMediaBox();
if (MediaBox == null)
{
if (parsingArguments.IsLenientParsing)
{
MediaBox = MediaBox.A4;
}
else
{
throw new InvalidOperationException("No mediabox was present for page: " + number);
}
}
}
if (dictionary.GetItemOrDefault(CosName.RESOURCES) is PdfDictionary resource)
{
parsingArguments.CachingProviders.ResourceContainer.LoadResourceDictionary(resource, parsingArguments);
}
var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject;
if (contentObject != null)
{
var contentStream = parsingArguments.Container.Get<DynamicParser>()
.Parse(parsingArguments, contentObject, false) as RawCosStream;
if (contentStream == null)
{
throw new InvalidOperationException("Failed to parse the content for the page: " + number);
}
var contents = contentStream.Decode(parsingArguments.Container.Get<IFilterProvider>());
var operations = parsingArguments.Container.Get<PageContentParser>()
.Parse(parsingArguments.Container.Get<IGraphicsStateOperationFactory>(), new ByteArrayInputBytes(contents));
var context = new ContentStreamProcessor(MediaBox.Bounds, parsingArguments.CachingProviders.ResourceContainer);
var content = context.Process(operations);
Content = content;
}
} }
} }
} }

View File

@ -0,0 +1,99 @@
namespace UglyToad.Pdf.Content
{
using System;
using ContentStream;
using Cos;
using Filters;
using Geometry;
using Graphics;
using IO;
using Parser;
/// <summary>
/// Builds <see cref="Page"/> instances from a page dictionary: resolves the media box,
/// loads the page's resources into the resource store and processes the content stream.
/// </summary>
internal class PageFactory : IPageFactory
{
    private readonly IResourceStore resourceStore;
    private readonly IPdfObjectParser pdfObjectParser;
    private readonly IFilterProvider filterProvider;
    private readonly IPageContentParser pageContentParser;

    public PageFactory(IResourceStore resourceStore, IPdfObjectParser pdfObjectParser, IFilterProvider filterProvider,
        IPageContentParser pageContentParser)
    {
        this.resourceStore = resourceStore;
        this.pdfObjectParser = pdfObjectParser;
        this.filterProvider = filterProvider;
        this.pageContentParser = pageContentParser;
    }

    /// <summary>
    /// Creates the page with the given number from its dictionary.
    /// </summary>
    /// <param name="number">The 1 indexed page number.</param>
    /// <param name="dictionary">The page dictionary.</param>
    /// <param name="pageTreeMembers">Used to obtain a media box when the page dictionary has none.</param>
    /// <param name="reader">The reader passed to the object parser and the resource store.</param>
    /// <param name="isLenientParsing">
    /// When <see langword="true"/> an unexpected page type is ignored and a missing media box defaults to A4.
    /// </param>
    /// <returns>The page with its media box and, where a content stream is present, its content.</returns>
    /// <exception cref="ArgumentNullException">The <paramref name="dictionary"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The <paramref name="number"/> is 0 or negative.</exception>
    /// <exception cref="InvalidOperationException">
    /// The page type or media box is invalid and parsing is not lenient, or the content stream could not be parsed.
    /// </exception>
    public Page Create(int number, PdfDictionary dictionary, PageTreeMembers pageTreeMembers, IRandomAccessRead reader,
        bool isLenientParsing)
    {
        if (dictionary == null)
        {
            throw new ArgumentNullException(nameof(dictionary));
        }

        // The Page constructor rejects these numbers anyway; fail before any parsing work is done
        // and before the resource store is modified for a page which can never be created.
        if (number <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative.");
        }

        var type = dictionary.GetName(CosName.TYPE);

        if (type != null && !type.Equals(CosName.PAGE) && !isLenientParsing)
        {
            throw new InvalidOperationException($"Page {number} had its type specified as {type} rather than 'Page'.");
        }

        var mediaBox = GetMediaBox(number, dictionary, pageTreeMembers, isLenientParsing);

        // Resources must be loaded before the content is processed since the processor reads from the store.
        if (dictionary.GetItemOrDefault(CosName.RESOURCES) is PdfDictionary resource)
        {
            resourceStore.LoadResourceDictionary(resource, reader, isLenientParsing);
        }

        var content = GetContent(number, dictionary, mediaBox, reader, isLenientParsing);

        return new Page(number, mediaBox, content);
    }

    // Reads the media box from the page dictionary, falling back to the page tree and then (if lenient) to A4.
    private static MediaBox GetMediaBox(int number, PdfDictionary dictionary, PageTreeMembers pageTreeMembers,
        bool isLenientParsing)
    {
        if (dictionary.TryGetItemOfType(CosName.MEDIA_BOX, out COSArray mediaboxArray))
        {
            var x1 = mediaboxArray.getInt(0);
            var y1 = mediaboxArray.getInt(1);
            var x2 = mediaboxArray.getInt(2);
            var y2 = mediaboxArray.getInt(3);

            return new MediaBox(new PdfRectangle(x1, y1, x2, y2));
        }

        // A missing page tree context is treated the same as one without a media box.
        var mediaBox = pageTreeMembers?.GetMediaBox();

        if (mediaBox != null)
        {
            return mediaBox;
        }

        if (isLenientParsing)
        {
            return MediaBox.A4;
        }

        throw new InvalidOperationException("No mediabox was present for page: " + number);
    }

    // Parses, decodes and processes the page's content stream; returns the default content when there is none.
    private PageContent GetContent(int number, PdfDictionary dictionary, MediaBox mediaBox, IRandomAccessRead reader,
        bool isLenientParsing)
    {
        if (!(dictionary.GetItemOrDefault(CosName.CONTENTS) is CosObject contentObject))
        {
            return default(PageContent);
        }

        // Pass the caller's leniency setting through rather than a hard-coded value.
        var contentStream = pdfObjectParser.Parse(contentObject.ToIndirectReference(), reader, isLenientParsing) as RawCosStream;

        if (contentStream == null)
        {
            throw new InvalidOperationException("Failed to parse the content for the page: " + number);
        }

        var contents = contentStream.Decode(filterProvider);

        var operations = pageContentParser.Parse(new ByteArrayInputBytes(contents));

        var context = new ContentStreamProcessor(mediaBox.Bounds, resourceStore);

        return context.Process(operations);
    }
}
}

View File

@ -6,26 +6,26 @@
using ContentStream; using ContentStream;
using ContentStream.TypedAccessors; using ContentStream.TypedAccessors;
using Cos; using Cos;
using IO;
using Logging; using Logging;
using Parser; using Parser;
using Parser.PageTree;
public class Pages public class Pages
{ {
private readonly ILog log;
private readonly Catalog catalog; private readonly Catalog catalog;
private readonly ParsingArguments arguments; private readonly IPdfObjectParser pdfObjectParser;
private readonly IPageFactory pageFactory;
private readonly IRandomAccessRead reader;
private readonly bool isLenientParsing;
private readonly PdfDictionary rootPageDictionary; private readonly PdfDictionary rootPageDictionary;
private readonly Dictionary<int, PdfDictionary> locatedPages = new Dictionary<int, PdfDictionary>(); private readonly Dictionary<int, PdfDictionary> locatedPages = new Dictionary<int, PdfDictionary>();
public int Count { get; } public int Count { get; }
internal Pages(Catalog catalog, ParsingArguments arguments) internal Pages(ILog log, Catalog catalog, IPdfObjectParser pdfObjectParser, IPageFactory pageFactory,
IRandomAccessRead reader, bool isLenientParsing)
{ {
if (arguments == null)
{
throw new ArgumentNullException(nameof(arguments));
}
if (catalog == null) if (catalog == null)
{ {
throw new ArgumentNullException(nameof(catalog)); throw new ArgumentNullException(nameof(catalog));
@ -38,9 +38,9 @@
throw new InvalidOperationException("No pages were present in the catalog for this PDF document"); throw new InvalidOperationException("No pages were present in the catalog for this PDF document");
} }
var pageObject = arguments.Container.Get<DynamicParser>().Parse(arguments, pages, false); var pagesObject = pdfObjectParser.Parse(pages.ToIndirectReference(), reader, isLenientParsing);
if (!(pageObject is PdfDictionary catalogPageDictionary)) if (!(pagesObject is PdfDictionary catalogPageDictionary))
{ {
throw new InvalidOperationException("Could not find the root pages object: " + pages); throw new InvalidOperationException("Could not find the root pages object: " + pages);
} }
@ -51,8 +51,12 @@
Count = count; Count = count;
this.log = log;
this.catalog = catalog; this.catalog = catalog;
this.arguments = arguments; this.pdfObjectParser = pdfObjectParser;
this.pageFactory = pageFactory;
this.reader = reader;
this.isLenientParsing = isLenientParsing;
} }
@ -60,7 +64,8 @@
{ {
if (locatedPages.TryGetValue(pageNumber, out PdfDictionary targetPageDictionary)) if (locatedPages.TryGetValue(pageNumber, out PdfDictionary targetPageDictionary))
{ {
return new Page(pageNumber, targetPageDictionary, new PageTreeMembers(), arguments); return pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader,
isLenientParsing);
} }
var observed = new List<int>(); var observed = new List<int>();
@ -73,8 +78,7 @@
throw new InvalidOperationException("Could not find the page with number: " + pageNumber); throw new InvalidOperationException("Could not find the page with number: " + pageNumber);
} }
var page = arguments.Container.Get<PageParser>() var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, isLenientParsing);
.Parse(pageNumber, targetPageDictionary, arguments);
locatedPages[pageNumber] = targetPageDictionary; locatedPages[pageNumber] = targetPageDictionary;
@ -108,8 +112,7 @@
if (!type.Equals(CosName.PAGES)) if (!type.Equals(CosName.PAGES))
{ {
arguments.Container.Get<ILog>() log.Warn("Did not find the expected type (Page or Pages) in dictionary: " + currentPageDictionary);
.Warn("Did not find the expected type (Page or Pages) in dictionary: " + currentPageDictionary);
return false; return false;
} }
@ -120,7 +123,7 @@
foreach (var kid in kids.OfType<CosObject>()) foreach (var kid in kids.OfType<CosObject>())
{ {
// todo: exit early // todo: exit early
var child = arguments.Container.Get<DynamicParser>().Parse(arguments, kid, false) as PdfDictionary; var child = pdfObjectParser.Parse(kid.ToIndirectReference(), reader, isLenientParsing) as PdfDictionary;
var thisPageMatches = FindPage(child, soughtPageNumber, pageNumbersObserved); var thisPageMatches = FindPage(child, soughtPageNumber, pageNumbersObserved);

View File

@ -4,31 +4,32 @@
using System.Collections.Generic; using System.Collections.Generic;
using ContentStream; using ContentStream;
using Cos; using Cos;
using Filters;
using Fonts; using Fonts;
using Fonts.Cmap;
using Fonts.Parser;
using IO; using IO;
using Parser; using Parser;
internal interface IResourceStore
{
IFont GetFont(CosName name);
}
internal class ResourceContainer : IResourceStore internal class ResourceContainer : IResourceStore
{ {
private readonly IPdfObjectParser pdfObjectParser;
private readonly IFontFactory fontFactory;
private readonly Dictionary<CosName, IFont> loadedFonts = new Dictionary<CosName, IFont>(); private readonly Dictionary<CosName, IFont> loadedFonts = new Dictionary<CosName, IFont>();
internal void LoadResourceDictionary(PdfDictionary dictionary, ParsingArguments arguments) public ResourceContainer(IPdfObjectParser pdfObjectParser, IFontFactory fontFactory)
{
this.pdfObjectParser = pdfObjectParser;
this.fontFactory = fontFactory;
}
public void LoadResourceDictionary(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
{ {
if (dictionary.TryGetValue(CosName.FONT, out var fontBase) && fontBase is PdfDictionary fontDictionary) if (dictionary.TryGetValue(CosName.FONT, out var fontBase) && fontBase is PdfDictionary fontDictionary)
{ {
LoadFontDictionary(fontDictionary, arguments); LoadFontDictionary(fontDictionary, reader, isLenientParsing);
} }
} }
private void LoadFontDictionary(PdfDictionary fontDictionary, ParsingArguments arguments) private void LoadFontDictionary(PdfDictionary fontDictionary, IRandomAccessRead reader, bool isLenientParsing)
{ {
foreach (var pair in fontDictionary) foreach (var pair in fontDictionary)
{ {
@ -39,24 +40,22 @@
if (!(pair.Value is CosObject objectKey)) if (!(pair.Value is CosObject objectKey))
{ {
if (arguments.IsLenientParsing) if (isLenientParsing)
{ {
continue; continue;
} }
throw new InvalidOperationException($"The font with name {pair.Key} did not link to an object key. Value was: {pair.Value}."); throw new InvalidOperationException($"The font with name {pair.Key} did not link to an object key. Value was: {pair.Value}.");
} }
var dynamicParser = arguments.Get<DynamicParser>(); var fontObject = pdfObjectParser.Parse(objectKey.ToIndirectReference(), reader, false) as PdfDictionary;
var fontObject = dynamicParser.Parse(arguments, objectKey, false) as PdfDictionary;
if (fontObject == null) if (fontObject == null)
{ {
throw new InvalidOperationException($"Could not retrieve the font with name: {pair.Key} which should have been object {objectKey.GetObjectNumber()}"); throw new InvalidOperationException($"Could not retrieve the font with name: {pair.Key} which should have been object {objectKey.GetObjectNumber()}");
} }
loadedFonts[pair.Key] = arguments.Get<FontFactory>().GetFont(fontObject, arguments); loadedFonts[pair.Key] = fontFactory.Get(fontObject, reader, isLenientParsing);
} }
} }

View File

@ -0,0 +1,15 @@
namespace UglyToad.Pdf.ContentStream
{
/// <summary>
/// Identifies an indirect object by its object number and generation number.
/// </summary>
public struct IndirectReference : System.IEquatable<IndirectReference>
{
    /// <summary>
    /// The object number.
    /// </summary>
    public long ObjectNumber { get; }

    /// <summary>
    /// The generation number.
    /// </summary>
    public int Generation { get; }

    public IndirectReference(long objectNumber, int generation)
    {
        ObjectNumber = objectNumber;
        Generation = generation;
    }

    /// <summary>
    /// Two references are equal when both the object number and the generation match.
    /// </summary>
    public bool Equals(IndirectReference other)
    {
        return ObjectNumber == other.ObjectNumber && Generation == other.Generation;
    }

    public override bool Equals(object obj)
    {
        return obj is IndirectReference other && Equals(other);
    }

    public override int GetHashCode()
    {
        // Explicit implementation avoids the boxing/reflection based ValueType defaults
        // when the reference is used as a dictionary or set key.
        unchecked
        {
            return (ObjectNumber.GetHashCode() * 397) ^ Generation;
        }
    }

    /// <summary>
    /// Formats the reference as "{object number} {generation} R".
    /// </summary>
    public override string ToString()
    {
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        return ObjectNumber.ToString(invariant) + " " + Generation.ToString(invariant) + " R";
    }
}
}

View File

@ -1,5 +1,7 @@
namespace UglyToad.Pdf.Cos namespace UglyToad.Pdf.Cos
{ {
using ContentStream;
public class CosObject : CosBase, ICosUpdateInfo public class CosObject : CosBase, ICosUpdateInfo
{ {
private CosBase baseObject; private CosBase baseObject;
@ -124,5 +126,10 @@
} }
public bool NeedsToBeUpdated { get; set; } public bool NeedsToBeUpdated { get; set; }
/// <summary>
/// Creates an <see cref="IndirectReference"/> from this object's object number and generation number.
/// </summary>
public IndirectReference ToIndirectReference() => new IndirectReference(objectNumber, generationNumber);
} }
} }

View File

@ -32,5 +32,7 @@
/// The definition of the character collection for the font. /// The definition of the character collection for the font.
/// </summary> /// </summary>
CharacterIdentifierSystemInfo SystemInfo { get; } CharacterIdentifierSystemInfo SystemInfo { get; }
CidFontType CidFontType { get; }
} }
} }

View File

@ -1,6 +1,5 @@
namespace UglyToad.Pdf.Fonts.CidFonts namespace UglyToad.Pdf.Fonts.CidFonts
{ {
using Cmap;
using Cos; using Cos;
/// <inheritdoc/> /// <inheritdoc/>
@ -14,5 +13,6 @@
public CosName SubType { get; } public CosName SubType { get; }
public CosName BaseFont { get; } public CosName BaseFont { get; }
public CharacterIdentifierSystemInfo SystemInfo { get; } public CharacterIdentifierSystemInfo SystemInfo { get; }
public CidFontType CidFontType => CidFontType.Type0;
} }
} }

View File

@ -1,6 +1,5 @@
namespace UglyToad.Pdf.Fonts.CidFonts namespace UglyToad.Pdf.Fonts.CidFonts
{ {
using Cmap;
using Cos; using Cos;
/// <inheritdoc /> /// <inheritdoc />
@ -14,5 +13,6 @@
public CosName SubType { get; } public CosName SubType { get; }
public CosName BaseFont { get; } public CosName BaseFont { get; }
public CharacterIdentifierSystemInfo SystemInfo { get; } public CharacterIdentifierSystemInfo SystemInfo { get; }
public CidFontType CidFontType => CidFontType.Type2;
} }
} }

View File

@ -6,7 +6,7 @@
using IO; using IO;
using Util.JetBrains.Annotations; using Util.JetBrains.Annotations;
public class CMap internal class CMap
{ {
public CharacterIdentifierSystemInfo Info { get; } public CharacterIdentifierSystemInfo Info { get; }
@ -30,6 +30,8 @@
[NotNull] [NotNull]
public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; } public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; }
public WritingMode WritingMode { get; }
public bool HasCidMappings => CidCharacterMappings.Count > 0 || CidRanges.Count > 0; public bool HasCidMappings => CidCharacterMappings.Count > 0 || CidRanges.Count > 0;
public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0; public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0;
@ -41,7 +43,7 @@
{ {
Info = info; Info = info;
Type = type; Type = type;
WMode = wMode; WritingMode = (WritingMode)wMode;
Name = name; Name = name;
Version = version; Version = version;
BaseFontCharacterMap = baseFontCharacterMap ?? throw new ArgumentNullException(nameof(baseFontCharacterMap)); BaseFontCharacterMap = baseFontCharacterMap ?? throw new ArgumentNullException(nameof(baseFontCharacterMap));
@ -51,8 +53,7 @@
maxCodeLength = CodespaceRanges.Max(x => x.CodeLength); maxCodeLength = CodespaceRanges.Max(x => x.CodeLength);
minCodeLength = CodespaceRanges.Min(x => x.CodeLength); minCodeLength = CodespaceRanges.Min(x => x.CodeLength);
} }
private int wmode = 0;
private string cmapName = null; private string cmapName = null;
private string cmapVersion = null; private string cmapVersion = null;
private int cmapType = -1; private int cmapType = -1;

View File

@ -0,0 +1,8 @@
namespace UglyToad.Pdf.Fonts.Cmap
{
/// <summary>
/// The writing mode of a CMap. The numeric values matter: the CMap's integer
/// writing mode value is cast directly to this enum.
/// </summary>
internal enum WritingMode
{
/// <summary>
/// Horizontal writing (value 0).
/// </summary>
Horizontal = 0,
/// <summary>
/// Vertical writing (value 1).
/// </summary>
Vertical = 1
}
}

View File

@ -0,0 +1,55 @@
namespace UglyToad.Pdf.Fonts.Composite
{
using System;
using Cmap;
using IO;
using Util.JetBrains.Annotations;
/// <summary>
/// Defines the information content (actual text) of the font
/// as opposed to the display format.
/// Wraps the optional ToUnicode CMap so callers do not have to null check it.
/// </summary>
internal class ToUnicodeCMap
{
    [CanBeNull]
    private readonly CMap cMap;

    /// <summary>
    /// Does the font provide a CMap to map CIDs to Unicode values?
    /// </summary>
    public bool CanMapToUnicode => cMap != null;

    /// <summary>
    /// Is this document (unexpectedly) using a predefined Identity-H/V CMap as its ToUnicode CMap?
    /// </summary>
    public bool IsUsingIdentityAsUnicodeMap { get; }

    /// <summary>
    /// Creates the wrapper for a font's ToUnicode CMap.
    /// </summary>
    /// <param name="cMap">The ToUnicode CMap, or null if the font does not provide one.</param>
    public ToUnicodeCMap([CanBeNull]CMap cMap)
    {
        this.cMap = cMap;

        // CMap names are identifiers rather than linguistic text so compare ordinally,
        // and tolerate a CMap which does not declare a name.
        IsUsingIdentityAsUnicodeMap = cMap?.Name != null
            && cMap.Name.StartsWith("Identity-", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Attempts to map the code to its Unicode value.
    /// </summary>
    /// <param name="code">The code to look up.</param>
    /// <param name="value">The Unicode value if found, otherwise null when no CMap is present.</param>
    /// <returns><see langword="false"/> if there is no CMap, otherwise the result of the CMap lookup.</returns>
    public bool TryGet(int code, out string value)
    {
        value = null;

        if (!CanMapToUnicode)
        {
            return false;
        }

        return cMap.TryConvertToUnicode(code, out value);
    }

    /// <summary>
    /// Reads a code from the input using the ToUnicode CMap.
    /// </summary>
    /// <exception cref="InvalidOperationException">The font has no ToUnicode CMap.</exception>
    public int ReadCode(IInputBytes inputBytes)
    {
        if (cMap == null)
        {
            // Previously surfaced as a NullReferenceException; make the misuse explicit.
            throw new InvalidOperationException("Cannot read a code because the font does not define a ToUnicode CMap.");
        }

        return cMap.ReadCode(inputBytes);
    }
}
}

View File

@ -0,0 +1,76 @@
namespace UglyToad.Pdf.Fonts.Composite
{
using System;
using CidFonts;
using Cmap;
using Cos;
using Geometry;
using IO;
using Util.JetBrains.Annotations;
/// <summary>
/// Defines glyphs using a CIDFont.
/// Character codes are read through the encoding <see cref="CMap"/>; the optional
/// <see cref="ToUnicode"/> CMap is only used to map those codes to text.
/// </summary>
internal class Type0Font : IFont
{
    /// <summary>
    /// The name of the font, this is the <see cref="BaseFont"/>.
    /// </summary>
    public CosName Name => BaseFont;

    [NotNull]
    public CosName BaseFont { get; }

    [NotNull]
    public ICidFont CidFont { get; }

    /// <summary>
    /// The encoding CMap for this font.
    /// </summary>
    [NotNull]
    public CMap CMap { get; }

    /// <summary>
    /// Wrapper for the ToUnicode CMap, never null even when the font has no ToUnicode CMap.
    /// </summary>
    [NotNull]
    public ToUnicodeCMap ToUnicode { get; }

    /// <summary>
    /// Whether the encoding CMap specifies vertical writing.
    /// </summary>
    public bool IsVertical => CMap.WritingMode == WritingMode.Vertical;

    /// <summary>
    /// Creates a Type 0 font.
    /// </summary>
    /// <param name="baseFont">The name of the font.</param>
    /// <param name="cidFont">The descendant CIDFont.</param>
    /// <param name="cmap">The encoding CMap.</param>
    /// <param name="toUnicodeCMap">The ToUnicode CMap, may be null if the font has none.</param>
    /// <exception cref="ArgumentNullException">Any of the first three arguments is null.</exception>
    public Type0Font(CosName baseFont, ICidFont cidFont, CMap cmap, CMap toUnicodeCMap)
    {
        BaseFont = baseFont ?? throw new ArgumentNullException(nameof(baseFont));
        CidFont = cidFont ?? throw new ArgumentNullException(nameof(cidFont));
        CMap = cmap ?? throw new ArgumentNullException(nameof(cmap));
        ToUnicode = new ToUnicodeCMap(toUnicodeCMap);
    }

    /// <summary>
    /// Reads the next character code from the input.
    /// </summary>
    /// <param name="bytes">The input positioned at the start of the code.</param>
    /// <param name="codeLength">The number of bytes consumed while reading the code.</param>
    /// <returns>The character code.</returns>
    public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
    {
        var current = bytes.CurrentOffset;

        // Codes must be read with the encoding CMap: the ToUnicode CMap is optional
        // (reading through it fails when absent) and is not the font's encoding.
        var code = CMap.ReadCode(bytes);

        codeLength = bytes.CurrentOffset - current;

        return code;
    }

    /// <summary>
    /// Attempts to get the Unicode text for the character code.
    /// </summary>
    /// <returns><see langword="false"/> if the font has no ToUnicode CMap or the code is not mapped.</returns>
    public bool TryGetUnicode(int characterCode, out string value)
    {
        value = null;

        if (!ToUnicode.CanMapToUnicode)
        {
            return false;
        }

        // According to PdfBox certain providers incorrectly using Identity CMaps as ToUnicode.
        if (ToUnicode.IsUsingIdentityAsUnicodeMap)
        {
            value = new string((char)characterCode, 1);

            return true;
        }

        return ToUnicode.TryGet(characterCode, out value);
    }

    /// <summary>
    /// Gets the displacement for the character code.
    /// </summary>
    public PdfVector GetDisplacement(int characterCode)
    {
        // TODO: placeholder, the same fixed displacement is returned for every character code.
        return new PdfVector(0.333m, 0);
    }
}
}

View File

@ -1,5 +1,6 @@
namespace UglyToad.Pdf.Fonts namespace UglyToad.Pdf.Fonts
{ {
using ContentStream;
using Cos; using Cos;
/// <summary> /// <summary>
@ -13,13 +14,13 @@
/// </remarks> /// </remarks>
internal class DescriptorFontFile internal class DescriptorFontFile
{ {
public CosObjectKey ObjectKey { get; } public IndirectReference ObjectKey { get; }
public byte[] FileBytes { get; } public byte[] FileBytes { get; }
public FontFileType FileType { get; } public FontFileType FileType { get; }
public DescriptorFontFile(CosObjectKey key, FontFileType fileType) public DescriptorFontFile(IndirectReference key, FontFileType fileType)
{ {
ObjectKey = key; ObjectKey = key;
FileBytes = new byte[0]; FileBytes = new byte[0];

View File

@ -0,0 +1,32 @@
namespace UglyToad.Pdf.Fonts.Exceptions
{
using System;
using System.Runtime.Serialization;
/// <summary>
/// The exception thrown when an error is encountered parsing a font from the PDF document.
/// This occurs where the format of the font program or dictionary does not meet the specification.
/// </summary>
/// <inheritdoc cref="Exception"/>
[Serializable]
public class InvalidFontFormatException : Exception
{
/// <summary>
/// Creates the exception without a message.
/// </summary>
public InvalidFontFormatException()
{
}
/// <summary>
/// Creates the exception with the given message.
/// </summary>
/// <param name="message">The message describing the error.</param>
public InvalidFontFormatException(string message) : base(message)
{
}
/// <summary>
/// Creates the exception with the given message and the exception which caused it.
/// </summary>
/// <param name="message">The message describing the error.</param>
/// <param name="inner">The exception which caused this exception.</param>
public InvalidFontFormatException(string message, Exception inner) : base(message, inner)
{
}
/// <summary>
/// Creates the exception from serialized data.
/// </summary>
/// <param name="info">The serialized object data.</param>
/// <param name="context">The context of the serialization source or destination.</param>
protected InvalidFontFormatException(
SerializationInfo info,
StreamingContext context) : base(info, context)
{
}
}
}

View File

@ -4,22 +4,26 @@
using System.Collections.Generic; using System.Collections.Generic;
using ContentStream; using ContentStream;
using Cos; using Cos;
using Exceptions;
using IO;
using Logging;
using Parser.Handlers; using Parser.Handlers;
using Pdf.Parser;
internal class FontFactory internal class FontFactory : IFontFactory
{ {
private readonly ILog log;
private readonly IReadOnlyDictionary<CosName, IFontHandler> handlers; private readonly IReadOnlyDictionary<CosName, IFontHandler> handlers;
public FontFactory(Type0FontHandler type0FontHandler) public FontFactory(ILog log, Type0FontHandler type0FontHandler)
{ {
this.log = log;
handlers = new Dictionary<CosName, IFontHandler> handlers = new Dictionary<CosName, IFontHandler>
{ {
{CosName.TYPE0, type0FontHandler} {CosName.TYPE0, type0FontHandler}
}; };
} }
public IFont GetFont(PdfDictionary dictionary, ParsingArguments arguments) public IFont Get(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
{ {
var type = dictionary.GetName(CosName.TYPE); var type = dictionary.GetName(CosName.TYPE);
@ -27,13 +31,13 @@
{ {
var message = "The font dictionary did not have type 'Font'. " + dictionary; var message = "The font dictionary did not have type 'Font'. " + dictionary;
if (arguments.IsLenientParsing) if (isLenientParsing)
{ {
arguments.Log.Error(message); log?.Error(message);
} }
else else
{ {
throw new InvalidOperationException(message); throw new InvalidFontFormatException(message);
} }
} }
@ -41,7 +45,7 @@
if (handlers.TryGetValue(subtype, out var handler)) if (handlers.TryGetValue(subtype, out var handler))
{ {
return handler.Generate(dictionary, arguments); return handler.Generate(dictionary, reader, isLenientParsing);
} }
throw new NotImplementedException($"Parsing not implemented for fonts of type: {subtype}, please submit a pull request or an issue."); throw new NotImplementedException($"Parsing not implemented for fonts of type: {subtype}, please submit a pull request or an issue.");
@ -49,3 +53,4 @@
} }
} }

View File

@ -10,16 +10,12 @@
internal interface IFont internal interface IFont
{ {
CosName Name { get; } CosName Name { get; }
CosName SubType { get; }
string BaseFontType { get; }
bool IsVertical { get; } bool IsVertical { get; }
int ReadCharacterCode(IInputBytes bytes, out int codeLength); int ReadCharacterCode(IInputBytes bytes, out int codeLength);
string GetUnicode(int characterCode); bool TryGetUnicode(int characterCode, out string value);
PdfVector GetDisplacement(int characterCode); PdfVector GetDisplacement(int characterCode);
} }
@ -51,6 +47,11 @@
return code; return code;
} }
// Not yet implemented for this font: every call throws NotImplementedException.
public bool TryGetUnicode(int characterCode, out string value)
{
throw new NotImplementedException();
}
public string GetUnicode(int characterCode) public string GetUnicode(int characterCode)
{ {
if (ToUnicode != null) if (ToUnicode != null)

View File

@ -0,0 +1,10 @@
namespace UglyToad.Pdf.Fonts
{
using ContentStream;
using IO;
/// <summary>
/// Creates <see cref="IFont"/> instances from font dictionaries.
/// </summary>
internal interface IFontFactory
{
/// <summary>
/// Gets the font described by the given font dictionary.
/// </summary>
/// <param name="dictionary">The font dictionary.</param>
/// <param name="reader">The reader passed on to any parsing the implementation performs.</param>
/// <param name="isLenientParsing">Whether parsing should tolerate content which does not meet the specification.</param>
/// <returns>The font for the dictionary.</returns>
IFont Get(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing);
}
}

View File

@ -9,7 +9,7 @@
using Tokenization.Scanner; using Tokenization.Scanner;
using Tokenization.Tokens; using Tokenization.Tokens;
public class CMapParser internal class CMapParser
{ {
private static readonly BaseFontRangeParser BaseFontRangeParser = new BaseFontRangeParser(); private static readonly BaseFontRangeParser BaseFontRangeParser = new BaseFontRangeParser();
private static readonly BaseFontCharacterParser BaseFontCharacterParser = new BaseFontCharacterParser(); private static readonly BaseFontCharacterParser BaseFontCharacterParser = new BaseFontCharacterParser();

View File

@ -1,10 +1,10 @@
namespace UglyToad.Pdf.Fonts.Parser.Handlers namespace UglyToad.Pdf.Fonts.Parser.Handlers
{ {
using ContentStream; using ContentStream;
using Pdf.Parser; using IO;
internal interface IFontHandler internal interface IFontHandler
{ {
IFont Generate(PdfDictionary dictionary, ParsingArguments parsingArguments); IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing);
} }
} }

View File

@ -1,9 +1,12 @@
namespace UglyToad.Pdf.Fonts.Parser.Handlers namespace UglyToad.Pdf.Fonts.Parser.Handlers
{ {
using System; using System;
using CidFonts;
using Cmap; using Cmap;
using Composite;
using ContentStream; using ContentStream;
using Cos; using Cos;
using Exceptions;
using Filters; using Filters;
using IO; using IO;
using Parts; using Parts;
@ -14,31 +17,35 @@
private readonly CidFontFactory cidFontFactory; private readonly CidFontFactory cidFontFactory;
private readonly CMapCache cMapCache; private readonly CMapCache cMapCache;
private readonly IFilterProvider filterProvider; private readonly IFilterProvider filterProvider;
private readonly IPdfObjectParser pdfObjectParser;
public Type0FontHandler(CidFontFactory cidFontFactory, CMapCache cMapCache, IFilterProvider filterProvider) public Type0FontHandler(CidFontFactory cidFontFactory, CMapCache cMapCache, IFilterProvider filterProvider, IPdfObjectParser pdfObjectParser)
{ {
this.cidFontFactory = cidFontFactory; this.cidFontFactory = cidFontFactory;
this.cMapCache = cMapCache; this.cMapCache = cMapCache;
this.filterProvider = filterProvider; this.filterProvider = filterProvider;
this.pdfObjectParser = pdfObjectParser;
} }
public IFont Generate(PdfDictionary dictionary, ParsingArguments arguments) public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
{ {
var dynamicParser = arguments.Get<DynamicParser>();
var baseFont = dictionary.GetName(CosName.BASE_FONT); var baseFont = dictionary.GetName(CosName.BASE_FONT);
var cMap = ReadEncoding(dictionary, out var isCMapPredefined); var cMap = ReadEncoding(dictionary, out var isCMapPredefined);
if (TryGetFirstDescendant(dictionary, out var descendantObject)) if (TryGetFirstDescendant(dictionary, out var descendantObject))
{ {
var parsed = dynamicParser.Parse(arguments, descendantObject, false); var parsed = pdfObjectParser.Parse(descendantObject.ToIndirectReference(), reader, isLenientParsing);
if (parsed is PdfDictionary descendantFontDictionary) if (parsed is PdfDictionary descendantFontDictionary)
{ {
ParseDescendant(descendantFontDictionary, arguments); ParseDescendant(descendantFontDictionary, reader, isLenientParsing);
} }
} }
else
{
throw new InvalidFontFormatException("No descendant font dictionary was declared for this Type 0 font. This dictionary should contain the CIDFont for the Type 0 font. " + dictionary);
}
var ucs2CMap = GetUcs2CMap(dictionary, isCMapPredefined, false); var ucs2CMap = GetUcs2CMap(dictionary, isCMapPredefined, false);
@ -47,22 +54,17 @@
{ {
var toUnicodeValue = dictionary[CosName.TO_UNICODE]; var toUnicodeValue = dictionary[CosName.TO_UNICODE];
var toUnicode = dynamicParser.Parse(arguments, toUnicodeValue as CosObject, false) as RawCosStream; var toUnicode = pdfObjectParser.Parse(((CosObject)toUnicodeValue).ToIndirectReference(), reader, isLenientParsing) as RawCosStream;
var decodedUnicodeCMap = toUnicode?.Decode(filterProvider); var decodedUnicodeCMap = toUnicode?.Decode(filterProvider);
if (decodedUnicodeCMap != null) if (decodedUnicodeCMap != null)
{ {
toUnicodeCMap = cMapCache.Parse(new ByteArrayInputBytes(decodedUnicodeCMap), arguments.IsLenientParsing); toUnicodeCMap = cMapCache.Parse(new ByteArrayInputBytes(decodedUnicodeCMap), isLenientParsing);
} }
} }
var font = new CompositeFont var font = new Type0Font(baseFont, new Type0CidFont(), cMap, toUnicodeCMap);
{
SubType = CosName.TYPE0,
ToUnicode = toUnicodeCMap,
BaseFont = baseFont
};
return font; return font;
} }
@ -91,7 +93,7 @@
return false; return false;
} }
private void ParseDescendant(PdfDictionary dictionary, ParsingArguments arguments) private void ParseDescendant(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
{ {
var type = dictionary.GetName(CosName.TYPE); var type = dictionary.GetName(CosName.TYPE);
if (!CosName.FONT.Equals(type)) if (!CosName.FONT.Equals(type))
@ -99,7 +101,7 @@
throw new InvalidOperationException($"Expected \'Font\' dictionary but found \'{type.Name}\'"); throw new InvalidOperationException($"Expected \'Font\' dictionary but found \'{type.Name}\'");
} }
cidFontFactory.Generate(dictionary, arguments, arguments.IsLenientParsing); cidFontFactory.Generate(dictionary, reader, isLenientParsing);
} }
private CMap ReadEncoding(PdfDictionary dictionary, out bool isCMapPredefined) private CMap ReadEncoding(PdfDictionary dictionary, out bool isCMapPredefined)

View File

@ -2,7 +2,6 @@
{ {
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.IO;
using CidFonts; using CidFonts;
using ContentStream; using ContentStream;
using Cos; using Cos;
@ -12,20 +11,25 @@
using Pdf.Parser; using Pdf.Parser;
using TrueType; using TrueType;
using TrueType.Parser; using TrueType.Parser;
using Util;
internal class CidFontFactory internal class CidFontFactory
{ {
private readonly FontDescriptorFactory descriptorFactory; private readonly FontDescriptorFactory descriptorFactory;
private readonly TrueTypeFontParser trueTypeFontParser; private readonly TrueTypeFontParser trueTypeFontParser;
private readonly IPdfObjectParser pdfObjectParser;
private readonly IFilterProvider filterProvider;
public CidFontFactory(FontDescriptorFactory descriptorFactory, TrueTypeFontParser trueTypeFontParser) public CidFontFactory(FontDescriptorFactory descriptorFactory, TrueTypeFontParser trueTypeFontParser,
IPdfObjectParser pdfObjectParser,
IFilterProvider filterProvider)
{ {
this.descriptorFactory = descriptorFactory; this.descriptorFactory = descriptorFactory;
this.trueTypeFontParser = trueTypeFontParser; this.trueTypeFontParser = trueTypeFontParser;
this.pdfObjectParser = pdfObjectParser;
this.filterProvider = filterProvider;
} }
public ICidFont Generate(PdfDictionary dictionary, ParsingArguments arguments, bool isLenientParsing) public ICidFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
{ {
var type = dictionary.GetName(CosName.TYPE); var type = dictionary.GetName(CosName.TYPE);
if (!CosName.FONT.Equals(type)) if (!CosName.FONT.Equals(type))
@ -37,12 +41,12 @@
var verticalWritingMetrics = ReadVerticalDisplacements(dictionary); var verticalWritingMetrics = ReadVerticalDisplacements(dictionary);
FontDescriptor descriptor = null; FontDescriptor descriptor = null;
if (TryGetFontDescriptor(dictionary, arguments, out var descriptorDictionary)) if (TryGetFontDescriptor(dictionary, reader, out var descriptorDictionary))
{ {
descriptor = descriptorFactory.Generate(descriptorDictionary, arguments.IsLenientParsing); descriptor = descriptorFactory.Generate(descriptorDictionary, isLenientParsing);
} }
ReadDescriptorFile(descriptor, arguments); ReadDescriptorFile(descriptor, reader, isLenientParsing);
var subType = dictionary.GetName(CosName.SUBTYPE); var subType = dictionary.GetName(CosName.SUBTYPE);
if (CosName.CID_FONT_TYPE0.Equals(subType)) if (CosName.CID_FONT_TYPE0.Equals(subType))
@ -58,8 +62,7 @@
return null; return null;
} }
private static bool TryGetFontDescriptor(PdfDictionary dictionary, ParsingArguments arguments, private bool TryGetFontDescriptor(PdfDictionary dictionary, IRandomAccessRead reader, out PdfDictionary descriptorDictionary)
out PdfDictionary descriptorDictionary)
{ {
descriptorDictionary = null; descriptorDictionary = null;
@ -68,7 +71,7 @@
return false; return false;
} }
var descriptorObj = arguments.Get<DynamicParser>().Parse(arguments, obj, false); var descriptorObj = pdfObjectParser.Parse(obj.ToIndirectReference(), reader, false);
if (!(descriptorObj is PdfDictionary descriptor)) if (!(descriptorObj is PdfDictionary descriptor))
{ {
@ -80,21 +83,21 @@
return true; return true;
} }
private void ReadDescriptorFile(FontDescriptor descriptor, ParsingArguments arguments) private void ReadDescriptorFile(FontDescriptor descriptor, IRandomAccessRead reader, bool isLenientParsing)
{ {
if (descriptor?.FontFile == null) if (descriptor?.FontFile == null)
{ {
return; return;
} }
var fontFileStream = arguments.Get<DynamicParser>().Parse(arguments, descriptor.FontFile.ObjectKey, false) as RawCosStream; var fontFileStream = pdfObjectParser.Parse(descriptor.FontFile.ObjectKey, reader, isLenientParsing) as RawCosStream;
if (fontFileStream == null) if (fontFileStream == null)
{ {
return; return;
} }
var fontFile = fontFileStream.Decode(arguments.Get<IFilterProvider>()); var fontFile = fontFileStream.Decode(filterProvider);
switch (descriptor.FontFile.FileType) switch (descriptor.FontFile.FileType)
{ {

View File

@ -141,7 +141,7 @@
throw new NotSupportedException("We currently expect the FontFile to be an object reference."); throw new NotSupportedException("We currently expect the FontFile to be an object reference.");
} }
return new DescriptorFontFile(obj.GetObjectKey(), DescriptorFontFile.FontFileType.Type1); return new DescriptorFontFile(obj.ToIndirectReference(), DescriptorFontFile.FontFileType.Type1);
} }
if (dictionary.TryGetValue(CosName.FONT_FILE2, out value)) if (dictionary.TryGetValue(CosName.FONT_FILE2, out value))
@ -151,7 +151,7 @@
throw new NotSupportedException("We currently expect the FontFile2 to be an object reference."); throw new NotSupportedException("We currently expect the FontFile2 to be an object reference.");
} }
return new DescriptorFontFile(obj.GetObjectKey(), DescriptorFontFile.FontFileType.TrueType); return new DescriptorFontFile(obj.ToIndirectReference(), DescriptorFontFile.FontFileType.TrueType);
} }
if (dictionary.TryGetValue(CosName.FONT_FILE3, out value)) if (dictionary.TryGetValue(CosName.FONT_FILE3, out value))
@ -161,7 +161,7 @@
throw new NotSupportedException("We currently expect the FontFile3 to be an object reference."); throw new NotSupportedException("We currently expect the FontFile3 to be an object reference.");
} }
return new DescriptorFontFile(obj.GetObjectKey(), DescriptorFontFile.FontFileType.FromSubtype); return new DescriptorFontFile(obj.ToIndirectReference(), DescriptorFontFile.FontFileType.FromSubtype);
} }
return null; return null;

View File

@ -0,0 +1,7 @@
namespace UglyToad.Pdf.Geometry.Paths
{
/// <summary>
/// Placeholder for a general geometric path. The type currently declares no members.
/// </summary>
internal class GeneralPath
{
// TODO: provide an implementation
}
}

View File

@ -85,7 +85,7 @@
{ {
var code = font.ReadCharacterCode(bytes, out int codeLength); var code = font.ReadCharacterCode(bytes, out int codeLength);
var unicode = font.GetUnicode(code); font.TryGetUnicode(code, out var unicode);
var wordSpacing = 0m; var wordSpacing = 0m;
if (code == ' ' && codeLength == 1) if (code == ' ' && codeLength == 1)

View File

@ -1,12 +1,11 @@
namespace UglyToad.Pdf.Parser namespace UglyToad.Pdf.Parser
{ {
using System.Collections.Generic; using System.Collections.Generic;
using Graphics;
using Graphics.Operations; using Graphics.Operations;
using IO; using IO;
internal interface IPageContentParser internal interface IPageContentParser
{ {
IReadOnlyList<IGraphicsStateOperation> Parse(IGraphicsStateOperationFactory operationFactory, IInputBytes inputBytes); IReadOnlyList<IGraphicsStateOperation> Parse(IInputBytes inputBytes);
} }
} }

View File

@ -0,0 +1,218 @@
namespace UglyToad.Pdf.Parser
{
using System;
using System.Collections.Generic;
using System.Linq;
using ContentStream;
using Cos;
using IO;
using Logging;
using Parts;
using Util;
/// <summary>
/// Resolves an indirect reference to the object it identifies in a PDF document.
/// </summary>
internal interface IPdfObjectParser
{
/// <summary>
/// Parses the object identified by <paramref name="indirectReference"/>.
/// </summary>
/// <param name="indirectReference">The object number and generation of the object to load.</param>
/// <param name="reader">The document bytes to read the object from.</param>
/// <param name="isLenientParsing">
/// Whether to tolerate malformed input. Defaults to <c>true</c>.
/// </param>
/// <param name="requireExistingObject">
/// When <c>true</c> the object must have a known, positive file offset (i.e. be defined and
/// not stored inside an object stream). Defaults to <c>false</c>.
/// </param>
/// <returns>
/// The parsed object. The implementation in this file returns <c>CosNull.Null</c> when the
/// object cannot be located.
/// </returns>
CosBase Parse(IndirectReference indirectReference, IRandomAccessRead reader, bool isLenientParsing = true, bool requireExistingObject = false);
}
/// <summary>
/// Loads indirect objects from a PDF document, either directly from their byte offset in the
/// file or from inside a compressed object stream, using the cross reference table to locate them.
/// </summary>
internal class PdfObjectParser : IPdfObjectParser
{
    private readonly ILog log;
    private readonly CosBaseParser baseParser;
    private readonly CosStreamParser streamParser;
    private readonly CrossReferenceTable crossReferenceTable;
    private readonly BruteForceSearcher bruteForceSearcher;
    private readonly CosObjectPool objectPool;
    private readonly ObjectStreamParser objectStreamParser;

    /// <summary>
    /// Creates a new <see cref="PdfObjectParser"/>.
    /// </summary>
    /// <param name="log">Receives warnings and errors; a no-op log is used when <c>null</c>.</param>
    /// <param name="baseParser">Parses the object body found at a file offset.</param>
    /// <param name="streamParser">Parses the stream which follows a stream dictionary.</param>
    /// <param name="crossReferenceTable">Maps object keys to file offsets or (negated) object stream numbers.</param>
    /// <param name="bruteForceSearcher">Fallback source of object locations used in lenient mode.</param>
    /// <param name="objectPool">Pool of object wrappers, consulted first for already loaded objects.</param>
    /// <param name="objectStreamParser">Parses the objects contained in an object stream.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="log"/> is <c>null</c>.</exception>
    public PdfObjectParser(ILog log, CosBaseParser baseParser, CosStreamParser streamParser, CrossReferenceTable crossReferenceTable,
        BruteForceSearcher bruteForceSearcher,
        CosObjectPool objectPool,
        ObjectStreamParser objectStreamParser)
    {
        this.log = log ?? new NoOpLog();
        this.baseParser = baseParser ?? throw new ArgumentNullException(nameof(baseParser));
        this.streamParser = streamParser ?? throw new ArgumentNullException(nameof(streamParser));
        this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
        this.bruteForceSearcher = bruteForceSearcher ?? throw new ArgumentNullException(nameof(bruteForceSearcher));
        this.objectPool = objectPool ?? throw new ArgumentNullException(nameof(objectPool));
        this.objectStreamParser = objectStreamParser ?? throw new ArgumentNullException(nameof(objectStreamParser));
    }

    /// <summary>
    /// Parses the object identified by <paramref name="indirectReference"/>.
    /// </summary>
    /// <param name="indirectReference">The object number and generation of the object to load.</param>
    /// <param name="reader">The document bytes to read the object from.</param>
    /// <param name="isLenientParsing">
    /// When <c>true</c>, an object missing from the cross reference table is looked up using the
    /// brute force searcher and a missing <c>endobj</c> is logged rather than thrown.
    /// </param>
    /// <param name="requireExistingObject">
    /// When <c>true</c> the object must have a positive offset in the cross reference table.
    /// </param>
    /// <returns>The parsed object, or <c>CosNull.Null</c> if no location could be found for it.</returns>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="requireExistingObject"/> is set and the object is undefined or compressed,
    /// or the bytes at the object's offset are not the expected object.
    /// </exception>
    public CosBase Parse(IndirectReference indirectReference, IRandomAccessRead reader, bool isLenientParsing = true, bool requireExistingObject = false)
    {
        var key = new CosObjectKey(indirectReference.ObjectNumber, indirectReference.Generation);

        // An object already populated in the pool is returned without touching the reader.
        var pdfObject = objectPool.GetOrCreateDefault(key);
        if (pdfObject.GetObject() != null)
        {
            return pdfObject.GetObject();
        }

        // Positive values are byte offsets; values <= 0 are the negated number of the containing object stream.
        var offsetOrStreamNumber = TryGet(key, crossReferenceTable.ObjectOffsets);

        if (requireExistingObject && (offsetOrStreamNumber == null || offsetOrStreamNumber <= 0))
        {
            throw new InvalidOperationException("Object must be defined and not compressed: " + key);
        }

        if (isLenientParsing && offsetOrStreamNumber == null)
        {
            var locations = bruteForceSearcher.GetObjectLocations();

            offsetOrStreamNumber = TryGet(key, locations);

            if (offsetOrStreamNumber != null)
            {
                // Remember the discovered location so later lookups use the table directly.
                crossReferenceTable.UpdateOffset(key, offsetOrStreamNumber.Value);
            }
        }

        if (offsetOrStreamNumber == null)
        {
            return CosNull.Null;
        }

        var isCompressedStreamObject = offsetOrStreamNumber <= 0;

        if (!isCompressedStreamObject)
        {
            // NOTE(review): the parsed result is not written back to the pool wrapper here, so
            // objects read from a file offset appear to be re-parsed on every call - confirm intent.
            return ParseObjectFromFile(offsetOrStreamNumber.Value, reader, key, objectPool, isLenientParsing);
        }

        return ParseCompressedStreamObject(reader, -offsetOrStreamNumber.Value, indirectReference.ObjectNumber, isLenientParsing);
    }

    /// <summary>
    /// Reads the object located at <paramref name="offset"/> and verifies that it is the object
    /// identified by <paramref name="key"/>.
    /// </summary>
    private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader,
        CosObjectKey key,
        CosObjectPool pool,
        bool isLenientParsing)
    {
        reader.Seek(offset);

        var objectNumber = ObjectHelper.ReadObjectNumber(reader);
        var objectGeneration = ObjectHelper.ReadGenerationNumber(reader);

        ReadHelper.ReadExpectedString(reader, "obj", true);

        if (objectNumber != key.Number || objectGeneration != key.Generation)
        {
            throw new InvalidOperationException($"Xref for {key} points to object {objectNumber} {objectGeneration} at {offset}");
        }

        ReadHelper.SkipSpaces(reader);

        var baseObject = baseParser.Parse(reader, pool);

        var endObjectKey = ReadHelper.ReadString(reader);

        var atStreamStart = string.Equals(endObjectKey, "stream");

        if (atStreamStart)
        {
            // Step back over the keyword just consumed so the stream parser starts at 'stream'.
            var streamStartBytes = OtherEncodings.StringAsLatin1Bytes(endObjectKey);

            reader.Rewind(streamStartBytes.Length);

            baseObject = ReadNormalObjectStream(reader, baseObject, offset, isLenientParsing, out endObjectKey);
        }

        if (!string.Equals(endObjectKey, "endobj"))
        {
            var message =
                $"Object ({objectNumber}:{objectGeneration}) at offset {offset} does not end with \'endobj\' but with \'{endObjectKey}\'";

            if (isLenientParsing)
            {
                log.Warn(message);
            }
            else
            {
                throw new InvalidOperationException(message);
            }
        }

        return baseObject;
    }

    /// <summary>
    /// Parses the stream following the dictionary <paramref name="currentBase"/> and reads the
    /// keyword which terminates the object into <paramref name="endObjectKey"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException"><paramref name="currentBase"/> is not a dictionary.</exception>
    private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset,
        bool isLenientParsing,
        out string endObjectKey)
    {
        if (currentBase is PdfDictionary dictionary)
        {
            RawCosStream stream = streamParser.Parse(reader, dictionary, isLenientParsing);

            currentBase = stream;
        }
        else
        {
            // this is not legal
            // the combination of a dict and the stream/endstream
            // forms a complete stream object
            throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset}).");
        }

        ReadHelper.SkipSpaces(reader);
        endObjectKey = ReadHelper.ReadLine(reader);

        // we have case with a second 'endstream' before endobj
        // PDF keywords are compared ordinally: a culture-sensitive prefix match can succeed on text
        // which does not literally begin with the keyword, which would make Substring(9) slice wrongly.
        if (!endObjectKey.StartsWith("endobj", StringComparison.Ordinal)
            && endObjectKey.StartsWith("endstream", StringComparison.Ordinal))
        {
            endObjectKey = endObjectKey.Substring(9).Trim();
            if (endObjectKey.Length == 0)
            {
                // no other characters in extra endstream line
                // read next line
                endObjectKey = ReadHelper.ReadLine(reader);
            }
        }

        return currentBase;
    }

    /// <summary>
    /// Loads the object stream numbered <paramref name="streamObjectNumber"/>, registers the objects
    /// it contains in the pool and returns the one numbered <paramref name="requestedNumber"/>.
    /// </summary>
    /// <returns>The requested object's value, or <c>CosNull.Null</c> if it could not be found.</returns>
    private CosBase ParseCompressedStreamObject(IRandomAccessRead reader, long streamObjectNumber, long requestedNumber, bool isLenientParsing)
    {
        // The containing stream is looked up with generation 0 and must itself be an uncompressed object.
        var baseStream = Parse(new IndirectReference(streamObjectNumber, 0), reader, isLenientParsing, true);

        if (!(baseStream is RawCosStream stream))
        {
            log.Warn($"Could not find a stream for the object number, defaults to returning CosNull: {streamObjectNumber}");

            return CosNull.Null;
        }

        var objects = objectStreamParser.Parse(stream, objectPool);

        // register all objects which are referenced to be contained in object stream
        foreach (var next in objects)
        {
            var streamKey = new CosObjectKey(next);
            var offset = TryGet(streamKey, crossReferenceTable.ObjectOffsets);

            if (offset != null && offset == -streamObjectNumber)
            {
                var streamObject = objectPool.Get(streamKey);
                streamObject.SetObject(next.GetObject());
            }
        }

        var matchingStreamObject = objects.FirstOrDefault(x => x.GetObjectNumber() == requestedNumber);

        if (matchingStreamObject != null)
        {
            // Return the contained value rather than the wrapper so that the first lookup yields the
            // same kind of result as later lookups served from the pool (pdfObject.GetObject()).
            return matchingStreamObject.GetObject() ?? CosNull.Null;
        }

        log.Error($"Could not find the object {requestedNumber} in the stream for object {streamObjectNumber}. Returning CosNull.");

        return CosNull.Null;
    }

    /// <summary>
    /// Looks up <paramref name="key"/>, returning <c>null</c> instead of a default value when it is absent.
    /// </summary>
    private static T? TryGet<T, TKey>(TKey key, IReadOnlyDictionary<TKey, T> dictionary) where T : struct
    {
        return dictionary.TryGetValue(key, out var value) ? value : default(T?);
    }
}
}

View File

@ -9,7 +9,14 @@
internal class PageContentParser : IPageContentParser internal class PageContentParser : IPageContentParser
{ {
public IReadOnlyList<IGraphicsStateOperation> Parse(IGraphicsStateOperationFactory operationFactory, IInputBytes inputBytes) private readonly IGraphicsStateOperationFactory operationFactory;
public PageContentParser(IGraphicsStateOperationFactory operationFactory)
{
this.operationFactory = operationFactory;
}
public IReadOnlyList<IGraphicsStateOperation> Parse(IInputBytes inputBytes)
{ {
var scanner = new CoreTokenScanner(inputBytes); var scanner = new CoreTokenScanner(inputBytes);

View File

@ -1,57 +1,12 @@
namespace UglyToad.Pdf.Parser.PageTree namespace UglyToad.Pdf.Parser.PageTree
{ {
using System; using System;
using Content;
using ContentStream; using ContentStream;
using ContentStream.TypedAccessors; using ContentStream.TypedAccessors;
using Cos; using Cos;
using Filters; using Filters;
using Fonts; using Fonts;
internal class PageParser
{
public Page Parse(int number, PdfDictionary dictionary, ParsingArguments arguments)
{
if (dictionary == null)
{
throw new ArgumentNullException(nameof(dictionary));
}
if (arguments == null)
{
throw new ArgumentNullException(nameof(arguments));
}
if (!dictionary.IsType(CosName.PAGE))
{
throw new InvalidOperationException("Expected a Dictionary of Type Page, instead got this: " + dictionary);
}
return new Page(number, dictionary, new PageTreeMembers(), arguments);
}
}
internal class FontParser
{
public Font Parse(PdfDictionary dictionary, ParsingArguments arguments)
{
var type = dictionary.GetName(CosName.SUBTYPE);
if (CosName.Equals(type, CosName.TYPE0))
{
var compositeFont = arguments.Container.Get<CompositeFontParser>()
.Parse(dictionary, arguments);
}
else
{
var simpleFont = arguments.Container.Get<SimpleFontParser>()
.Parse(dictionary, arguments);
}
return new Font();
}
}
internal class CompositeFontParser internal class CompositeFontParser
{ {
public CompositeFont Parse(PdfDictionary dictionary, ParsingArguments arguments) public CompositeFont Parse(PdfDictionary dictionary, ParsingArguments arguments)

View File

@ -2,7 +2,6 @@
{ {
using System.Collections.Generic; using System.Collections.Generic;
using Content; using Content;
using ContentStream;
using Cos; using Cos;
/// <summary> /// <summary>
@ -37,34 +36,5 @@
{ {
return fonts.ContainsKey(name); return fonts.ContainsKey(name);
} }
internal bool GetFont(CosName name, ParsingArguments arguments, out Font value)
{
if (fontObjects.TryGetValue(name, out value))
{
return true;
}
if (!fonts.TryGetValue(name, out var key))
{
return false;
}
var dictionary = arguments.Container.Get<DynamicParser>()
.Parse(arguments, key, false) as PdfDictionary;
if (dictionary == null)
{
return false;
}
var font = arguments.Container.Get<FontParser>()
.Parse(dictionary, arguments);
fontObjects[name] = font;
// retrieve and cache
return false;
}
} }
} }

View File

@ -14,9 +14,9 @@
public BruteForceSearcher BruteForceSearcher { get; } public BruteForceSearcher BruteForceSearcher { get; }
public ResourceContainer ResourceContainer { get; } public IResourceStore ResourceContainer { get; }
public ParsingCachingProviders(CosObjectPool objectPool, BruteForceSearcher bruteForceSearcher, ResourceContainer resourceContainer) public ParsingCachingProviders(CosObjectPool objectPool, BruteForceSearcher bruteForceSearcher, IResourceStore resourceContainer)
{ {
ObjectPool = objectPool ?? throw new ArgumentNullException(nameof(objectPool)); ObjectPool = objectPool ?? throw new ArgumentNullException(nameof(objectPool));
BruteForceSearcher = bruteForceSearcher ?? throw new ArgumentNullException(nameof(bruteForceSearcher)); BruteForceSearcher = bruteForceSearcher ?? throw new ArgumentNullException(nameof(bruteForceSearcher));

View File

@ -5,7 +5,15 @@
using Content; using Content;
using ContentStream; using ContentStream;
using Cos; using Cos;
using Filters;
using Fonts;
using Fonts.Parser;
using Fonts.Parser.Handlers;
using Fonts.Parser.Parts;
using Fonts.TrueType.Parser;
using Graphics;
using IO; using IO;
using Logging;
using Parts; using Parts;
using Parts.CrossReference; using Parts.CrossReference;
using Util; using Util;
@ -37,6 +45,8 @@
private static PdfDocument OpenDocument(IRandomAccessRead reader, IContainer container, bool isLenientParsing) private static PdfDocument OpenDocument(IRandomAccessRead reader, IContainer container, bool isLenientParsing)
{ {
var log = container.Get<ILog>();
var version = container.Get<FileHeaderParser>().ReadHeader(reader, isLenientParsing); var version = container.Get<FileHeaderParser>().ReadHeader(reader, isLenientParsing);
var crossReferenceOffset = container.Get<FileTrailerParser>().GetXrefOffset(reader, isLenientParsing); var crossReferenceOffset = container.Get<FileTrailerParser>().GetXrefOffset(reader, isLenientParsing);
@ -46,9 +56,27 @@
var crossReferenceTable = container.Get<FileCrossReferenceTableParser>() var crossReferenceTable = container.Get<FileCrossReferenceTableParser>()
.Parse(reader, isLenientParsing, crossReferenceOffset, pool); .Parse(reader, isLenientParsing, crossReferenceOffset, pool);
var dynamicParser = container.Get<DynamicParser>(); var filterProvider = container.Get<IFilterProvider>();
var bruteForceSearcher = new BruteForceSearcher(reader); var bruteForceSearcher = new BruteForceSearcher(reader);
var resourceContainer = new ResourceContainer(); var pdfObjectParser = new PdfObjectParser(container.Get<ILog>(), container.Get<CosBaseParser>(),
container.Get<CosStreamParser>(), crossReferenceTable, bruteForceSearcher, pool, container.Get<ObjectStreamParser>());
var trueTypeFontParser = new TrueTypeFontParser();
var fontDescriptorFactory = new FontDescriptorFactory();
var cidFontFactory = new CidFontFactory(fontDescriptorFactory, trueTypeFontParser, pdfObjectParser, filterProvider);
var cMapCache = new CMapCache(new CMapParser());
var fontFactory = new FontFactory(container.Get<ILog>(), new Type0FontHandler(cidFontFactory,
cMapCache,
filterProvider,
pdfObjectParser));
var dynamicParser = container.Get<DynamicParser>();
var resourceContainer = new ResourceContainer(pdfObjectParser, fontFactory);
var pageFactory = new PageFactory(resourceContainer, pdfObjectParser, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()));
var root = ParseTrailer(reader, crossReferenceTable, dynamicParser, bruteForceSearcher, pool, var root = ParseTrailer(reader, crossReferenceTable, dynamicParser, bruteForceSearcher, pool,
isLenientParsing); isLenientParsing);
@ -66,7 +94,7 @@
var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer); var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer);
return new PdfDocument(reader, version, crossReferenceTable, container, isLenientParsing, caching, new Catalog(rootDictionary)); return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, new Catalog(rootDictionary));
} }
private static CosBase ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable, private static CosBase ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable,

View File

@ -4,6 +4,7 @@
using Content; using Content;
using Cos; using Cos;
using IO; using IO;
using Logging;
using Parser; using Parser;
using Parser.Parts; using Parser.Parts;
using Util; using Util;
@ -17,8 +18,8 @@
private readonly HeaderVersion version; private readonly HeaderVersion version;
[NotNull] [NotNull]
private readonly CrossReferenceTable crossReferenceTable; private readonly CrossReferenceTable crossReferenceTable;
[NotNull]
private readonly IContainer container; private readonly ILog log;
private readonly bool isLenientParsing; private readonly bool isLenientParsing;
[NotNull] [NotNull]
private readonly ParsingCachingProviders cachingProviders; private readonly ParsingCachingProviders cachingProviders;
@ -29,21 +30,21 @@
[NotNull] [NotNull]
public Pages Pages { get; } public Pages Pages { get; }
internal PdfDocument(IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable, internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable,
IContainer container,
bool isLenientParsing, bool isLenientParsing,
ParsingCachingProviders cachingProviders, ParsingCachingProviders cachingProviders,
IPageFactory pageFactory,
IPdfObjectParser pdfObjectParser,
Catalog catalog) Catalog catalog)
{ {
this.log = log;
this.reader = reader ?? throw new ArgumentNullException(nameof(reader)); this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
this.version = version ?? throw new ArgumentNullException(nameof(version)); this.version = version ?? throw new ArgumentNullException(nameof(version));
this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable)); this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
this.container = container ?? throw new ArgumentNullException(nameof(container));
this.isLenientParsing = isLenientParsing; this.isLenientParsing = isLenientParsing;
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders)); this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
Catalog = catalog ?? throw new ArgumentNullException(nameof(catalog)); Catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
var arguments = new ParsingArguments(reader, crossReferenceTable, cachingProviders, container, isLenientParsing); Pages = new Pages(log, Catalog, pdfObjectParser, pageFactory, reader, isLenientParsing);
Pages = new Pages(Catalog, arguments);
} }
public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options); public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options);

View File

@ -1,6 +1,7 @@
namespace UglyToad.Pdf.Tokenization namespace UglyToad.Pdf.Tokenization
{ {
using System.Collections.Generic; using System.Collections.Generic;
using ContentStream;
using IO; using IO;
using Parser.Parts; using Parser.Parts;
using Scanner; using Scanner;
@ -79,7 +80,7 @@
if (r == OperatorToken.R) if (r == OperatorToken.R)
{ {
result[key] = new IndirectReferenceToken(new IndirectReference(num.Long, gen.Long)); result[key] = new IndirectReferenceToken(new IndirectReference(num.Long, gen.Int));
i = i + 2; i = i + 2;
} }
} }

View File

@ -1,5 +1,7 @@
namespace UglyToad.Pdf.Tokenization.Tokens namespace UglyToad.Pdf.Tokenization.Tokens
{ {
using ContentStream;
public class IndirectReferenceToken : IDataToken<IndirectReference> public class IndirectReferenceToken : IDataToken<IndirectReference>
{ {
public IndirectReference Data { get; } public IndirectReference Data { get; }
@ -10,16 +12,4 @@
} }
} }
public struct IndirectReference
{
public long ObjectNumber { get; }
public long Generation { get; }
public IndirectReference(long objectNumber, long generation)
{
ObjectNumber = objectNumber;
Generation = generation;
}
}
} }

View File

@ -227,5 +227,9 @@
<EmbeddedResource Include="Resources\CMap\UniKS-UTF16-V" /> <EmbeddedResource Include="Resources\CMap\UniKS-UTF16-V" />
<EmbeddedResource Include="Resources\CMap\V" /> <EmbeddedResource Include="Resources\CMap\V" />
</ItemGroup> </ItemGroup>
<ItemGroup>
<Folder Include="Fonts\Simple\" />
</ItemGroup>
</Project> </Project>

View File

@ -1,12 +1,7 @@
namespace UglyToad.Pdf.Util namespace UglyToad.Pdf.Util
{ {
using Filters; using Filters;
using Fonts;
using Fonts.Parser; using Fonts.Parser;
using Fonts.Parser.Handlers;
using Fonts.Parser.Parts;
using Fonts.TrueType.Parser;
using Graphics;
using Logging; using Logging;
using Parser; using Parser;
using Parser.PageTree; using Parser.PageTree;
@ -49,19 +44,12 @@
new CrossReferenceTableParser(logger, dictionaryParser, baseParser)); new CrossReferenceTableParser(logger, dictionaryParser, baseParser));
var resourceDictionaryParser = new ResourceDictionaryParser(); var resourceDictionaryParser = new ResourceDictionaryParser();
var pageParser = new PageParser();
var simpleFontParser = new SimpleFontParser(); var simpleFontParser = new SimpleFontParser();
var compositeFontParser = new CompositeFontParser(); var compositeFontParser = new CompositeFontParser();
var fontParser = new FontParser();
var pageContentParser = new PageContentParser();
var operationFactory = new ReflectionGraphicsStateOperationFactory();
var cmapParser = new CMapParser(); var cmapParser = new CMapParser();
var afmParser = new AdobeFontMetricsParser(); var afmParser = new AdobeFontMetricsParser();
var type0FontFactory = new Type0FontHandler(new CidFontFactory(new FontDescriptorFactory(), new TrueTypeFontParser()), new CMapCache(cmapParser), filterProvider);
var fontFactory = new FontFactory(type0FontFactory);
var container = new Container(); var container = new Container();
container.Register(headerParser); container.Register(headerParser);
container.Register(trailerParser); container.Register(trailerParser);
@ -75,15 +63,11 @@
container.Register(objectStreamParser); container.Register(objectStreamParser);
container.Register(filterProvider); container.Register(filterProvider);
container.Register(resourceDictionaryParser); container.Register(resourceDictionaryParser);
container.Register(pageParser);
container.Register(simpleFontParser); container.Register(simpleFontParser);
container.Register(compositeFontParser); container.Register(compositeFontParser);
container.Register(fontParser);
container.Register(pageContentParser);
container.Register(operationFactory);
container.Register(cmapParser); container.Register(cmapParser);
container.Register(afmParser); container.Register(afmParser);
container.Register(fontFactory); container.Register(logger);
return container; return container;
} }