Checkpoint check-in for Adobe Font Metrics (AFM) parsing

This commit is contained in:
Eliot Jones
2018-01-06 14:11:14 +00:00
parent 03f31a84e5
commit eb66611e55
12 changed files with 512 additions and 85 deletions

View File

@@ -1,5 +1,8 @@
namespace UglyToad.Pdf.Tests.Fonts.Parser
{
using System;
using System.IO;
using IO;
using Pdf.Fonts.Parser;
using Xunit;
@@ -79,5 +82,33 @@ C 37 ; WX 600 ; N percent ; B 81 -15 518 622 ;";
Assert.NotNull(metrics);
}
/// <summary>
/// Loads the embedded Helvetica AFM resource, parses it and checks that a metrics object comes back.
/// </summary>
[Fact]
public void CanParseHelveticaAfmFile()
{
    var afmBytes = GetResourceBytes("UglyToad.Pdf.Resources.AdobeFontMetrics.Helvetica.afm");

    var result = parser.Parse(new ByteArrayInputBytes(afmBytes), false);

    Assert.NotNull(result);
}
/// <summary>
/// Reads a manifest resource from the assembly containing <see cref="AdobeFontMetricsParser"/> into a byte array.
/// </summary>
/// <param name="name">The full manifest resource name.</param>
/// <returns>The complete content of the resource.</returns>
/// <exception cref="InvalidOperationException">No resource with the given name exists in that assembly.</exception>
private static byte[] GetResourceBytes(string name)
{
    var assembly = typeof(AdobeFontMetricsParser).Assembly;

    using (var buffer = new MemoryStream())
    using (var stream = assembly.GetManifestResourceStream(name))
    {
        if (stream != null)
        {
            stream.CopyTo(buffer);

            return buffer.ToArray();
        }

        throw new InvalidOperationException($"No assembly resource with name: {name}.");
    }
}
}
}

View File

@@ -0,0 +1,40 @@
namespace UglyToad.Pdf.Tests.Integration
{
using System;
using System.IO;
using System.Linq;
using Content;
using Xunit;
/// <summary>
/// Integration tests which open the "Single Page Type 1 Font.pdf" sample document.
/// </summary>
public class SinglePageType1FontTests
{
    /// <summary>
    /// Builds the full path of the sample document, three folders above the application base directory
    /// under Integration/Documents.
    /// </summary>
    private static string GetFilename()
    {
        var relativeFolder = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents");

        return Path.Combine(Path.GetFullPath(relativeFolder), "Single Page Type 1 Font.pdf");
    }

    /// <summary>
    /// Opens the document from its bytes and expects exactly one page.
    /// </summary>
    [Fact]
    public void HasCorrectNumberOfPages()
    {
        var bytes = File.ReadAllBytes(GetFilename());

        using (var document = PdfDocument.Open(bytes))
        {
            Assert.Equal(1, document.NumberOfPages);
        }
    }

    /// <summary>
    /// Opens the document from its path. No assertion is made yet; the page size check is still commented out.
    /// </summary>
    [Fact]
    public void HasCorrectPageSize()
    {
        var path = GetFilename();

        using (var document = PdfDocument.Open(path))
        {
            //var page = document.GetPage(1);
            //Assert.Equal(PageSize.Letter, page.Size);
        }
    }
}
}

View File

@@ -17,6 +17,7 @@
<None Remove="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf" />
<None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
<None Remove="Integration\Documents\Single Page Simple - from open office.pdf" />
<None Remove="Integration\Documents\Single Page Type 1 Font.pdf" />
<None Remove="Integration\Documents\Two Page Text Only - from libre office.pdf" />
</ItemGroup>
@@ -48,6 +49,9 @@
<Content Include="Integration\Documents\Single Page Simple - from open office.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Single Page Type 1 Font.pdf">
  <!-- PreserveNewest matches the other test documents and avoids re-copying the file on every build. -->
  <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Two Page Text Only - from libre office.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>

View File

@@ -14,13 +14,14 @@
private readonly ILog log;
private readonly IReadOnlyDictionary<CosName, IFontHandler> handlers;
/// <summary>
/// Stores the log and registers each supplied handler in the handlers dictionary,
/// keyed by the corresponding <see cref="CosName"/> constant (TYPE0, TRUE_TYPE, TYPE1).
/// </summary>
// NOTE(review): this diff hunk appears to show both the previous and the new form of the
// signature and of the TRUE_TYPE entry (without and with the Type1FontHandler) — confirm
// against the committed file which lines are live.
public FontFactory(ILog log, Type0FontHandler type0FontHandler, TrueTypeFontHandler trueTypeFontHandler)
public FontFactory(ILog log, Type0FontHandler type0FontHandler, TrueTypeFontHandler trueTypeFontHandler, Type1FontHandler type1FontHandler)
{
this.log = log;
// Lookup from font type name to the handler which builds fonts of that type.
handlers = new Dictionary<CosName, IFontHandler>
{
{CosName.TYPE0, type0FontHandler},
{CosName.TRUE_TYPE, trueTypeFontHandler}
{CosName.TRUE_TYPE, trueTypeFontHandler},
{CosName.TYPE1, type1FontHandler}
};
}

View File

@@ -1,6 +1,172 @@
namespace UglyToad.Pdf.Fonts
{
using System.Collections.Generic;
using Geometry;
/// <summary>
/// The result type returned by the Adobe Font Metrics parser. It currently declares no members.
/// </summary>
internal class FontMetrics
{
}
/// <summary>
/// Mutable accumulator for the values read from an Adobe Font Metrics (AFM) file.
/// </summary>
internal class FontMetricsBuilder
{
    /// <summary>
    /// Creates a builder for a file with the given AFM version.
    /// </summary>
    /// <param name="afmVersion">The version number stored in <see cref="AfmVersion"/>.</param>
    public FontMetricsBuilder(decimal afmVersion)
    {
        AfmVersion = afmVersion;
    }

    /// <summary>
    /// The AFM version supplied to the constructor.
    /// </summary>
    public decimal AfmVersion { get; }

    /// <summary>
    /// Comment entries collected while reading the file.
    /// </summary>
    public List<string> Comments { get; } = new List<string>();

    /// <summary>
    /// Metrics collected for individual characters.
    /// </summary>
    public List<IndividualCharacterMetric> CharacterMetrics { get; } = new List<IndividualCharacterMetric>();

    /// <summary>
    /// The font's name as PostScript sees it.
    /// </summary>
    public string FontName { get; set; }

    /// <summary>
    /// The font's full text name.
    /// </summary>
    public string FullName { get; set; }

    /// <summary>
    /// The typeface family the font belongs to.
    /// </summary>
    public string FamilyName { get; set; }

    /// <summary>
    /// The font's weight.
    /// </summary>
    public string Weight { get; set; }

    /// <summary>
    /// Angle of the font's vertical strokes, in degrees counter-clockwise from the vertical.
    /// </summary>
    public decimal ItalicAngle { get; set; }

    /// <summary>
    /// True when the font is monospaced.
    /// </summary>
    public bool IsFixedPitch { get; set; }

    /// <summary>
    /// Dimensions of the font bounding box; assigned through <see cref="SetBoundingBox"/>.
    /// </summary>
    public PdfRectangle PdfBoundingBox { get; private set; }

    /// <summary>
    /// How far from the baseline an underline is drawn.
    /// </summary>
    public decimal UnderlinePosition { get; set; }

    /// <summary>
    /// Stroke width used for underlining.
    /// </summary>
    public decimal UnderlineThickness { get; set; }

    /// <summary>
    /// Version identifier of the font program.
    /// </summary>
    public string Version { get; set; }

    /// <summary>
    /// Trademark or copyright notice for the font name.
    /// </summary>
    public string Notice { get; set; }

    /// <summary>
    /// The encoding scheme entry (presumably the name of the font's default encoding — TODO confirm).
    /// </summary>
    public string EncodingScheme { get; set; }

    /// <summary>
    /// Code describing the mapping scheme of a non base font.
    /// </summary>
    public int MappingScheme { get; set; }

    /// <summary>
    /// The font's character set.
    /// </summary>
    public string CharacterSet { get; set; }

    /// <summary>
    /// Base font flag; defaults to true.
    /// </summary>
    public bool IsBaseFont { get; set; } = true;

    /// <summary>
    /// Y-value of the top of a capital H.
    /// </summary>
    public decimal CapHeight { get; set; }

    /// <summary>
    /// Y-value of the top of a lowercase x.
    /// </summary>
    public decimal XHeight { get; set; }

    /// <summary>
    /// Usually the y-value of the top of a lowercase d.
    /// </summary>
    public decimal Ascender { get; set; }

    /// <summary>
    /// Y-value of the bottom of a lowercase p.
    /// </summary>
    public decimal Descender { get; set; }

    /// <summary>
    /// Horizontal stem width.
    /// </summary>
    public decimal StdHw { get; set; }

    /// <summary>
    /// Vertical stem width.
    /// </summary>
    public decimal StdVw { get; set; }

    /// <summary>
    /// The character width vector; assigned through <see cref="SetCharacterWidth"/>.
    /// </summary>
    public CharacterWidth CharacterWidth { get; private set; }

    /// <summary>
    /// Replaces <see cref="PdfBoundingBox"/> with a rectangle built from the four values.
    /// </summary>
    public void SetBoundingBox(decimal x1, decimal y1, decimal x2, decimal y2)
        => PdfBoundingBox = new PdfRectangle(x1, y1, x2, y2);

    /// <summary>
    /// Replaces <see cref="CharacterWidth"/> with a width vector built from the two components.
    /// </summary>
    public void SetCharacterWidth(decimal x, decimal y)
        => CharacterWidth = new CharacterWidth(x, y);
}
/// <summary>
/// Holds the x and y components of the width vector of the font's characters.
/// When a value is present, IsFixedPitch is implied to be true.
/// </summary>
internal class CharacterWidth
{
    /// <summary>
    /// Creates a width vector from its two components.
    /// </summary>
    public CharacterWidth(decimal x, decimal y)
    {
        X = x;
        Y = y;
    }

    /// <summary>
    /// The x component of the width vector.
    /// </summary>
    public decimal X { get; }

    /// <summary>
    /// The y component of the width vector.
    /// </summary>
    public decimal Y { get; }
}
/// <summary>
/// Mutable holder for the metrics of one character.
/// </summary>
internal class IndividualCharacterMetric
{
    /// <summary>
    /// The character's code.
    /// </summary>
    public int CharacterCode { get; set; }

    /// <summary>
    /// X component of the character width.
    /// </summary>
    public decimal WidthX { get; set; }

    /// <summary>
    /// Y component of the character width.
    /// </summary>
    public decimal WidthY { get; set; }

    /// <summary>
    /// X width for direction 0 (presumably writing direction 0 — TODO confirm).
    /// </summary>
    public decimal WidthXDirection0 { get; set; }

    /// <summary>
    /// Y width for direction 0 (presumably writing direction 0 — TODO confirm).
    /// </summary>
    public decimal WidthYDirection0 { get; set; }

    /// <summary>
    /// X width for direction 1 (presumably writing direction 1 — TODO confirm).
    /// </summary>
    public decimal WidthXDirection1 { get; set; }

    /// <summary>
    /// Y width for direction 1 (presumably writing direction 1 — TODO confirm).
    /// </summary>
    public decimal WidthYDirection1 { get; set; }

    /// <summary>
    /// The character's name.
    /// </summary>
    public string Name { get; set; }

    /// <summary>
    /// The character's V vector.
    /// </summary>
    public PdfVector VVector { get; set; }

    /// <summary>
    /// The character's bounding box.
    /// </summary>
    public PdfRectangle BoundingBox { get; set; }
}
}

View File

@@ -1,9 +1,10 @@
namespace UglyToad.Pdf.Fonts.Parser
{
using System;
using System.Text;
using Exceptions;
using IO;
using Tokenization.Scanner;
using Tokenization.Tokens;
using Pdf.Parser.Parts;
internal class AdobeFontMetricsParser : IAdobeFontMetricsParser
{
@@ -316,24 +317,103 @@
/// <summary>
/// Reads an Adobe Font Metrics file from <paramref name="bytes"/>: checks the leading keyword,
/// reads the version number, then consumes keyword entries until EndFontMetrics is returned by ReadString.
/// </summary>
/// <param name="bytes">The AFM content to read.</param>
/// <param name="useReducedDataSet">Not read by any of the lines visible in this hunk.</param>
/// <returns>A new, empty <see cref="FontMetrics"/>; see the review note at the return statement.</returns>
// NOTE(review): this diff hunk appears to interleave the removed tokenizer-based lines with their
// ReadString-based replacements (the braces do not balance as shown) — confirm against the committed file.
public FontMetrics Parse(IInputBytes bytes, bool useReducedDataSet)
{
var tokenizer = new CoreTokenScanner(bytes);
var token = ReadString(bytes);
tokenizer.MoveNext();
var current = tokenizer.CurrentToken;
if (!(current is OperatorToken operatorToken) || operatorToken.Data != StartFontMetrics)
// The first token must equal StartFontMetrics; the comparison ignores case.
if (!string.Equals(StartFontMetrics, token, StringComparison.OrdinalIgnoreCase))
{
throw new InvalidOperationException($"The font metrics file started with {current} rather than {StartFontMetrics}.");
throw new InvalidFontFormatException($"The AFM file was not valid, it did not start with {StartFontMetrics}.");
}
while (tokenizer.MoveNext())
{
// The token after the start keyword is parsed as the AFM version.
var version = ReadDecimal(bytes);
var builder = new FontMetricsBuilder(version);
// ReadString also returns EndFontMetrics once the input is exhausted, which ends this loop.
while ((token = ReadString(bytes)) != EndFontMetrics)
{
// Only these four keywords are handled so far; any other token is skipped.
switch (token)
{
case Comment:
builder.Comments.Add(ReadLine(bytes));
break;
case FontName:
builder.FontName = ReadLine(bytes);
break;
case FullName:
builder.FullName = ReadLine(bytes);
break;
case FamilyName:
builder.FamilyName = ReadLine(bytes);
break;
}
}
// NOTE(review): the populated builder is not used here; an empty FontMetrics is returned.
return new FontMetrics();
}
/// <summary>
/// Reads the next whitespace-delimited token from the input and parses it as a decimal.
/// </summary>
/// <param name="input">The bytes to read from.</param>
/// <returns>The parsed number.</returns>
/// <exception cref="FormatException">The token is not a valid decimal number.</exception>
private static decimal ReadDecimal(IInputBytes input)
{
    var str = ReadString(input);

    // The file is machine-readable text, so the number must be parsed independently of the
    // current thread culture (which may use ',' as the decimal separator).
    return decimal.Parse(str, System.Globalization.CultureInfo.InvariantCulture);
}
/// <summary>
/// Reads the next token and converts the literal "true" or "false" to a boolean.
/// </summary>
/// <param name="input">The bytes to read from.</param>
/// <returns>True for "true", false for "false".</returns>
/// <exception cref="InvalidFontFormatException">The token is neither "true" nor "false".</exception>
private static bool ReadBool(IInputBytes input)
{
    var text = ReadString(input);

    if (text == "true")
    {
        return true;
    }

    if (text == "false")
    {
        return false;
    }

    throw new InvalidFontFormatException($"The AFM should have contained a boolean but instead contained: {text}.");
}
private static readonly StringBuilder Builder = new StringBuilder();
/// <summary>
/// Reads the next whitespace-delimited token from the input.
/// </summary>
/// <param name="input">The bytes to read from; on return the current byte is the one which ended the token.</param>
/// <returns>
/// The token text, or EndFontMetrics when the input is already at its end or only whitespace remains.
/// </returns>
private static string ReadString(IInputBytes input)
{
    if (input.IsAtEnd())
    {
        return EndFontMetrics;
    }

    // Skip leading whitespace. Running out of input here means no further token exists, so report
    // the end sentinel rather than returning the trailing whitespace byte as if it were a token.
    while (ReadHelper.IsWhitespace(input.CurrentByte))
    {
        if (!input.MoveNext())
        {
            return EndFontMetrics;
        }
    }

    // A local buffer is used instead of the shared static Builder so that parses running at
    // the same time cannot corrupt each other's tokens.
    var token = new StringBuilder();

    token.Append((char)input.CurrentByte);

    while (input.MoveNext() && !ReadHelper.IsWhitespace(input.CurrentByte))
    {
        token.Append((char)input.CurrentByte);
    }

    return token.ToString();
}
/// <summary>
/// Reads the remainder of the current line, without the whitespace which precedes it.
/// </summary>
/// <param name="input">The bytes to read from; expected to be positioned on the byte following a keyword.</param>
/// <returns>
/// The text up to (not including) the next line break, or an empty string when the line holds no
/// further text or the input runs out.
/// </returns>
private static string ReadLine(IInputBytes input)
{
    // Skip blanks in front of the value but stop at a line break: an entry without a value must
    // produce an empty string instead of swallowing the following line as its value.
    while (ReadHelper.IsWhitespace(input.CurrentByte) && !ReadHelper.IsEndOfLine(input.CurrentByte))
    {
        if (!input.MoveNext())
        {
            return string.Empty;
        }
    }

    if (ReadHelper.IsEndOfLine(input.CurrentByte))
    {
        return string.Empty;
    }

    // A local buffer is used instead of the shared static Builder so that parses running at
    // the same time cannot corrupt each other's lines.
    var line = new StringBuilder();

    line.Append((char)input.CurrentByte);

    while (input.MoveNext() && !ReadHelper.IsEndOfLine(input.CurrentByte))
    {
        line.Append((char)input.CurrentByte);
    }

    return line.ToString();
}
}
internal interface IAdobeFontMetricsParser

View File

@@ -0,0 +1,80 @@
namespace UglyToad.Pdf.Fonts.Parser
{
using System.Linq;
using ContentStream;
using Cos;
using Exceptions;
using IO;
using Parts;
using Pdf.Parser;
/// <summary>
/// Shared accessors for the entries of a font dictionary. The messages are deliberately not tied
/// to one font type because more than one font handler calls these methods.
/// </summary>
internal static class FontDictionaryAccessHelper
{
    /// <summary>
    /// Gets the integer stored under <see cref="CosName.FIRST_CHAR"/>.
    /// </summary>
    /// <exception cref="InvalidFontFormatException">The dictionary has no such integer entry.</exception>
    public static int GetFirstCharacter(PdfDictionary dictionary)
    {
        if (!dictionary.TryGetItemOfType(CosName.FIRST_CHAR, out CosInt firstChar))
        {
            throw new InvalidFontFormatException(
                $"No first character entry was found in the font dictionary for this font: {dictionary}.");
        }

        return firstChar.AsInt();
    }

    /// <summary>
    /// Gets the integer stored under <see cref="CosName.LAST_CHAR"/>.
    /// </summary>
    /// <exception cref="InvalidFontFormatException">The dictionary has no such integer entry.</exception>
    public static int GetLastCharacter(PdfDictionary dictionary)
    {
        if (!dictionary.TryGetItemOfType(CosName.LAST_CHAR, out CosInt lastChar))
        {
            throw new InvalidFontFormatException(
                $"No last character entry was found in the font dictionary for this font: {dictionary}.");
        }

        return lastChar.AsInt();
    }

    /// <summary>
    /// Gets the array stored under <see cref="CosName.WIDTHS"/> as decimals.
    /// Every element is cast to <see cref="ICosNumber"/>, so a non-numeric element fails the cast.
    /// </summary>
    /// <exception cref="InvalidFontFormatException">The dictionary has no widths array.</exception>
    public static decimal[] GetWidths(PdfDictionary dictionary)
    {
        if (!dictionary.TryGetItemOfType(CosName.WIDTHS, out COSArray widthArray))
        {
            throw new InvalidFontFormatException($"No widths array was found in the font dictionary for this font: {dictionary}.");
        }

        return widthArray.Select(x => ((ICosNumber)x).AsDecimal()).ToArray();
    }

    /// <summary>
    /// Resolves the indirect reference stored under <see cref="CosName.FONT_DESC"/> and converts the
    /// referenced dictionary to a <see cref="FontDescriptor"/> using the supplied factory.
    /// </summary>
    /// <exception cref="InvalidFontFormatException">
    /// The reference is missing, or the referenced object is not a dictionary.
    /// </exception>
    public static FontDescriptor GetFontDescriptor(IPdfObjectParser pdfObjectParser, FontDescriptorFactory fontDescriptorFactory, PdfDictionary dictionary,
        IRandomAccessRead reader, bool isLenientParsing)
    {
        if (!dictionary.TryGetItemOfType(CosName.FONT_DESC, out CosObject obj))
        {
            throw new InvalidFontFormatException($"No font descriptor indirect reference found in the font: {dictionary}.");
        }

        var parsed = pdfObjectParser.Parse(obj.ToIndirectReference(), reader, isLenientParsing);

        if (!(parsed is PdfDictionary descriptorDictionary))
        {
            throw new InvalidFontFormatException($"Expected a font descriptor dictionary but instead found {parsed}.");
        }

        var descriptor = fontDescriptorFactory.Generate(descriptorDictionary, isLenientParsing);

        return descriptor;
    }

    /// <summary>
    /// Gets the font name, preferring the <see cref="CosName.BASE_FONT"/> entry of the dictionary and
    /// falling back to the name held by the descriptor.
    /// </summary>
    /// <exception cref="InvalidFontFormatException">Neither source provides a name.</exception>
    public static CosName GetName(PdfDictionary dictionary, FontDescriptor descriptor)
    {
        if (dictionary.TryGetName(CosName.BASE_FONT, out CosName name))
        {
            return name;
        }

        // A missing descriptor is treated like a descriptor without a name so the caller gets
        // the format exception below rather than a null reference failure.
        if (descriptor?.FontName != null)
        {
            return descriptor.FontName;
        }

        throw new InvalidFontFormatException($"Could not find a name for this font {dictionary}.");
    }
}
}

View File

@@ -37,17 +37,17 @@
public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
{
var firstCharacter = GetFirstCharacter(dictionary);
var firstCharacter = FontDictionaryAccessHelper.GetFirstCharacter(dictionary);
var lastCharacter = GetLastCharacter(dictionary);
var lastCharacter = FontDictionaryAccessHelper.GetLastCharacter(dictionary);
var widths = GetWidths(dictionary);
var widths = FontDictionaryAccessHelper.GetWidths(dictionary);
var descriptor = GetFontDescriptor(dictionary, reader, isLenientParsing);
var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfObjectParser, fontDescriptorFactory, dictionary, reader, isLenientParsing);
var font = ParseTrueTypeFont(descriptor, reader, isLenientParsing);
var name = GetName(dictionary, descriptor);
var name = FontDictionaryAccessHelper.GetName(dictionary, descriptor);
CMap toUnicodeCMap = null;
if (dictionary.TryGetItemOfType(CosName.TO_UNICODE, out CosObject toUnicodeObj))
@@ -92,57 +92,6 @@
return new TrueTypeSimpleFont(name, firstCharacter, lastCharacter, widths, descriptor, toUnicodeCMap, encoding);
}
/// <summary>
/// Gets the integer stored under <see cref="CosName.FIRST_CHAR"/>, throwing when it is absent.
/// </summary>
private static int GetFirstCharacter(PdfDictionary dictionary)
{
    if (dictionary.TryGetItemOfType(CosName.FIRST_CHAR, out CosInt entry))
    {
        return entry.AsInt();
    }

    throw new InvalidFontFormatException(
        $"No first character entry was found in the font dictionary for this TrueType font: {dictionary}.");
}
/// <summary>
/// Gets the integer stored under <see cref="CosName.LAST_CHAR"/>, throwing when it is absent.
/// </summary>
private static int GetLastCharacter(PdfDictionary dictionary)
{
    if (dictionary.TryGetItemOfType(CosName.LAST_CHAR, out CosInt entry))
    {
        return entry.AsInt();
    }

    throw new InvalidFontFormatException(
        $"No last character entry was found in the font dictionary for this TrueType font: {dictionary}.");
}
/// <summary>
/// Gets the array stored under <see cref="CosName.WIDTHS"/> as decimals, throwing when it is absent.
/// Every element is cast to <see cref="ICosNumber"/>.
/// </summary>
private static decimal[] GetWidths(PdfDictionary dictionary)
{
    if (dictionary.TryGetItemOfType(CosName.WIDTHS, out COSArray entries))
    {
        var values = entries.Select(entry => ((ICosNumber)entry).AsDecimal());

        return values.ToArray();
    }

    throw new InvalidFontFormatException($"No widths array was found in the font dictionary for this TrueType font: {dictionary}.");
}
/// <summary>
/// Resolves the indirect reference stored under <see cref="CosName.FONT_DESC"/> and converts the
/// referenced dictionary to a <see cref="FontDescriptor"/>.
/// </summary>
/// <exception cref="InvalidFontFormatException">
/// The reference is missing, or the referenced object is not a dictionary.
/// </exception>
private FontDescriptor GetFontDescriptor(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
{
    if (!dictionary.TryGetItemOfType(CosName.FONT_DESC, out CosObject reference))
    {
        throw new InvalidFontFormatException($"No font descriptor indirect reference found in the TrueType font: {dictionary}.");
    }

    var parsed = pdfObjectParser.Parse(reference.ToIndirectReference(), reader, isLenientParsing);

    if (parsed is PdfDictionary descriptorDictionary)
    {
        return fontDescriptorFactory.Generate(descriptorDictionary, isLenientParsing);
    }

    throw new InvalidFontFormatException($"Expected a font descriptor dictionary but instead found {parsed}.");
}
private TrueTypeFont ParseTrueTypeFont(FontDescriptor descriptor, IRandomAccessRead reader,
bool isLenientParsing)
{
@@ -170,20 +119,5 @@
return font;
}
/// <summary>
/// Gets the font name from the <see cref="CosName.BASE_FONT"/> entry, falling back to the
/// descriptor's font name, and throws when neither is available.
/// </summary>
private static CosName GetName(PdfDictionary dictionary, FontDescriptor descriptor)
{
    if (dictionary.TryGetName(CosName.BASE_FONT, out CosName baseFontName))
    {
        return baseFontName;
    }

    if (descriptor.FontName == null)
    {
        throw new InvalidFontFormatException($"Could not find a name for this TrueType font {dictionary}.");
    }

    return descriptor.FontName;
}
}
}

View File

@@ -0,0 +1,85 @@
namespace UglyToad.Pdf.Fonts.Parser.Handlers
{
using System;
using Cmap;
using ContentStream;
using Cos;
using Encodings;
using Exceptions;
using Filters;
using IO;
using Parts;
using Pdf.Parser;
using Simple;
/// <summary>
/// Font handler which builds an <see cref="IFont"/> from a font dictionary using the first/last
/// character, widths, descriptor, name, optional ToUnicode CMap and optional encoding entries.
/// </summary>
internal class Type1FontHandler : IFontHandler
{
    private readonly IPdfObjectParser pdfObjectParser;
    private readonly CMapCache cMapCache;
    private readonly IFilterProvider filterProvider;
    private readonly FontDescriptorFactory fontDescriptorFactory;

    /// <summary>
    /// Stores the collaborators used to resolve indirect objects, decode streams and parse CMaps.
    /// </summary>
    public Type1FontHandler(IPdfObjectParser pdfObjectParser, CMapCache cMapCache, IFilterProvider filterProvider, FontDescriptorFactory fontDescriptorFactory)
    {
        this.pdfObjectParser = pdfObjectParser;
        this.cMapCache = cMapCache;
        this.filterProvider = filterProvider;
        this.fontDescriptorFactory = fontDescriptorFactory;
    }

    /// <summary>
    /// Reads the font dictionary and creates the font.
    /// </summary>
    /// <exception cref="InvalidFontFormatException">A required entry is missing or an encoding name is not recognised.</exception>
    /// <exception cref="NotImplementedException">The encoding entry is present but is not a name.</exception>
    // NOTE(review): the result is a TrueTypeSimpleFont even though this is the Type 1 handler —
    // presumably a placeholder until a dedicated font type exists; confirm.
    public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
    {
        var firstCharacter = FontDictionaryAccessHelper.GetFirstCharacter(dictionary);
        var lastCharacter = FontDictionaryAccessHelper.GetLastCharacter(dictionary);
        var widths = FontDictionaryAccessHelper.GetWidths(dictionary);
        var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfObjectParser, fontDescriptorFactory, dictionary, reader, isLenientParsing);
        var name = FontDictionaryAccessHelper.GetName(dictionary, descriptor);

        var toUnicodeCMap = ReadToUnicodeCMap(dictionary, reader, isLenientParsing);
        var encoding = ReadEncoding(dictionary, descriptor);

        return new TrueTypeSimpleFont(name, firstCharacter, lastCharacter, widths, descriptor, toUnicodeCMap, encoding);
    }

    /// <summary>
    /// Parses the stream referenced by the ToUnicode entry; null when the entry is missing, does not
    /// resolve to a raw stream, or decodes to null.
    /// </summary>
    private CMap ReadToUnicodeCMap(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
    {
        if (!dictionary.TryGetItemOfType(CosName.TO_UNICODE, out CosObject toUnicodeObj))
        {
            return null;
        }

        var stream = pdfObjectParser.Parse(toUnicodeObj.ToIndirectReference(), reader, isLenientParsing) as PdfRawStream;
        var decoded = stream?.Decode(filterProvider);

        return decoded == null
            ? null
            : cMapCache.Parse(new ByteArrayInputBytes(decoded), isLenientParsing);
    }

    /// <summary>
    /// Resolves the encoding entry; null when the dictionary has no encoding entry.
    /// </summary>
    private static Encoding ReadEncoding(PdfDictionary dictionary, FontDescriptor descriptor)
    {
        if (!dictionary.TryGetValue(CosName.ENCODING, out var encodingBase))
        {
            return null;
        }

        Encoding encoding = null;

        // Symbolic fonts default to standard encoding.
        // NOTE(review): every branch below either overwrites this value or throws, so the default
        // never reaches the caller — confirm the intended behaviour.
        if (descriptor.Flags.HasFlag(FontFlags.Symbolic))
        {
            encoding = StandardEncoding.Instance;
        }

        switch (encodingBase)
        {
            case CosName encodingName:
                if (!Encoding.TryGetNamedEncoding(encodingName, out encoding))
                {
                    // TODO: PDFBox would not throw here.
                    throw new InvalidFontFormatException($"Unrecognised encoding name: {encodingName}");
                }

                return encoding;
            case CosDictionary _:
                throw new NotImplementedException("No support for reading encoding from dictionary yet.");
            default:
                throw new NotImplementedException("No support for reading encoding from font yet.");
        }
    }
}
}

View File

@@ -141,6 +141,11 @@ namespace UglyToad.Pdf.Parser.Parts
return IsLineFeed(c) || IsCarriageReturn(c);
}
/// <summary>
/// Whether the byte is a line feed or a carriage return.
/// </summary>
public static bool IsEndOfLine(byte b) => IsLineFeed(b) || IsCarriageReturn(b);
public static bool IsLineFeed(int c)
{
return AsciiLineFeed == c;

View File

@@ -86,7 +86,8 @@
cMapCache,
filterProvider,
pdfObjectParser),
new TrueTypeFontHandler(pdfObjectParser, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser));
new TrueTypeFontHandler(pdfObjectParser, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser),
new Type1FontHandler(pdfObjectParser, cMapCache, filterProvider, fontDescriptorFactory));
var dynamicParser = container.Get<DynamicParser>();
var resourceContainer = new ResourceContainer(pdfObjectParser, fontFactory);