Change the content stream processor to extract letters rather than just strings.

This commit is contained in:
Eliot Jones
2017-12-27 19:11:27 +00:00
parent d826ac1b79
commit 940c51e2fb
14 changed files with 301 additions and 97 deletions

View File

@@ -0,0 +1,92 @@
// ReSharper disable ObjectCreationAsStatement
namespace UglyToad.Pdf.Tests.Fonts.Cmap
{
using System;
using Pdf.Fonts.Cmap;
using Xunit;
/// <summary>
/// Unit tests for <see cref="CidRange"/>: constructor validation, containment checks
/// and mapping of character codes to CIDs.
/// </summary>
public class CidRangeTests
{
    /// <summary>
    /// Builds the range shared by most tests: character codes 0 to 69, starting at the given CID.
    /// </summary>
    private static CidRange CreateRange(int firstCid = 0)
    {
        return new CidRange(0, 69, firstCid);
    }

    [Fact]
    public void EndCannotBeLowerThanStart()
    {
        // The last character code (-56) is below the first (0), so construction must be rejected.
        Action construct = () => new CidRange(0, -56, 1);

        Assert.Throws<ArgumentOutOfRangeException>(construct);
    }

    [Fact]
    public void ContainsFalseForLowerNumber()
    {
        Assert.False(CreateRange().Contains(-12));
    }

    [Fact]
    public void ContainsFalseForHigherNumber()
    {
        Assert.False(CreateRange().Contains(100));
    }

    [Fact]
    public void ContainsTrueForNumberInRange()
    {
        Assert.True(CreateRange().Contains(52));
    }

    [Fact]
    public void TryMapFalseForNumberLowerThanRange()
    {
        var wasMapped = CreateRange().TryMap(-12, out _);

        Assert.False(wasMapped);
    }

    [Fact]
    public void TryMapFalseForNumberHigherThanRange()
    {
        var wasMapped = CreateRange().TryMap(250, out _);

        Assert.False(wasMapped);
    }

    [Fact]
    public void TryMapMapsCorrectlyForNumberInRange()
    {
        // With a starting CID of 0 and a first code of 0 the CID equals the character code.
        var wasMapped = CreateRange().TryMap(52, out var mappedCid);

        Assert.True(wasMapped);
        Assert.Equal(52, mappedCid);
    }

    [Fact]
    public void TryMapMapsCorrectlyForNumberInRangeWithCidOffset()
    {
        // A starting CID of 9 shifts every mapped value by 9: 52 -> 61.
        var wasMapped = CreateRange(9).TryMap(52, out var mappedCid);

        Assert.True(wasMapped);
        Assert.Equal(61, mappedCid);
    }
}
}

View File

@@ -102,6 +102,27 @@ end";
Assert.NotNull(cmap); Assert.NotNull(cmap);
} }
[Fact]
public void CanParseIdentityHorizontalCMap()
{
    // Parse the embedded Identity-H resource and check the values the parser is expected to produce for it.
    var resourceBytes = ReadResourceBytes("UglyToad.Pdf.Resources.CMap.Identity-H");

    var parsed = cMapParser.Parse(new ByteArrayInputBytes(resourceBytes), false);

    var codespaceRanges = parsed.CodespaceRanges;
    Assert.Equal(1, codespaceRanges.Count);

    var codespace = codespaceRanges[0];
    Assert.Equal(0, codespace.StartInt);
    Assert.Equal(65535, codespace.EndInt);
    Assert.Equal(2, codespace.CodeLength);

    Assert.Equal(256, parsed.CidRanges.Count);
    Assert.Equal("10.003", parsed.Version);
}
private static byte[] ReadResourceBytes(string name) private static byte[] ReadResourceBytes(string name)
{ {
using (var resource = typeof(CMapParser).Assembly.GetManifestResourceStream(name)) using (var resource = typeof(CMapParser).Assembly.GetManifestResourceStream(name))
@@ -119,7 +140,7 @@ end";
foreach (var resource in resources) foreach (var resource in resources)
{ {
if (resource.Contains(".CMap.")) if (resource.Contains(".CMap.") && !resource.EndsWith("Identity-H"))
{ {
yield return new object[] {resource}; yield return new object[] {resource};
} }

View File

@@ -56,7 +56,7 @@
var page = document.Pages.GetPage(1); var page = document.Pages.GetPage(1);
Assert.Equal(1, page.Number); Assert.Equal(1, page.Number);
var text = string.Join(string.Empty, page.Content.Text); var text = string.Join(string.Empty, page.Content.Letters.Select(x => x.Value)).Replace("\u200B", string.Empty);
Assert.Equal("This is the document title There is some lede text here And then another line of text.".Replace(" ", string.Empty), text.Replace(" ", string.Empty)); Assert.Equal("This is the document title There is some lede text here And then another line of text.".Replace(" ", string.Empty), text.Replace(" ", string.Empty));
} }

View File

@@ -0,0 +1,26 @@
namespace UglyToad.Pdf.Content
{
using Geometry;
/// <summary>
/// A piece of text together with its location, width and font details.
/// All properties are read-only and are set once by the constructor.
/// </summary>
public class Letter
{
    /// <summary>
    /// Creates a new <see cref="Letter"/> from the supplied values; no validation is performed.
    /// </summary>
    /// <param name="value">The text of the letter.</param>
    /// <param name="location">The point associated with the letter.</param>
    /// <param name="width">The width recorded for the letter.</param>
    /// <param name="fontSize">The font size recorded for the letter.</param>
    /// <param name="fontName">The name of the font recorded for the letter.</param>
    public Letter(string value, PdfPoint location, decimal width, decimal fontSize, string fontName)
    {
        FontName = fontName;
        FontSize = fontSize;
        Width = width;
        Location = location;
        Value = value;
    }

    /// <summary>
    /// The text of this letter.
    /// </summary>
    public string Value { get; }

    /// <summary>
    /// The point associated with this letter.
    /// </summary>
    public PdfPoint Location { get; }

    /// <summary>
    /// The width recorded for this letter.
    /// </summary>
    public decimal Width { get; }

    /// <summary>
    /// The font size recorded for this letter.
    /// </summary>
    public decimal FontSize { get; }

    /// <summary>
    /// The name of the font recorded for this letter.
    /// </summary>
    public string FontName { get; }
}
}

View File

@@ -14,7 +14,7 @@
internal PageContent Content { get; } internal PageContent Content { get; }
public IReadOnlyList<string> Text => Content?.Text ?? new string[0]; public IReadOnlyList<Letter> Text => Content?.Letters ?? new Letter[0];
internal Page(int number, MediaBox mediaBox, PageContent content) internal Page(int number, MediaBox mediaBox, PageContent content)
{ {

View File

@@ -14,6 +14,6 @@
{ {
internal IReadOnlyList<IGraphicsStateOperation> GraphicsStateOperations { get; set; } internal IReadOnlyList<IGraphicsStateOperation> GraphicsStateOperations { get; set; }
public IReadOnlyList<string> Text { get; set; } public IReadOnlyList<Letter> Letters { get; set; }
} }
} }

View File

@@ -93,7 +93,9 @@
content = context.Process(operations); content = context.Process(operations);
} }
return new Page(number, mediaBox, content); var page = new Page(number, mediaBox, content);
return page;
} }
} }
} }

View File

@@ -39,11 +39,17 @@
[NotNull] [NotNull]
public IReadOnlyList<CodespaceRange> CodespaceRanges { get; } public IReadOnlyList<CodespaceRange> CodespaceRanges { get; }
/// <summary>
/// Associates ranges of character codes with their corresponding CID values.
/// </summary>
[NotNull] [NotNull]
public IReadOnlyList<CidRange> CidRanges { get; } public IReadOnlyList<CidRange> CidRanges { get; }
/// <summary>
/// Overrides CID mappings for single character codes.
/// </summary>
[NotNull] [NotNull]
public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; } public IReadOnlyDictionary<int, CidCharacterMapping> CidCharacterMappings { get; }
/// <summary> /// <summary>
/// Controls whether the font associated with the CMap writes horizontally or vertically. /// Controls whether the font associated with the CMap writes horizontally or vertically.
@@ -54,11 +60,16 @@
public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0; public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0;
private readonly int minCodeLength = 4; private readonly int minCodeLength;
private readonly int maxCodeLength; private readonly int maxCodeLength;
public CMap(CharacterIdentifierSystemInfo info, int type, int wMode, string name, string version, IReadOnlyDictionary<int, string> baseFontCharacterMap, IReadOnlyList<CodespaceRange> codespaceRanges, IReadOnlyList<CidRange> cidRanges, IReadOnlyList<CidCharacterMapping> cidCharacterMappings) public CMap(CharacterIdentifierSystemInfo info, int type, int wMode, string name, string version, IReadOnlyDictionary<int, string> baseFontCharacterMap, IReadOnlyList<CodespaceRange> codespaceRanges, IReadOnlyList<CidRange> cidRanges, IReadOnlyList<CidCharacterMapping> cidCharacterMappings)
{ {
if (cidCharacterMappings == null)
{
throw new ArgumentNullException(nameof(cidCharacterMappings));
}
Info = info; Info = info;
Type = type; Type = type;
WritingMode = (WritingMode)wMode; WritingMode = (WritingMode)wMode;
@@ -67,17 +78,18 @@
BaseFontCharacterMap = baseFontCharacterMap ?? throw new ArgumentNullException(nameof(baseFontCharacterMap)); BaseFontCharacterMap = baseFontCharacterMap ?? throw new ArgumentNullException(nameof(baseFontCharacterMap));
CodespaceRanges = codespaceRanges ?? throw new ArgumentNullException(nameof(codespaceRanges)); CodespaceRanges = codespaceRanges ?? throw new ArgumentNullException(nameof(codespaceRanges));
CidRanges = cidRanges ?? throw new ArgumentNullException(nameof(cidRanges)); CidRanges = cidRanges ?? throw new ArgumentNullException(nameof(cidRanges));
CidCharacterMappings = cidCharacterMappings ?? throw new ArgumentNullException(nameof(cidCharacterMappings));
maxCodeLength = CodespaceRanges.Max(x => x.CodeLength); maxCodeLength = CodespaceRanges.Max(x => x.CodeLength);
minCodeLength = CodespaceRanges.Min(x => x.CodeLength); minCodeLength = CodespaceRanges.Min(x => x.CodeLength);
var characterMappings = new Dictionary<int, CidCharacterMapping>();
foreach (var characterMapping in cidCharacterMappings)
{
characterMappings[characterMapping.SourceCharacterCode] = characterMapping;
} }
// CID mappings CidCharacterMappings = characterMappings;
private readonly Dictionary<int, int> codeToCid = new Dictionary<int, int>(); }
private readonly List<CidRange> codeToCidRanges = new List<CidRange>();
private static readonly string SPACE = " ";
private int spaceMapping = -1;
/// <summary> /// <summary>
/// Returns the sequence of Unicode characters for the given character code. /// Returns the sequence of Unicode characters for the given character code.
@@ -92,25 +104,23 @@
return found; return found;
} }
/** /// <summary>
* Returns the CID for the given character code. /// Converts a character code to a CID.
* /// </summary>
* @param code character code /// <param name="code">The character code.</param>
* @return CID /// <returns>The corresponding CID for the character code.</returns>
*/
public int ConvertToCid(int code) public int ConvertToCid(int code)
{ {
if (codeToCid.TryGetValue(code, out var cid)) if (CidCharacterMappings.TryGetValue(code, out var mapping))
{ {
return cid; return mapping.DestinationCid;
} }
foreach (CidRange range in codeToCidRanges) foreach (CidRange range in CidRanges)
{ {
int ch = range.Map((char)code); if (range.TryMap(code, out var cid))
if (ch != -1)
{ {
return ch; return cid;
} }
} }

View File

@@ -57,7 +57,8 @@
public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; set; } public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; set; }
public IReadOnlyList<CidRange> CidRanges { get; set; } private List<CidRange> cidRanges = new List<CidRange>();
public IReadOnlyList<CidRange> CidRanges => cidRanges;
public Dictionary<int, string> BaseFontCharacterMap { get; } = new Dictionary<int, string>(); public Dictionary<int, string> BaseFontCharacterMap { get; } = new Dictionary<int, string>();
@@ -100,8 +101,8 @@
public void UseCMap(CMap other) public void UseCMap(CMap other)
{ {
CodespaceRanges = Combine(CodespaceRanges, other.CodespaceRanges); CodespaceRanges = Combine(CodespaceRanges, other.CodespaceRanges);
CidCharacterMappings = Combine(CidCharacterMappings, other.CidCharacterMappings); CidCharacterMappings = Combine(CidCharacterMappings, other.CidCharacterMappings.Values.ToList());
CidRanges = Combine(CidRanges, other.CidRanges); cidRanges.AddRange(other.CidRanges);
if (other.BaseFontCharacterMap != null) if (other.BaseFontCharacterMap != null)
{ {
@@ -153,5 +154,10 @@
? OtherEncodings.BytesAsLatin1String(bytes) ? OtherEncodings.BytesAsLatin1String(bytes)
: Encoding.BigEndianUnicode.GetString(bytes); : Encoding.BigEndianUnicode.GetString(bytes);
} }
/// <summary>
/// Appends a single CID range to the ranges collected so far.
/// </summary>
/// <param name="range">The range to append.</param>
public void AddCidRange(CidRange range) => cidRanges.Add(range);
} }
} }

View File

@@ -1,14 +1,32 @@
namespace UglyToad.Pdf.Fonts.Cmap namespace UglyToad.Pdf.Fonts.Cmap
{ {
public class CidCharacterMapping /// <summary>
/// Maps from a single character code to its CID.
/// </summary>
public struct CidCharacterMapping
{ {
public int Source { get; } /// <summary>
public int Destination { get; } /// The character code.
/// </summary>
public int SourceCharacterCode { get; }
public CidCharacterMapping(int source, int destination) /// <summary>
/// The CID to map to.
/// </summary>
public int DestinationCid { get; }
/// <summary>
/// Creates a new single mapping from a character code to a CID.
/// </summary>
public CidCharacterMapping(int sourceCharacterCode, int destinationCid)
{ {
Source = source; SourceCharacterCode = sourceCharacterCode;
Destination = destination; DestinationCid = destinationCid;
}
public override string ToString()
{
return $"Code {SourceCharacterCode} -> CID {DestinationCid}";
} }
} }
} }

View File

@@ -1,48 +1,77 @@
namespace UglyToad.Pdf.Fonts.Cmap namespace UglyToad.Pdf.Fonts.Cmap
{ {
public class CidRange using System;
/// <summary>
/// Associates the beginning and end of a range of character codes with the starting CID for the range.
/// </summary>
public struct CidRange
{ {
private readonly char from; /// <summary>
/// The beginning of the range of character codes.
/// </summary>
private readonly int firstCharacterCode;
private readonly char to; /// <summary>
/// The end of the range of character codes.
/// </summary>
private readonly int lastCharacterCode;
/// <summary>
/// The CID associated with the beginning character code.
/// </summary>
private readonly int cid; private readonly int cid;
public CidRange(char from, char to, int cid) /// <summary>
/// Creates a new <see cref="CidRange"/> to associate a range of character codes to a range of CIDs.
/// </summary>
/// <param name="firstCharacterCode">The first character code in the range.</param>
/// <param name="lastCharacterCode">The last character code in the range.</param>
/// <param name="cid">The first CID for the range.</param>
public CidRange(int firstCharacterCode, int lastCharacterCode, int cid)
{ {
this.from = from; if (lastCharacterCode < firstCharacterCode)
this.to = to; {
throw new ArgumentOutOfRangeException(nameof(lastCharacterCode), "The last character code cannot be lower than the first character code: " +
$"First: {firstCharacterCode}, Last: {lastCharacterCode}, CID: {cid}");
}
this.firstCharacterCode = firstCharacterCode;
this.lastCharacterCode = lastCharacterCode;
this.cid = cid; this.cid = cid;
} }
/// <summary> /// <summary>
/// Maps the given Unicode character to the corresponding CID in this range. /// Determines if this <see cref="CidRange"/> contains a mapping for the character code.
/// </summary> /// </summary>
/// <param name="ch">Unicode character</param> public bool Contains(int characterCode)
/// <returns>corresponding CID, or -1 if the character is out of range</returns>
public int Map(char ch)
{ {
if (from <= ch && ch <= to) return firstCharacterCode <= characterCode && characterCode <= lastCharacterCode;
{
return cid + (ch - from);
}
return -1;
} }
/// <summary> /// <summary>
/// Maps the given CID to the corresponding Unicode character in this range. /// Attempts to map the given character code to the corresponding CID in this range.
/// </summary> /// </summary>
/// <param name="code">CID</param> /// <param name="characterCode">Character code</param>
/// <returns>corresponding Unicode character, or -1 if the CID is out of range</returns> /// <param name="cidValue">The CID if found.</param>
public int Unmap(int code) /// <returns><see langword="true"/> if the character code maps to a CID in this range or <see langword="false"/> if the character is out of range.</returns>
public bool TryMap(int characterCode, out int cidValue)
{ {
if (cid <= code && code <= cid + (to - from)) cidValue = 0;
if (Contains(characterCode))
{ {
return from + (code - cid); cidValue = cid + (characterCode - firstCharacterCode);
}
return -1; return true;
} }
return false;
} }
/// <summary>
/// Describes this range as its starting CID followed by the first and last character codes.
/// </summary>
public override string ToString() => $"CID {cid}: Code {firstCharacterCode} -> {lastCharacterCode}";
}
} }

View File

@@ -18,8 +18,14 @@
/// </summary> /// </summary>
public IReadOnlyList<byte> End { get; } public IReadOnlyList<byte> End { get; }
/// <summary>
/// The lower-bound of this range as an integer.
/// </summary>
public int StartInt { get; } public int StartInt { get; }
/// <summary>
/// The upper-bound of this range as an integer.
/// </summary>
public int EndInt { get; } public int EndInt { get; }
/// <summary> /// <summary>
@@ -55,25 +61,31 @@
/// <summary> /// <summary>
/// Returns true if the given code bytes match this codespace range. /// Returns true if the given code bytes match this codespace range.
/// </summary> /// </summary>
public bool IsFullMatch(byte[] code, int codeLen) public bool IsFullMatch(byte[] code, int codeLength)
{ {
if (code == null) if (code == null)
{ {
throw new ArgumentNullException(nameof(code)); throw new ArgumentNullException(nameof(code));
} }
// code must be the same length as the bounding codes // the code must be the same length as the bounding codes
if (codeLen == CodeLength) if (codeLength != CodeLength)
{ {
int value = code.ToInt(codeLen); return false;
}
var value = code.ToInt(codeLength);
if (value >= StartInt && value <= EndInt) if (value >= StartInt && value <= EndInt)
{ {
return true; return true;
} }
}
return false; return false;
} }
/// <summary>
/// Describes this codespace range as its code length followed by the integer lower and upper bounds.
/// </summary>
public override string ToString() => $"Length {CodeLength}: {StartInt} -> {EndInt}";
} }
} }

View File

@@ -1,8 +1,7 @@
namespace UglyToad.Pdf.Fonts.Parser.Parts namespace UglyToad.Pdf.Fonts.Parser.Parts
{ {
using System;
using System.Collections.Generic;
using Cmap; using Cmap;
using Exceptions;
using Tokenization.Scanner; using Tokenization.Scanner;
using Tokenization.Tokens; using Tokenization.Tokens;
@@ -10,37 +9,30 @@
{ {
public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing) public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
{ {
var ranges = new List<CidRange>();
for (var i = 0; i < numeric.Int; i++) for (var i = 0; i < numeric.Int; i++)
{ {
if (!scanner.TryReadToken(out HexToken startHexToken)) if (!scanner.TryReadToken(out HexToken startHexToken))
{ {
// TODO: message throw new InvalidFontFormatException("Could not find the starting hex token for the CIDRange in this font.");
throw new InvalidOperationException();
} }
if (!scanner.TryReadToken(out HexToken endHexToken)) if (!scanner.TryReadToken(out HexToken endHexToken))
{ {
// TODO: message throw new InvalidFontFormatException("Could not find the end hex token for the CIDRange in this font.");
throw new InvalidOperationException();
} }
if (!scanner.TryReadToken(out NumericToken mappedCode)) if (!scanner.TryReadToken(out NumericToken mappedCode))
{ {
// TODO: message throw new InvalidFontFormatException("Could not find the starting CID numeric token for the CIDRange in this font.");
throw new InvalidOperationException();
} }
var start = HexToken.ConvertHexBytesToInt(startHexToken); var start = HexToken.ConvertHexBytesToInt(startHexToken);
var end = HexToken.ConvertHexBytesToInt(endHexToken); var end = HexToken.ConvertHexBytesToInt(endHexToken);
var range = new CidRange((char)start, (char)end, mappedCode.Int); var range = new CidRange(start, end, mappedCode.Int);
ranges.Add(range); builder.AddCidRange(range);
} }
builder.CidRanges = ranges;
} }
} }
} }

View File

@@ -20,7 +20,7 @@
public int StackSize => graphicsStack.Count; public int StackSize => graphicsStack.Count;
public List<string> Texts = new List<string>(); public List<Letter> Letters = new List<Letter>();
public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore) public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore)
{ {
@@ -37,7 +37,7 @@
return new PageContent return new PageContent
{ {
GraphicsStateOperations = operations, GraphicsStateOperations = operations,
Text = Texts Letters = Letters
}; };
} }
@@ -87,8 +87,6 @@
font.TryGetUnicode(code, out var unicode); font.TryGetUnicode(code, out var unicode);
var width = font.GetWidth(code);
var wordSpacing = 0m; var wordSpacing = 0m;
if (code == ' ' && codeLength == 1) if (code == ' ' && codeLength == 1)
{ {
@@ -104,7 +102,7 @@
var displacement = font.GetDisplacement(code); var displacement = font.GetDisplacement(code);
ShowGlyph(renderingMatrix, font, code, unicode, displacement); ShowGlyph(renderingMatrix, font, code, unicode, displacement, fontSize);
decimal tx, ty; decimal tx, ty;
if (font.IsVertical) if (font.IsVertical)
@@ -124,15 +122,13 @@
} }
} }
private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font, private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font, int characterCode, string unicode, PdfVector displacement, decimal fontSize)
int characterCode, string unicode, PdfVector displacement)
{ {
if (unicode.Length == 1 && (unicode[0] == '\0' || unicode[0] == '\u200B'))
{
return;
}
var location = new PdfPoint(renderingMatrix.E, renderingMatrix.F); var location = new PdfPoint(renderingMatrix.E, renderingMatrix.F);
Texts.Add(unicode);
var letter = new Letter(unicode, location, displacement.X, fontSize, font.Name.Name);
Letters.Add(letter);
} }
} }
} }