change the content stream processor to extract letters rather than just strings.

This commit is contained in:
Eliot Jones
2017-12-27 19:11:27 +00:00
parent d826ac1b79
commit 940c51e2fb
14 changed files with 301 additions and 97 deletions

View File

@@ -0,0 +1,92 @@
// ReSharper disable ObjectCreationAsStatement
namespace UglyToad.Pdf.Tests.Fonts.Cmap
{
using System;
using Pdf.Fonts.Cmap;
using Xunit;
/// <summary>
/// Tests for <see cref="CidRange"/>: constructor validation, <c>Contains</c> and <c>TryMap</c>.
/// </summary>
public class CidRangeTests
{
    /// <summary>
    /// Builds the range used by most tests: character codes 0 to 69 starting at the given CID.
    /// </summary>
    private static CidRange CreateRange(int firstCid) => new CidRange(0, 69, firstCid);

    [Fact]
    public void EndCannotBeLowerThanStart()
    {
        // The last character code (-56) is lower than the first (0).
        Action createInvalidRange = () => new CidRange(0, -56, 1);

        Assert.Throws<ArgumentOutOfRangeException>(createInvalidRange);
    }

    [Fact]
    public void ContainsFalseForLowerNumber()
    {
        var cidRange = CreateRange(0);

        Assert.False(cidRange.Contains(-12));
    }

    [Fact]
    public void ContainsFalseForHigherNumber()
    {
        var cidRange = CreateRange(0);

        Assert.False(cidRange.Contains(100));
    }

    [Fact]
    public void ContainsTrueForNumberInRange()
    {
        var cidRange = CreateRange(0);

        Assert.True(cidRange.Contains(52));
    }

    [Fact]
    public void TryMapFalseForNumberLowerThanRange()
    {
        var cidRange = CreateRange(0);

        var wasMapped = cidRange.TryMap(-12, out _);

        Assert.False(wasMapped);
    }

    [Fact]
    public void TryMapFalseForNumberHigherThanRange()
    {
        var cidRange = CreateRange(0);

        var wasMapped = cidRange.TryMap(250, out _);

        Assert.False(wasMapped);
    }

    [Fact]
    public void TryMapMapsCorrectlyForNumberInRange()
    {
        var cidRange = CreateRange(0);

        var wasMapped = cidRange.TryMap(52, out var mappedCid);

        Assert.True(wasMapped);
        Assert.Equal(52, mappedCid);
    }

    [Fact]
    public void TryMapMapsCorrectlyForNumberInRangeWithCidOffset()
    {
        // The range starts at CID 9, so character code 52 maps to 9 + 52 = 61.
        var cidRange = CreateRange(9);

        var wasMapped = cidRange.TryMap(52, out var mappedCid);

        Assert.True(wasMapped);
        Assert.Equal(61, mappedCid);
    }
}
}

View File

@@ -102,6 +102,27 @@ end";
Assert.NotNull(cmap);
}
[Fact]
public void CanParseIdentityHorizontalCMap()
{
    // Parses the embedded Identity-H CMap resource and checks the values the parser extracted from it.
    var input = new ByteArrayInputBytes(ReadResourceBytes("UglyToad.Pdf.Resources.CMap.Identity-H"));

    var cmap = cMapParser.Parse(input, false);

    // Assert.Single checks that there is exactly one codespace range and returns it,
    // replacing the separate count assertion and the [0] index.
    var range = Assert.Single(cmap.CodespaceRanges);

    Assert.Equal(0, range.StartInt);
    Assert.Equal(65535, range.EndInt);
    Assert.Equal(2, range.CodeLength);

    Assert.Equal(256, cmap.CidRanges.Count);

    Assert.Equal("10.003", cmap.Version);
}
private static byte[] ReadResourceBytes(string name)
{
using (var resource = typeof(CMapParser).Assembly.GetManifestResourceStream(name))
@@ -119,7 +140,7 @@ end";
foreach (var resource in resources)
{
if (resource.Contains(".CMap."))
if (resource.Contains(".CMap.") && !resource.EndsWith("Identity-H"))
{
yield return new object[] {resource};
}

View File

@@ -56,7 +56,7 @@
var page = document.Pages.GetPage(1);
Assert.Equal(1, page.Number);
var text = string.Join(string.Empty, page.Content.Text);
var text = string.Join(string.Empty, page.Content.Letters.Select(x => x.Value)).Replace("\u200B", string.Empty);
Assert.Equal("This is the document title There is some lede text here And then another line of text.".Replace(" ", string.Empty), text.Replace(" ", string.Empty));
}

View File

@@ -0,0 +1,26 @@
namespace UglyToad.Pdf.Content
{
using Geometry;
/// <summary>
/// An immutable value holder describing a single piece of text together with
/// its location, width, font size and font name.
/// </summary>
public class Letter
{
    /// <summary>
    /// Creates a new <see cref="Letter"/> from the supplied values. The arguments are stored as given, without validation.
    /// </summary>
    /// <param name="value">The text of the letter.</param>
    /// <param name="location">The location of the letter.</param>
    /// <param name="width">The width of the letter.</param>
    /// <param name="fontSize">The font size of the letter.</param>
    /// <param name="fontName">The name of the font for the letter.</param>
    public Letter(string value, PdfPoint location, decimal width, decimal fontSize, string fontName)
    {
        // Text and font details.
        Value = value;
        FontName = fontName;
        FontSize = fontSize;

        // Placement details.
        Location = location;
        Width = width;
    }

    /// <summary>
    /// The text of this letter.
    /// </summary>
    public string Value { get; }

    /// <summary>
    /// The location of this letter.
    /// </summary>
    public PdfPoint Location { get; }

    /// <summary>
    /// The width of this letter.
    /// </summary>
    public decimal Width { get; }

    /// <summary>
    /// The font size of this letter.
    /// </summary>
    public decimal FontSize { get; }

    /// <summary>
    /// The name of the font for this letter.
    /// </summary>
    public string FontName { get; }
}
}

View File

@@ -14,7 +14,7 @@
internal PageContent Content { get; }
public IReadOnlyList<string> Text => Content?.Text ?? new string[0];
public IReadOnlyList<Letter> Text => Content?.Letters ?? new Letter[0];
internal Page(int number, MediaBox mediaBox, PageContent content)
{

View File

@@ -14,6 +14,6 @@
{
internal IReadOnlyList<IGraphicsStateOperation> GraphicsStateOperations { get; set; }
public IReadOnlyList<string> Text { get; set; }
public IReadOnlyList<Letter> Letters { get; set; }
}
}

View File

@@ -93,7 +93,9 @@
content = context.Process(operations);
}
return new Page(number, mediaBox, content);
var page = new Page(number, mediaBox, content);
return page;
}
}
}

View File

@@ -39,11 +39,17 @@
[NotNull]
public IReadOnlyList<CodespaceRange> CodespaceRanges { get; }
/// <summary>
/// Associates ranges of character codes with their corresponding CID values.
/// </summary>
[NotNull]
public IReadOnlyList<CidRange> CidRanges { get; }
/// <summary>
/// Overrides CID mappings for single character codes.
/// </summary>
[NotNull]
public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; }
public IReadOnlyDictionary<int, CidCharacterMapping> CidCharacterMappings { get; }
/// <summary>
/// Controls whether the font associated with the CMap writes horizontally or vertically.
@@ -54,11 +60,16 @@
public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0;
private readonly int minCodeLength = 4;
private readonly int minCodeLength;
private readonly int maxCodeLength;
public CMap(CharacterIdentifierSystemInfo info, int type, int wMode, string name, string version, IReadOnlyDictionary<int, string> baseFontCharacterMap, IReadOnlyList<CodespaceRange> codespaceRanges, IReadOnlyList<CidRange> cidRanges, IReadOnlyList<CidCharacterMapping> cidCharacterMappings)
{
if (cidCharacterMappings == null)
{
throw new ArgumentNullException(nameof(cidCharacterMappings));
}
Info = info;
Type = type;
WritingMode = (WritingMode)wMode;
@@ -67,17 +78,18 @@
BaseFontCharacterMap = baseFontCharacterMap ?? throw new ArgumentNullException(nameof(baseFontCharacterMap));
CodespaceRanges = codespaceRanges ?? throw new ArgumentNullException(nameof(codespaceRanges));
CidRanges = cidRanges ?? throw new ArgumentNullException(nameof(cidRanges));
CidCharacterMappings = cidCharacterMappings ?? throw new ArgumentNullException(nameof(cidCharacterMappings));
maxCodeLength = CodespaceRanges.Max(x => x.CodeLength);
minCodeLength = CodespaceRanges.Min(x => x.CodeLength);
}
// CID mappings
private readonly Dictionary<int, int> codeToCid = new Dictionary<int, int>();
private readonly List<CidRange> codeToCidRanges = new List<CidRange>();
private static readonly string SPACE = " ";
private int spaceMapping = -1;
var characterMappings = new Dictionary<int, CidCharacterMapping>();
foreach (var characterMapping in cidCharacterMappings)
{
characterMappings[characterMapping.SourceCharacterCode] = characterMapping;
}
CidCharacterMappings = characterMappings;
}
/// <summary>
/// Returns the sequence of Unicode characters for the given character code.
@@ -92,32 +104,30 @@
return found;
}
/**
* Returns the CID for the given character code.
*
* @param code character code
* @return CID
*/
/// <summary>
/// Converts a character code to a CID.
/// </summary>
/// <param name="code">The character code.</param>
/// <returns>The corresponding CID for the character code.</returns>
public int ConvertToCid(int code)
{
if (codeToCid.TryGetValue(code, out var cid))
if (CidCharacterMappings.TryGetValue(code, out var mapping))
{
return cid;
return mapping.DestinationCid;
}
foreach (CidRange range in codeToCidRanges)
foreach (CidRange range in CidRanges)
{
int ch = range.Map((char)code);
if (ch != -1)
if (range.TryMap(code, out var cid))
{
return ch;
return cid;
}
}
return 0;
}
public override string ToString()
{
return Name;

View File

@@ -57,7 +57,8 @@
public IReadOnlyList<CidCharacterMapping> CidCharacterMappings { get; set; }
public IReadOnlyList<CidRange> CidRanges { get; set; }
private List<CidRange> cidRanges = new List<CidRange>();
public IReadOnlyList<CidRange> CidRanges => cidRanges;
public Dictionary<int, string> BaseFontCharacterMap { get; } = new Dictionary<int, string>();
@@ -100,8 +101,8 @@
public void UseCMap(CMap other)
{
CodespaceRanges = Combine(CodespaceRanges, other.CodespaceRanges);
CidCharacterMappings = Combine(CidCharacterMappings, other.CidCharacterMappings);
CidRanges = Combine(CidRanges, other.CidRanges);
CidCharacterMappings = Combine(CidCharacterMappings, other.CidCharacterMappings.Values.ToList());
cidRanges.AddRange(other.CidRanges);
if (other.BaseFontCharacterMap != null)
{
@@ -153,5 +154,10 @@
? OtherEncodings.BytesAsLatin1String(bytes)
: Encoding.BigEndianUnicode.GetString(bytes);
}
public void AddCidRange(CidRange range)
{
cidRanges.Add(range);
}
}
}

View File

@@ -1,14 +1,32 @@
namespace UglyToad.Pdf.Fonts.Cmap
{
public class CidCharacterMapping
/// <summary>
/// Maps from a single character code to its CID.
/// </summary>
public struct CidCharacterMapping
{
public int Source { get; }
public int Destination { get; }
/// <summary>
/// The character code.
/// </summary>
public int SourceCharacterCode { get; }
public CidCharacterMapping(int source, int destination)
/// <summary>
/// The CID to map to.
/// </summary>
public int DestinationCid { get; }
/// <summary>
/// Creates a new single mapping from a character code to a CID.
/// </summary>
public CidCharacterMapping(int sourceCharacterCode, int destinationCid)
{
Source = source;
Destination = destination;
SourceCharacterCode = sourceCharacterCode;
DestinationCid = destinationCid;
}
public override string ToString()
{
return $"Code {SourceCharacterCode} -> CID {DestinationCid}";
}
}
}

View File

@@ -1,48 +1,77 @@
namespace UglyToad.Pdf.Fonts.Cmap
{
public class CidRange
using System;
/// <summary>
/// Associates the beginning and end of a range of character codes with the starting CID for the range.
/// </summary>
public struct CidRange
{
private readonly char from;
/// <summary>
/// The beginning of the range of character codes.
/// </summary>
private readonly int firstCharacterCode;
private readonly char to;
/// <summary>
/// The end of the range of character codes.
/// </summary>
private readonly int lastCharacterCode;
/// <summary>
/// The CID associated with the beginning character code.
/// </summary>
private readonly int cid;
public CidRange(char from, char to, int cid)
/// <summary>
/// Creates a new <see cref="CidRange"/> to associate a range of character codes to a range of CIDs.
/// </summary>
/// <param name="firstCharacterCode">The first character code in the range.</param>
/// <param name="lastCharacterCode">The last character code in the range.</param>
/// <param name="cid">The first CID for the range.</param>
public CidRange(int firstCharacterCode, int lastCharacterCode, int cid)
{
this.from = from;
this.to = to;
if (lastCharacterCode < firstCharacterCode)
{
throw new ArgumentOutOfRangeException(nameof(lastCharacterCode), "The last character code cannot be lower than the first character code: " +
$"First: {firstCharacterCode}, Last: {lastCharacterCode}, CID: {cid}");
}
this.firstCharacterCode = firstCharacterCode;
this.lastCharacterCode = lastCharacterCode;
this.cid = cid;
}
/// <summary>
/// Maps the given Unicode character to the corresponding CID in this range.
/// Determines if this <see cref="CidRange"/> contains a mapping for the character code.
/// </summary>
/// <param name="ch">Unicode character</param>
/// <returns>corresponding CID, or -1 if the character is out of range</returns>
public int Map(char ch)
public bool Contains(int characterCode)
{
if (from <= ch && ch <= to)
{
return cid + (ch - from);
}
return -1;
return firstCharacterCode <= characterCode && characterCode <= lastCharacterCode;
}
/// <summary>
/// Maps the given CID to the corresponding Unicode character in this range.
/// Attempts to map the given character code to the corresponding CID in this range.
/// </summary>
/// <param name="code">CID</param>
/// <returns>corresponding Unicode character, or -1 if the CID is out of range</returns>
public int Unmap(int code)
/// <param name="characterCode">Character code</param>
/// <param name="cidValue">The CID if found.</param>
/// <returns><see langword="true"/> if the character code maps to a CID in this range or <see langword="false"/> if the character is out of range.</returns>
public bool TryMap(int characterCode, out int cidValue)
{
if (cid <= code && code <= cid + (to - from))
cidValue = 0;
if (Contains(characterCode))
{
return from + (code - cid);
cidValue = cid + (characterCode - firstCharacterCode);
return true;
}
return -1;
return false;
}
public override string ToString()
{
return $"CID {cid}: Code {firstCharacterCode} -> {lastCharacterCode}";
}
}
}

View File

@@ -18,8 +18,14 @@
/// </summary>
public IReadOnlyList<byte> End { get; }
/// <summary>
/// The lower-bound of this range as an integer.
/// </summary>
public int StartInt { get; }
/// <summary>
/// The upper-bound of this range as an integer.
/// </summary>
public int EndInt { get; }
/// <summary>
@@ -55,25 +61,31 @@
/// <summary>
/// Returns true if the given code bytes match this codespace range.
/// </summary>
public bool IsFullMatch(byte[] code, int codeLen)
public bool IsFullMatch(byte[] code, int codeLength)
{
if (code == null)
{
throw new ArgumentNullException(nameof(code));
}
// code must be the same length as the bounding codes
if (codeLen == CodeLength)
// the code must be the same length as the bounding codes
if (codeLength != CodeLength)
{
int value = code.ToInt(codeLen);
if (value >= StartInt && value <= EndInt)
{
return true;
}
return false;
}
var value = code.ToInt(codeLength);
if (value >= StartInt && value <= EndInt)
{
return true;
}
return false;
}
public override string ToString()
{
return $"Length {CodeLength}: {StartInt} -> {EndInt}";
}
}
}

View File

@@ -1,8 +1,7 @@
namespace UglyToad.Pdf.Fonts.Parser.Parts
{
using System;
using System.Collections.Generic;
using Cmap;
using Exceptions;
using Tokenization.Scanner;
using Tokenization.Tokens;
@@ -10,37 +9,30 @@
{
public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
{
var ranges = new List<CidRange>();
for (var i = 0; i < numeric.Int; i++)
{
if (!scanner.TryReadToken(out HexToken startHexToken))
{
// TODO: message
throw new InvalidOperationException();
throw new InvalidFontFormatException("Could not find the starting hex token for the CIDRange in this font.");
}
if (!scanner.TryReadToken(out HexToken endHexToken))
{
// TODO: message
throw new InvalidOperationException();
throw new InvalidFontFormatException("Could not find the end hex token for the CIDRange in this font.");
}
if (!scanner.TryReadToken(out NumericToken mappedCode))
{
// TODO: message
throw new InvalidOperationException();
throw new InvalidFontFormatException("Could not find the starting CID numeric token for the CIDRange in this font.");
}
var start = HexToken.ConvertHexBytesToInt(startHexToken);
var end = HexToken.ConvertHexBytesToInt(endHexToken);
var range = new CidRange((char)start, (char)end, mappedCode.Int);
var range = new CidRange(start, end, mappedCode.Int);
ranges.Add(range);
builder.AddCidRange(range);
}
builder.CidRanges = ranges;
}
}
}

View File

@@ -20,7 +20,7 @@
public int StackSize => graphicsStack.Count;
public List<string> Texts = new List<string>();
public List<Letter> Letters = new List<Letter>();
public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore)
{
@@ -37,7 +37,7 @@
return new PageContent
{
GraphicsStateOperations = operations,
Text = Texts
Letters = Letters
};
}
@@ -87,8 +87,6 @@
font.TryGetUnicode(code, out var unicode);
var width = font.GetWidth(code);
var wordSpacing = 0m;
if (code == ' ' && codeLength == 1)
{
@@ -104,7 +102,7 @@
var displacement = font.GetDisplacement(code);
ShowGlyph(renderingMatrix, font, code, unicode, displacement);
ShowGlyph(renderingMatrix, font, code, unicode, displacement, fontSize);
decimal tx, ty;
if (font.IsVertical)
@@ -124,15 +122,13 @@
}
}
private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font,
int characterCode, string unicode, PdfVector displacement)
private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font, int characterCode, string unicode, PdfVector displacement, decimal fontSize)
{
if (unicode.Length == 1 && (unicode[0] == '\0' || unicode[0] == '\u200B'))
{
return;
}
var location = new PdfPoint(renderingMatrix.E, renderingMatrix.F);
Texts.Add(unicode);
var letter = new Letter(unicode, location, displacement.X, fontSize, font.Name.Name);
Letters.Add(letter);
}
}
}