fix bug with computing text positions

This commit is contained in:
Eliot Jones
2018-01-02 22:23:08 +00:00
parent d03c04cca1
commit 5ab8d69ea5
11 changed files with 136 additions and 20 deletions

View File

@@ -1,6 +1,7 @@
namespace UglyToad.Pdf.Tests.Integration
{
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Content;
@@ -51,5 +52,66 @@
Assert.Equal("Hello ﺪﻤﺤﻣ World. ", text);
}
}
/// <summary>
/// Checks that each letter extracted from page 1 has the same text and X position
/// (to 2 decimal places) as the corresponding PDFBox reference entry.
/// </summary>
[Fact]
public void LetterPositionsAreCorrectPdfBox()
{
    using (var document = PdfDocument.Open(GetFilename()))
    {
        var page = document.GetPage(1);
        var pdfBoxData = GetPdfBoxPositionData();

        // Index of the next reference entry which has not been matched yet.
        var index = 0;

        foreach (var pageLetter in page.Letters)
        {
            if (index >= pdfBoxData.Count)
            {
                break;
            }

            var expected = pdfBoxData[index];
            var myLetter = pageLetter.Value;

            // A space from our extraction with no matching reference entry is skipped
            // without consuming a reference entry.
            if (myLetter == " " && expected.Text != " ")
            {
                continue;
            }

            Assert.Equal(expected.Text, myLetter);
            Assert.Equal(expected.X, pageLetter.Location.X, 2);

            index++;
        }

        // Guard against a vacuous pass: every reference entry must have been compared,
        // otherwise an empty or truncated letter list would go unnoticed.
        Assert.Equal(pdfBoxData.Count, index);
    }
}
/// <summary>
/// Builds the reference letter data used by the position tests from an embedded text table.
/// Each non-empty line of the table is converted with <see cref="AssertablePositionData.Parse"/>.
/// </summary>
/// <remarks>
/// NOTE(review): the columns look like x, y, width, text, font size and font name as reported
/// by PDFBox - confirm against AssertablePositionData.Parse.
/// </remarks>
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
{
    const string pdfBoxOutput = @"90 90.65997 14.42556 H 19 FFJICI+TimesNewRomanPSMT
104.4395 90.65997 8.871117 e 19 FFJICI+TimesNewRomanPSMT
113.3247 90.65997 5.554443 l 19 FFJICI+TimesNewRomanPSMT
118.8931 90.65997 5.554443 l 19 FFJICI+TimesNewRomanPSMT
124.4615 90.65997 9.989998 o 19 FFJICI+TimesNewRomanPSMT
139.4505 90.65997 6.733261 ﺪ 19 FFJIAH+TimesNewRomanPSMT
146.1778 90.65997 7.872116 ﻤ 19 FFJIAH+TimesNewRomanPSMT
154.0439 90.65997 10.5894 ﺤ 19 FFJIAH+TimesNewRomanPSMT
164.6273 90.65997 7.872116 ﻣ 19 FFJIAH+TimesNewRomanPSMT
177.4964 90.65997 18.86111 W 19 FFJICI+TimesNewRomanPSMT
196.3575 90.65997 9.990005 o 19 FFJICI+TimesNewRomanPSMT
206.4275 90.65997 6.653336 r 19 FFJICI+TimesNewRomanPSMT
213.0808 90.65997 5.554443 l 19 FFJICI+TimesNewRomanPSMT
218.6352 90.65997 9.990005 d 19 FFJICI+TimesNewRomanPSMT
228.6252 90.65997 4.994995 . 19 FFJICI+TimesNewRomanPSMT";

    // Accept any newline convention; empty fragments produced by "\r\n" pairs are dropped.
    var lineBreaks = new[] {"\r", "\n", "\r\n"};
    var rows = pdfBoxOutput.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);

    return rows.Select(AssertablePositionData.Parse).ToList();
}
}
}

View File

@@ -1,15 +0,0 @@
128 64 32 16 8 4 2 1
0 0 0 0 | 0 0 0 0
03
0 0 0 0 | 0 0 1 1
20
0 0 1 0 | 0 0 0 0
37
0 0 1 1 | 0 1 1 1
54
0 1 0 1 | 0 1 0 0
41
0 1 0 0 | 0 0 0 1

View File

@@ -8,6 +8,7 @@
using Graphics;
using IO;
using Parser;
using Util;
internal class PageFactory : IPageFactory
{
@@ -61,6 +62,8 @@
var contents = contentStream.Decode(filterProvider);
var texty = OtherEncodings.BytesAsLatin1String(contents);
var operations = pageContentParser.Parse(new ByteArrayInputBytes(contents));
var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit);

View File

@@ -134,6 +134,35 @@
return new TransformationMatrix(result);
}
/// <summary>
/// Gets the horizontal scaling factor of this matrix.
/// </summary>
/// <remarks>
/// When both <c>B</c> and <c>C</c> are zero the value of <c>A</c> is returned unchanged.
/// Otherwise the matrix is treated as a rotation multiplied by a scaling:
/// <code>
/// ( x 0 0)   ( cos sin 0)   ( x*cos  x*sin 0)
/// ( 0 y 0) * (-sin cos 0) = (-y*sin  y*cos 0)
/// ( 0 0 1)   (   0   0 1)   (     0      0 1)
/// </code>
/// so M(0,0) = x*cos and M(0,1) = x*sin, and by Pythagoras
/// sqrt(M(0,0)^2 + M(0,1)^2) = sqrt(x^2 * (cos^2 + sin^2)) = sqrt(x^2) = abs(x).
/// </remarks>
/// <returns>The scaling factor applied along the x axis.</returns>
public decimal GetScalingFactorX()
{
    if (B == 0m && C == 0m)
    {
        return A;
    }

    var squaredLength = A*A + B*B;

    return (decimal)Math.Sqrt((double)squaredLength);
}
public override bool Equals(object obj)
{
if (!(obj is TransformationMatrix m))

View File

@@ -1,5 +1,6 @@
namespace UglyToad.Pdf.Fonts.CidFonts
{
using Core;
using Cos;
/// <summary>
@@ -32,6 +33,8 @@
/// </summary>
CharacterIdentifierSystemInfo SystemInfo { get; }
TransformationMatrix FontMatrix { get; }
CidFontType CidFontType { get; }
FontDescriptor Descriptor { get; }

View File

@@ -1,5 +1,6 @@
namespace UglyToad.Pdf.Fonts.CidFonts
{
using Core;
using Cos;
/// <inheritdoc/>
@@ -13,9 +14,15 @@
public CosName SubType { get; }
public CosName BaseFont { get; }
public CharacterIdentifierSystemInfo SystemInfo { get; }
public TransformationMatrix FontMatrix { get; }
public CidFontType CidFontType => CidFontType.Type0;
public FontDescriptor Descriptor { get; }
/// <summary>
/// Not yet implemented; always throws <see cref="System.NotImplementedException"/>.
/// </summary>
public Type0CidFont() => throw new System.NotImplementedException();
public decimal GetWidthFromFont(int characterCode)
{
throw new System.NotImplementedException();

View File

@@ -1,6 +1,7 @@
namespace UglyToad.Pdf.Fonts.CidFonts
{
using System.Collections.Generic;
using Core;
using Cos;
/// <inheritdoc />
@@ -18,6 +19,7 @@
public CosName SubType { get; }
public CosName BaseFont { get; }
public CharacterIdentifierSystemInfo SystemInfo { get; }
public TransformationMatrix FontMatrix { get; }
public CidFontType CidFontType => CidFontType.Type2;
public FontDescriptor Descriptor { get; }
@@ -34,10 +36,15 @@
this.fontProgram = fontProgram;
this.verticalWritingMetrics = verticalWritingMetrics;
this.widths = widths;
// TODO: This should maybe take units per em into account?
var scale = 1 / 1000m;
FontMatrix = TransformationMatrix.FromValues(scale, 0, 0, scale, 0, 0);
}
/// <summary>
/// Not yet implemented; always throws <see cref="System.NotImplementedException"/>.
/// </summary>
/// <param name="characterCode">The character code whose width is requested (currently unused).</param>
// TODO: Read the font width from the font program.
public decimal GetWidthFromFont(int characterCode) => throw new System.NotImplementedException();

View File

@@ -3,6 +3,7 @@
using System;
using CidFonts;
using Cmap;
using Core;
using Cos;
using Geometry;
using IO;
@@ -84,5 +85,10 @@
return fromFont;
}
/// <summary>
/// Gets the font matrix, taken directly from the underlying <c>CidFont</c>.
/// </summary>
public TransformationMatrix GetFontMatrix() => CidFont.FontMatrix;
}
}

View File

@@ -1,5 +1,6 @@
namespace UglyToad.Pdf.Fonts
{
using Core;
using Cos;
using Geometry;
using IO;
@@ -17,5 +18,7 @@
PdfVector GetDisplacement(int characterCode);
decimal GetWidth(int characterCode);
TransformationMatrix GetFontMatrix();
}
}

View File

@@ -3,6 +3,7 @@
using System;
using Cmap;
using Composite;
using Core;
using Cos;
using Encodings;
using Geometry;
@@ -11,6 +12,8 @@
internal class TrueTypeSimpleFont : IFont
{
private static readonly TransformationMatrix FontMatrix =
TransformationMatrix.FromValues(1/1000m, 0, 0, 1/1000m, 0, 0);
private readonly int firstCharacterCode;
private readonly int lastCharacterCode;
private readonly decimal[] widths;
@@ -108,5 +111,11 @@
return widths[index];
}
/// <summary>
/// Gets the font matrix, which for this font type is the shared static <c>FontMatrix</c> value.
/// </summary>
// TODO: should this also use units per em?
public TransformationMatrix GetFontMatrix() => FontMatrix;
}
}

View File

@@ -89,11 +89,13 @@
}
var fontSize = currentState.FontState.FontSize;
var horizontalScaling = currentState.FontState.HorizontalScaling;
var horizontalScaling = currentState.FontState.HorizontalScaling / 100m;
var characterSpacing = currentState.FontState.CharacterSpacing;
var transformationMatrix = currentState.CurrentTransformationMatrix;
var fontMatrix = font.GetFontMatrix();
// TODO: this does not seem correct, produces the correct result for now but we need to revisit.
// see: https://stackoverflow.com/questions/48010235/pdf-specification-get-font-size-in-points
var pointSize = decimal.Round(fontSize * transformationMatrix.A, 2);
@@ -114,12 +116,12 @@
if (font.IsVertical)
{
throw new NotImplementedException("Vertical fonts are currently unsupported, please submit a pull request or issue with an example file.");
// Vertical writing mode is not handled by this code path.
throw new NotImplementedException("Vertical fonts are currently unsupported, please submit a pull request or issue with an example file.");
}
var displacement = font.GetDisplacement(code);
var width = (displacement.X * fontSize) * transformationMatrix.A;
var width = displacement.X * fontSize * TextMatrices.TextMatrix.GetScalingFactorX() * transformationMatrix.A;
ShowGlyph(renderingMatrix, font, code, unicode, width, fontSize, pointSize);
@@ -148,7 +150,7 @@
var textState = currentState.FontState;
var fontSize = textState.FontSize;
var horizontalScaling = textState.HorizontalScaling;
var horizontalScaling = textState.HorizontalScaling/100m;
var font = resourceStore.GetFont(textState.FontName);
var isVertical = font.IsVertical;