add more documents to test font size and add tests to check our text positions against other providers

This commit is contained in:
Eliot Jones
2017-12-28 16:58:52 +00:00
parent b1d28a5af8
commit 17d1d77abc
16 changed files with 367 additions and 46 deletions

View File

@@ -0,0 +1,70 @@
namespace UglyToad.Pdf.Tests.Graphics.Operations.TextState
{
    using System;
    using Pdf.Cos;
    using Pdf.Graphics.Operations.TextState;
    using Xunit;

    /// <summary>
    /// Unit tests for the SetFontAndSize ("Tf") text state operation.
    /// </summary>
    public class SetFontAndSizeTests
    {
        // Font resource name shared by every test in this class.
        private static readonly CosName Font1Name = CosName.Create("Font1");

        [Fact]
        public void HasCorrectSymbol() => Assert.Equal("Tf", SetFontAndSize.Symbol);

        [Fact]
        public void SetsValues()
        {
            var operation = new SetFontAndSize(Font1Name, 12.75m);

            Assert.Equal("Font1", operation.Font.Name);
            Assert.Equal(12.75m, operation.Size);
        }

        [Fact]
        public void HasCorrectOperator()
        {
            var operation = new SetFontAndSize(Font1Name, 12);

            Assert.Equal("Tf", operation.Operator);
        }

        [Fact]
        public void NameNullThrows()
        {
            // The instance is never used; only the constructor's null check is under test.
            // ReSharper disable once ObjectCreationAsStatement
            Action construct = () => new SetFontAndSize(null, 6);

            Assert.Throws<ArgumentNullException>(construct);
        }

        [Fact]
        public void StringRepresentationIsCorrect()
        {
            var operation = new SetFontAndSize(Font1Name, 12.76m);

            Assert.Equal("/Font1 12.76 Tf", operation.ToString());
        }

        [Fact]
        public void RunSetsFontAndFontSize()
        {
            var operation = new SetFontAndSize(Font1Name, 69.42m);
            var context = new TestOperationContext();
            var store = new TestResourceStore();

            operation.Run(context, store);

            // Running the operation should have copied both values into the context's current font state.
            var fontState = context.GetCurrentState().FontState;
            Assert.Equal(69.42m, fontState.FontSize);
            Assert.Equal(Font1Name, fontState.FontName);
        }
    }
}

View File

@@ -162,11 +162,7 @@
{ {
var documentFolder = Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents")); var documentFolder = Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents"));
var files = Directory.GetFiles(documentFolder); return Path.Combine(documentFolder, "Single Page Simple - from google drive.pdf");
var file = files[n];
return file;
} }
} }
} }

View File

@@ -92,6 +92,95 @@ namespace UglyToad.Pdf.Tests.Integration
} }
} }
/// <summary>
/// Walks the letters on page 1 of the test document and checks text, position, width and font name
/// against the data returned by GetPdfBoxPositionData.
/// </summary>
[Fact]
public void LettersHavePdfBoxPositions()
{
    var path = GetFilename();
    var expected = GetPdfBoxPositionData();
    var cursor = 0;

    using (var document = PdfDocument.Open(File.ReadAllBytes(path)))
    {
        var page = document.GetPage(1);

        foreach (var letter in page.Letters)
        {
            // Something a bit weird with how we or PdfBox handle hidden characters and spaces, so skip them.
            var isComparable = !IgnoredHiddenCharacters.Contains(letter.Value) && !string.IsNullOrWhiteSpace(letter.Value);
            if (!isComparable)
            {
                continue;
            }

            // Step over expected entries that are ignored hidden characters.
            while (IgnoredHiddenCharacters.Contains(expected[cursor].Text))
            {
                cursor++;
            }

            var datum = expected[cursor];
            cursor++;

            Assert.Equal(datum.Text, letter.Value);
            Assert.Equal(datum.X, letter.Location.X, 2);

            // The expected Y is compared against the letter's Y subtracted from the page height.
            Assert.Equal(datum.Y, page.Height - letter.Location.Y, 2);

            Assert.Equal(datum.Width, letter.Width, 2);
            Assert.Equal(datum.FontName, letter.FontName);

            // Font size is deliberately not compared:
            // I think we have font size wrong for now, or right, but differently correct...
        }
    }
}
/// <summary>
/// Walks the letters on page 1 of the test document and checks text, position and approximate width
/// against the data returned by GetOtherPositionData1.
/// </summary>
[Fact]
public void LettersHaveOtherProviderPositions()
{
    var path = GetFilename();
    var expected = GetOtherPositionData1();
    var cursor = 0;

    using (var document = PdfDocument.Open(File.ReadAllBytes(path)))
    {
        var page = document.GetPage(1);

        foreach (var letter in page.Letters)
        {
            // Something a bit weird with how we or this provider handle hidden characters and spaces, so skip them.
            var isComparable = !IgnoredHiddenCharacters.Contains(letter.Value) && !string.IsNullOrWhiteSpace(letter.Value);
            if (!isComparable)
            {
                continue;
            }

            // Step over expected entries that are ignored hidden characters or a single space.
            while (IgnoredHiddenCharacters.Contains(expected[cursor].Text) || expected[cursor].Text == " ")
            {
                cursor++;
            }

            var datum = expected[cursor];
            cursor++;

            Assert.Equal(datum.Text, letter.Value);
            Assert.Equal(datum.X, letter.Location.X, 2);

            // The expected Y is compared against the letter's Y subtracted from the page height.
            Assert.Equal(datum.Y, page.Height - letter.Location.Y, 2);

            // Until we get width from glyphs we're a bit out, so only require the widths to be close.
            var widthDifference = Math.Abs(datum.Width - letter.Width);
            Assert.True(widthDifference < 0.03m);
        }
    }
}
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData() private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
{ {
// X Y Width Letter FontSize Font // X Y Width Letter FontSize Font

View File

@@ -10,6 +10,8 @@
<ItemGroup> <ItemGroup>
<None Remove="Fonts\TrueType\google-simple-doc.ttf" /> <None Remove="Fonts\TrueType\google-simple-doc.ttf" />
<None Remove="Fonts\TrueType\Roboto-Regular.ttf" /> <None Remove="Fonts\TrueType\Roboto-Regular.ttf" />
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
<None Remove="Integration\Documents\Font Size Text - from google chrome print pdf.pdf" />
<None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" /> <None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
</ItemGroup> </ItemGroup>
@@ -20,6 +22,12 @@
<EmbeddedResource Include="Fonts\TrueType\Roboto-Regular.ttf"> <EmbeddedResource Include="Fonts\TrueType\Roboto-Regular.ttf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</EmbeddedResource> </EmbeddedResource>
<Content Include="Integration\Documents\Font Size Test - from libre office.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Font Size Text - from google chrome print pdf.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Single Page Simple - from google drive.pdf"> <Content Include="Integration\Documents\Single Page Simple - from google drive.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content> </Content>

View File

@@ -0,0 +1,20 @@
namespace UglyToad.Pdf.Content
{
    using System;
    using Geometry;
    using Util.JetBrains.Annotations;

    /// <summary>
    /// Defines the visible region, contents expanding beyond the crop box should be clipped.
    /// </summary>
    public class CropBox
    {
        /// <summary>
        /// The rectangle of the crop box. Never null; the constructor rejects a null value.
        /// </summary>
        [NotNull]
        public PdfRectangle Bounds { get; }

        /// <summary>
        /// Create a crop box from its bounding rectangle.
        /// </summary>
        /// <param name="bounds">The rectangle of the crop box.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="bounds"/> is null.</exception>
        public CropBox(PdfRectangle bounds)
        {
            if (bounds is null)
            {
                throw new ArgumentNullException(nameof(bounds));
            }

            Bounds = bounds;
        }
    }
}

View File

@@ -4,23 +4,46 @@
public class Letter public class Letter
{ {
/// <summary>
/// The text for this letter or unicode character.
/// </summary>
public string Value { get; } public string Value { get; }
public PdfPoint Location { get; } public PdfPoint Location { get; }
/// <summary>
/// The width of the letter.
/// </summary>
public decimal Width { get; } public decimal Width { get; }
public decimal FontSize { get; } /// <summary>
/// Size defined by the Tf operator prior to our possibly incorrect transformation.
/// </summary>
internal decimal FontSize { get; }
/// <summary>
/// The name of the font.
/// </summary>
public string FontName { get; } public string FontName { get; }
public Letter(string value, PdfPoint location, decimal width, decimal fontSize, string fontName) /// <summary>
/// The size of the font in points.
/// </summary>
public decimal PointSize { get; }
internal Letter(string value, PdfPoint location, decimal width, decimal fontSize, string fontName, decimal pointSize)
{ {
Value = value; Value = value;
Location = location; Location = location;
Width = width; Width = width;
FontSize = fontSize; FontSize = fontSize;
FontName = fontName; FontName = fontName;
PointSize = pointSize;
}
public override string ToString()
{
return $"{Location} {Width} {Value} {FontName} {PointSize}";
} }
} }
} }

View File

@@ -12,6 +12,8 @@
internal MediaBox MediaBox { get; } internal MediaBox MediaBox { get; }
internal CropBox CropBox { get; }
internal PageContent Content { get; } internal PageContent Content { get; }
public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0]; public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0];
@@ -26,7 +28,7 @@
/// </summary> /// </summary>
public decimal Height { get; } public decimal Height { get; }
internal Page(int number, MediaBox mediaBox, PageContent content) internal Page(int number, MediaBox mediaBox, CropBox cropBox, PageContent content)
{ {
if (number <= 0) if (number <= 0)
{ {
@@ -35,6 +37,7 @@
Number = number; Number = number;
MediaBox = mediaBox; MediaBox = mediaBox;
CropBox = cropBox;
Content = content; Content = content;
Width = mediaBox.Bounds.Width; Width = mediaBox.Bounds.Width;

View File

@@ -40,6 +40,75 @@
throw new InvalidOperationException($"Page {number} had its type was specified as {type} rather than 'Page'."); throw new InvalidOperationException($"Page {number} had its type was specified as {type} rather than 'Page'.");
} }
MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers, isLenientParsing);
CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox);
if (dictionary.GetItemOrDefault(CosName.RESOURCES) is PdfDictionary resource)
{
resourceStore.LoadResourceDictionary(resource, reader, isLenientParsing);
}
UserSpaceUnit userSpaceUnit = GetUserSpaceUnits(dictionary);
PageContent content = default(PageContent);
var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject;
if (contentObject != null)
{
var contentStream = pdfObjectParser.Parse(contentObject.ToIndirectReference(), reader, false) as PdfRawStream;
if (contentStream == null)
{
throw new InvalidOperationException("Failed to parse the content for the page: " + number);
}
var contents = contentStream.Decode(filterProvider);
var operations = pageContentParser.Parse(new ByteArrayInputBytes(contents));
var context = new ContentStreamProcessor(mediaBox.Bounds, resourceStore, userSpaceUnit);
content = context.Process(operations);
}
var page = new Page(number, mediaBox, cropBox, content);
return page;
}
/// <summary>
/// Reads the UserUnit entry of the page dictionary.
/// Returns <see cref="UserSpaceUnit.Default"/> when the entry is missing or is not a number.
/// </summary>
private static UserSpaceUnit GetUserSpaceUnits(PdfDictionary dictionary)
{
    // NOTE(review): AsInt() presumably drops any fractional part; a value that converts to zero or less
    // makes the UserSpaceUnit constructor throw - confirm this is acceptable for malformed or fractional input.
    if (dictionary.TryGetValue(CosName.USER_UNIT, out var entry) && entry is ICosNumber number)
    {
        return new UserSpaceUnit(number.AsInt());
    }

    return UserSpaceUnit.Default;
}
/// <summary>
/// Resolves the crop box for a page. The page dictionary's own CropBox array wins when present;
/// otherwise the crop box from the page tree members is used, and failing that the media box bounds.
/// </summary>
private static CropBox GetCropBox(PdfDictionary dictionary, PageTreeMembers pageTreeMembers, MediaBox mediaBox)
{
    if (!dictionary.TryGetItemOfType(CosName.CROP_BOX, out COSArray cropBoxArray))
    {
        // No CropBox on the page itself: fall back to the page tree, then to the media box.
        return pageTreeMembers.GetCropBox() ?? new CropBox(mediaBox.Bounds);
    }

    // The first four array entries are passed to PdfRectangle in order (x1, y1, x2, y2).
    var bounds = new PdfRectangle(
        cropBoxArray.getInt(0),
        cropBoxArray.getInt(1),
        cropBoxArray.getInt(2),
        cropBoxArray.getInt(3));

    return new CropBox(bounds);
}
private static MediaBox GetMediaBox(int number, PdfDictionary dictionary, PageTreeMembers pageTreeMembers, bool isLenientParsing)
{
MediaBox mediaBox; MediaBox mediaBox;
if (dictionary.TryGetItemOfType(CosName.MEDIA_BOX, out COSArray mediaboxArray)) if (dictionary.TryGetItemOfType(CosName.MEDIA_BOX, out COSArray mediaboxArray))
{ {
@@ -67,35 +136,7 @@
} }
} }
if (dictionary.GetItemOrDefault(CosName.RESOURCES) is PdfDictionary resource) return mediaBox;
{
resourceStore.LoadResourceDictionary(resource, reader, isLenientParsing);
}
PageContent content = default(PageContent);
var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject;
if (contentObject != null)
{
var contentStream = pdfObjectParser.Parse(contentObject.ToIndirectReference(), reader, false) as PdfRawStream;
if (contentStream == null)
{
throw new InvalidOperationException("Failed to parse the content for the page: " + number);
}
var contents = contentStream.Decode(filterProvider);
var operations = pageContentParser.Parse(new ByteArrayInputBytes(contents));
var context = new ContentStreamProcessor(mediaBox.Bounds, resourceStore);
content = context.Process(operations);
}
var page = new Page(number, mediaBox, content);
return page;
} }
} }
} }

View File

@@ -12,5 +12,10 @@
// TODO: tree inheritance // TODO: tree inheritance
throw new NotImplementedException("Track inherited members"); throw new NotImplementedException("Track inherited members");
} }
/// <summary>
/// Gets the crop box held by these page tree members. Currently this always returns null.
/// </summary>
public CropBox GetCropBox() => null;
} }
} }

View File

@@ -18,9 +18,15 @@
private readonly decimal[] value; private readonly decimal[] value;
/// <summary>
/// The scale for the X dimension.
/// </summary>
public decimal A => value[0]; public decimal A => value[0];
public decimal B => value[1]; public decimal B => value[1];
public decimal C => value[3]; public decimal C => value[3];
/// <summary>
/// The scale for the Y dimension.
/// </summary>
public decimal D => value[4]; public decimal D => value[4];
public decimal E => value[6]; public decimal E => value[6];
public decimal F => value[7]; public decimal F => value[7];

View File

@@ -518,6 +518,7 @@ namespace UglyToad.Pdf.Cos
public static readonly CosName UNIX = new CosName("Unix"); public static readonly CosName UNIX = new CosName("Unix");
public static readonly CosName URI = new CosName("URI"); public static readonly CosName URI = new CosName("URI");
public static readonly CosName URL = new CosName("URL"); public static readonly CosName URL = new CosName("URL");
public static readonly CosName USER_UNIT = new CosName("UserUnit");
// V // V
public static readonly CosName V = new CosName("V"); public static readonly CosName V = new CosName("V");
public static readonly CosName VERISIGN_PPKVS = new CosName("VeriSign.PPKVS"); public static readonly CosName VERISIGN_PPKVS = new CosName("VeriSign.PPKVS");

View File

@@ -0,0 +1,36 @@
namespace UglyToad.Pdf.Geometry
{
    using System;

    /// <summary>
    /// By default user space units correspond to 1/72nd of an inch (a typographic point).
    /// The UserUnit entry in a page dictionary can define the space units as a different multiple of 1/72 (1 point).
    /// </summary>
    public struct UserSpaceUnit
    {
        /// <summary>
        /// The default unit: one user space unit is a single point.
        /// </summary>
        public static readonly UserSpaceUnit Default = new UserSpaceUnit(1);

        /// <summary>
        /// The number of points (1/72nd of an inch) corresponding to a single unit in user space.
        /// </summary>
        public int PointMultiples { get; }

        /// <summary>
        /// Create a new unit specification for a page.
        /// </summary>
        /// <param name="pointMultiples">The number of points per user space unit; must be greater than zero.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Thrown when <paramref name="pointMultiples"/> is zero or negative.
        /// </exception>
        public UserSpaceUnit(int pointMultiples)
        {
            if (pointMultiples <= 0)
            {
                // The single-string constructor treats its argument as the parameter name, so the
                // three-argument overload is used to keep ParamName, ActualValue and Message correct.
                throw new ArgumentOutOfRangeException(
                    nameof(pointMultiples),
                    pointMultiples,
                    "Cannot have a zero or negative value of point multiples: " + pointMultiples);
            }

            PointMultiples = pointMultiples;
        }

        /// <summary>
        /// Returns the point multiple as text.
        /// </summary>
        public override string ToString()
        {
            return PointMultiples.ToString();
        }
    }
}

View File

@@ -13,6 +13,7 @@
internal class ContentStreamProcessor : IOperationContext internal class ContentStreamProcessor : IOperationContext
{ {
private readonly IResourceStore resourceStore; private readonly IResourceStore resourceStore;
private readonly UserSpaceUnit userSpaceUnit;
private Stack<CurrentGraphicsState> graphicsStack = new Stack<CurrentGraphicsState>(); private Stack<CurrentGraphicsState> graphicsStack = new Stack<CurrentGraphicsState>();
@@ -22,9 +23,10 @@
public List<Letter> Letters = new List<Letter>(); public List<Letter> Letters = new List<Letter>();
public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore) public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore, UserSpaceUnit userSpaceUnit)
{ {
this.resourceStore = resourceStore; this.resourceStore = resourceStore;
this.userSpaceUnit = userSpaceUnit;
graphicsStack.Push(new CurrentGraphicsState()); graphicsStack.Push(new CurrentGraphicsState());
} }
@@ -77,10 +79,18 @@
{ {
var font = resourceStore.GetFont(GetCurrentState().FontState.FontName); var font = resourceStore.GetFont(GetCurrentState().FontState.FontName);
var fontSize = GetCurrentState().FontState.FontSize; var currentState = GetCurrentState();
var horizontalScaling = GetCurrentState().FontState.HorizontalScaling;
var characterSpacing = GetCurrentState().FontState.CharacterSpacing;
var fontSize = currentState.FontState.FontSize;
var horizontalScaling = currentState.FontState.HorizontalScaling;
var characterSpacing = currentState.FontState.CharacterSpacing;
var transformationMatrix = currentState.CurrentTransformationMatrix;
// TODO: this does not seem correct, produces the correct result for now but we need to revisit.
// see: https://stackoverflow.com/questions/48010235/pdf-specification-get-font-size-in-points
var pointSize = decimal.Round(fontSize * transformationMatrix.A, 2);
while (bytes.MoveNext()) while (bytes.MoveNext())
{ {
var code = font.ReadCharacterCode(bytes, out int codeLength); var code = font.ReadCharacterCode(bytes, out int codeLength);
@@ -102,7 +112,9 @@
var displacement = font.GetDisplacement(code); var displacement = font.GetDisplacement(code);
ShowGlyph(renderingMatrix, font, code, unicode, displacement, fontSize); var width = (displacement.X * fontSize) * transformationMatrix.A;
ShowGlyph(renderingMatrix, font, code, unicode, width, fontSize, pointSize);
decimal tx, ty; decimal tx, ty;
if (font.IsVertical) if (font.IsVertical)
@@ -122,11 +134,12 @@
} }
} }
private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font, int characterCode, string unicode, PdfVector displacement, decimal fontSize) private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font, int characterCode, string unicode, decimal width, decimal fontSize,
decimal pointSize)
{ {
var location = new PdfPoint(renderingMatrix.E, renderingMatrix.F); var location = new PdfPoint(renderingMatrix.E, renderingMatrix.F);
var letter = new Letter(unicode, location, displacement.X, fontSize, font.Name.Name); var letter = new Letter(unicode, location, width, fontSize, font.Name.Name, pointSize);
Letters.Add(letter); Letters.Add(letter);
} }

View File

@@ -1,7 +1,9 @@
namespace UglyToad.Pdf.Graphics.Operations.TextState namespace UglyToad.Pdf.Graphics.Operations.TextState
{ {
using System;
using Content; using Content;
using Cos; using Cos;
using Util.JetBrains.Annotations;
internal class SetFontAndSize : IGraphicsStateOperation internal class SetFontAndSize : IGraphicsStateOperation
{ {
@@ -9,13 +11,21 @@
public string Operator => Symbol; public string Operator => Symbol;
/// <summary>
/// The name of the font as defined in the resource dictionary.
/// </summary>
[NotNull]
public CosName Font { get; } public CosName Font { get; }
/// <summary>
/// The font program defines glyphs for a standard size. This standard size is set so that each line of text will occupy 1 unit in user space.
/// The size is the scale factor used to scale glyphs from the standard size to the display size rather than the font size in points.
/// </summary>
public decimal Size { get; } public decimal Size { get; }
public SetFontAndSize(CosName font, decimal size) public SetFontAndSize(CosName font, decimal size)
{ {
Font = font; Font = font ?? throw new ArgumentNullException(nameof(font));
Size = size; Size = size;
} }