mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 03:34:52 +08:00
add more documents to test font size and add tests to check our text positions against other providers
This commit is contained in:
@@ -0,0 +1,70 @@
|
||||
namespace UglyToad.Pdf.Tests.Graphics.Operations.TextState
{
    using System;
    using Pdf.Cos;
    using Pdf.Graphics.Operations.TextState;
    using Xunit;

    /// <summary>
    /// Unit tests for the SetFontAndSize ("Tf") text state operation.
    /// </summary>
    public class SetFontAndSizeTests
    {
        // Shared font resource name used by every test in this class.
        private static readonly CosName Font1Name = CosName.Create("Font1");

        /// <summary>
        /// The static symbol of the operation is "Tf".
        /// </summary>
        [Fact]
        public void HasCorrectSymbol()
        {
            Assert.Equal("Tf", SetFontAndSize.Symbol);
        }

        /// <summary>
        /// The constructor stores the font name and the size it was given.
        /// </summary>
        [Fact]
        public void SetsValues()
        {
            var operation = new SetFontAndSize(Font1Name, 12.75m);

            Assert.Equal("Font1", operation.Font.Name);
            Assert.Equal(12.75m, operation.Size);
        }

        /// <summary>
        /// The instance operator matches the static symbol.
        /// </summary>
        [Fact]
        public void HasCorrectOperator()
        {
            var operation = new SetFontAndSize(Font1Name, 12);

            Assert.Equal("Tf", operation.Operator);
        }

        /// <summary>
        /// Passing a null font name to the constructor is rejected.
        /// </summary>
        [Fact]
        public void NameNullThrows()
        {
            // ReSharper disable once ObjectCreationAsStatement
            Action createWithNullFont = () => new SetFontAndSize(null, 6);

            Assert.Throws<ArgumentNullException>(createWithNullFont);
        }

        /// <summary>
        /// The string form is the font name, the size and then the operator symbol.
        /// </summary>
        [Fact]
        public void StringRepresentationIsCorrect()
        {
            var operation = new SetFontAndSize(Font1Name, 12.76m);

            var text = operation.ToString();

            Assert.Equal("/Font1 12.76 Tf", text);
        }

        /// <summary>
        /// Running the operation writes the font name and size into the current font state.
        /// </summary>
        [Fact]
        public void RunSetsFontAndFontSize()
        {
            var operation = new SetFontAndSize(Font1Name, 69.42m);
            var context = new TestOperationContext();
            var store = new TestResourceStore();

            operation.Run(context, store);

            var fontState = context.GetCurrentState().FontState;

            Assert.Equal(69.42m, fontState.FontSize);
            Assert.Equal(Font1Name, fontState.FontName);
        }
    }
}
|
Binary file not shown.
Binary file not shown.
@@ -162,11 +162,7 @@
|
||||
{
|
||||
var documentFolder = Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents"));
|
||||
|
||||
var files = Directory.GetFiles(documentFolder);
|
||||
|
||||
var file = files[n];
|
||||
|
||||
return file;
|
||||
return Path.Combine(documentFolder, "Single Page Simple - from google drive.pdf");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -92,6 +92,95 @@ namespace UglyToad.Pdf.Tests.Integration
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Compares the text, X, Y, width and font name of each letter on page 1
/// against the reference data returned by GetPdfBoxPositionData.
/// </summary>
[Fact]
public void LettersHavePdfBoxPositions()
{
    var path = GetFilename();
    var expected = GetPdfBoxPositionData();

    using (var document = PdfDocument.Open(File.ReadAllBytes(path)))
    {
        var page = document.GetPage(1);

        // Cursor into the reference data; it only advances for letters that are actually compared.
        var position = 0;

        foreach (var letter in page.Letters)
        {
            // Something a bit weird with how we or PdfBox handle hidden characters and spaces.
            var isSkipped = IgnoredHiddenCharacters.Contains(letter.Value) || string.IsNullOrWhiteSpace(letter.Value);
            if (isSkipped)
            {
                continue;
            }

            // Step over reference entries for hidden characters we do not compare.
            while (IgnoredHiddenCharacters.Contains(expected[position].Text))
            {
                position++;
            }

            var datum = expected[position];

            Assert.Equal(datum.Text, letter.Value);
            Assert.Equal(datum.X, letter.Location.X, 2);

            // The reference Y is compared against the page height minus our Y.
            Assert.Equal(datum.Y, page.Height - letter.Location.Y, 2);

            Assert.Equal(datum.Width, letter.Width, 2);
            Assert.Equal(datum.FontName, letter.FontName);

            // Font size is deliberately not compared:
            // I think we have font size wrong for now, or right, but differently correct...

            position++;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Compares the text, X, Y and (within a small tolerance) width of each letter on page 1
/// against the reference data returned by GetOtherPositionData1.
/// </summary>
[Fact]
public void LettersHaveOtherProviderPositions()
{
    var path = GetFilename();
    var expected = GetOtherPositionData1();

    using (var document = PdfDocument.Open(File.ReadAllBytes(path)))
    {
        var page = document.GetPage(1);

        // Cursor into the reference data; it only advances for letters that are actually compared.
        var position = 0;

        foreach (var letter in page.Letters)
        {
            // Something a bit weird with how we or this provider handle hidden characters and spaces.
            var isSkipped = IgnoredHiddenCharacters.Contains(letter.Value) || string.IsNullOrWhiteSpace(letter.Value);
            if (isSkipped)
            {
                continue;
            }

            // Step over reference entries for hidden characters and single spaces.
            while (IgnoredHiddenCharacters.Contains(expected[position].Text) || expected[position].Text == " ")
            {
                position++;
            }

            var datum = expected[position];

            Assert.Equal(datum.Text, letter.Value);
            Assert.Equal(datum.X, letter.Location.X, 2);

            // The reference Y is compared against the page height minus our Y.
            Assert.Equal(datum.Y, page.Height - letter.Location.Y, 2);

            // Until we get width from glyphs we're a bit out.
            var widthDifference = Math.Abs(datum.Width - letter.Width);
            Assert.True(widthDifference < 0.03m);

            position++;
        }
    }
}
|
||||
|
||||
|
||||
|
||||
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
|
||||
{
|
||||
// X Y Width Letter FontSize Font
|
||||
|
@@ -10,6 +10,8 @@
|
||||
<ItemGroup>
|
||||
<None Remove="Fonts\TrueType\google-simple-doc.ttf" />
|
||||
<None Remove="Fonts\TrueType\Roboto-Regular.ttf" />
|
||||
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
|
||||
<None Remove="Integration\Documents\Font Size Text - from google chrome print pdf.pdf" />
|
||||
<None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
|
||||
</ItemGroup>
|
||||
|
||||
@@ -20,6 +22,12 @@
|
||||
<EmbeddedResource Include="Fonts\TrueType\Roboto-Regular.ttf">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</EmbeddedResource>
|
||||
<Content Include="Integration\Documents\Font Size Test - from libre office.pdf">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</Content>
|
||||
<Content Include="Integration\Documents\Font Size Text - from google chrome print pdf.pdf">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</Content>
|
||||
<Content Include="Integration\Documents\Single Page Simple - from google drive.pdf">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</Content>
|
||||
|
20
src/UglyToad.Pdf/Content/CropBox.cs
Normal file
20
src/UglyToad.Pdf/Content/CropBox.cs
Normal file
@@ -0,0 +1,20 @@
|
||||
namespace UglyToad.Pdf.Content
{
    using System;
    using Geometry;
    using Util.JetBrains.Annotations;

    /// <summary>
    /// Defines the visible region of a page; content extending beyond the crop box should be clipped.
    /// </summary>
    public class CropBox
    {
        /// <summary>
        /// The rectangle describing the crop box. Never null.
        /// </summary>
        [NotNull]
        public PdfRectangle Bounds { get; }

        /// <summary>
        /// Create a crop box from its bounding rectangle.
        /// </summary>
        /// <param name="bounds">The rectangle for the crop box, must not be null.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="bounds"/> is null.</exception>
        public CropBox(PdfRectangle bounds)
        {
            if (bounds is null)
            {
                throw new ArgumentNullException(nameof(bounds));
            }

            Bounds = bounds;
        }
    }
}
|
@@ -4,23 +4,46 @@
|
||||
|
||||
public class Letter
|
||||
{
|
||||
/// <summary>
|
||||
/// The text for this letter or unicode character.
|
||||
/// </summary>
|
||||
public string Value { get; }
|
||||
|
||||
public PdfPoint Location { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The width of the letter.
|
||||
/// </summary>
|
||||
public decimal Width { get; }
|
||||
|
||||
public decimal FontSize { get; }
|
||||
/// <summary>
|
||||
/// Size defined by the Tf operator prior to our possibly incorrect transformation.
|
||||
/// </summary>
|
||||
internal decimal FontSize { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The name of the font.
|
||||
/// </summary>
|
||||
public string FontName { get; }
|
||||
|
||||
public Letter(string value, PdfPoint location, decimal width, decimal fontSize, string fontName)
|
||||
/// <summary>
|
||||
/// The size of the font in points.
|
||||
/// </summary>
|
||||
public decimal PointSize { get; }
|
||||
|
||||
internal Letter(string value, PdfPoint location, decimal width, decimal fontSize, string fontName, decimal pointSize)
|
||||
{
|
||||
Value = value;
|
||||
Location = location;
|
||||
Width = width;
|
||||
FontSize = fontSize;
|
||||
FontName = fontName;
|
||||
PointSize = pointSize;
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return $"{Location} {Width} {Value} {FontName} {PointSize}";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -12,6 +12,8 @@
|
||||
|
||||
internal MediaBox MediaBox { get; }
|
||||
|
||||
internal CropBox CropBox { get; }
|
||||
|
||||
internal PageContent Content { get; }
|
||||
|
||||
public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0];
|
||||
@@ -26,7 +28,7 @@
|
||||
/// </summary>
|
||||
public decimal Height { get; }
|
||||
|
||||
internal Page(int number, MediaBox mediaBox, PageContent content)
|
||||
internal Page(int number, MediaBox mediaBox, CropBox cropBox, PageContent content)
|
||||
{
|
||||
if (number <= 0)
|
||||
{
|
||||
@@ -35,6 +37,7 @@
|
||||
|
||||
Number = number;
|
||||
MediaBox = mediaBox;
|
||||
CropBox = cropBox;
|
||||
Content = content;
|
||||
|
||||
Width = mediaBox.Bounds.Width;
|
||||
|
@@ -40,6 +40,75 @@
|
||||
throw new InvalidOperationException($"Page {number} had its type was specified as {type} rather than 'Page'.");
|
||||
}
|
||||
|
||||
MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers, isLenientParsing);
|
||||
CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox);
|
||||
|
||||
if (dictionary.GetItemOrDefault(CosName.RESOURCES) is PdfDictionary resource)
|
||||
{
|
||||
resourceStore.LoadResourceDictionary(resource, reader, isLenientParsing);
|
||||
}
|
||||
|
||||
UserSpaceUnit userSpaceUnit = GetUserSpaceUnits(dictionary);
|
||||
|
||||
PageContent content = default(PageContent);
|
||||
|
||||
var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject;
|
||||
if (contentObject != null)
|
||||
{
|
||||
var contentStream = pdfObjectParser.Parse(contentObject.ToIndirectReference(), reader, false) as PdfRawStream;
|
||||
|
||||
if (contentStream == null)
|
||||
{
|
||||
throw new InvalidOperationException("Failed to parse the content for the page: " + number);
|
||||
}
|
||||
|
||||
var contents = contentStream.Decode(filterProvider);
|
||||
|
||||
var operations = pageContentParser.Parse(new ByteArrayInputBytes(contents));
|
||||
|
||||
var context = new ContentStreamProcessor(mediaBox.Bounds, resourceStore, userSpaceUnit);
|
||||
|
||||
content = context.Process(operations);
|
||||
}
|
||||
|
||||
var page = new Page(number, mediaBox, cropBox, content);
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
/// <summary>
/// Reads the UserUnit entry from the page dictionary.
/// Returns <see cref="UserSpaceUnit.Default"/> when the entry is absent or is not a number.
/// </summary>
/// <param name="dictionary">The page dictionary to inspect.</param>
/// <returns>The user space unit to use for the page.</returns>
private static UserSpaceUnit GetUserSpaceUnits(PdfDictionary dictionary)
{
    var hasUserUnit = dictionary.TryGetValue(CosName.USER_UNIT, out var userUnitCosBase);

    if (hasUserUnit && userUnitCosBase is ICosNumber userUnitNumber)
    {
        // NOTE(review): the value is read with AsInt(), so a fractional UserUnit is presumably
        // not preserved - confirm whether non-integer values need support.
        return new UserSpaceUnit(userUnitNumber.AsInt());
    }

    return UserSpaceUnit.Default;
}
|
||||
|
||||
/// <summary>
/// Resolves the crop box for a page. The lookup order is: the CropBox array in the page dictionary,
/// then the crop box reported by the page tree members, then the bounds of the media box.
/// </summary>
/// <param name="dictionary">The page dictionary to inspect.</param>
/// <param name="pageTreeMembers">Source of a crop box when the page dictionary has none.</param>
/// <param name="mediaBox">The media box whose bounds are used as the last fallback.</param>
/// <returns>The crop box for the page.</returns>
private static CropBox GetCropBox(PdfDictionary dictionary, PageTreeMembers pageTreeMembers, MediaBox mediaBox)
{
    if (!dictionary.TryGetItemOfType(CosName.CROP_BOX, out COSArray cropBoxArray))
    {
        // No crop box on the page itself: use the page tree value, else the media box bounds.
        return pageTreeMembers.GetCropBox() ?? new CropBox(mediaBox.Bounds);
    }

    // The four array entries are read as integers, in order, and passed straight to the rectangle.
    var firstX = cropBoxArray.getInt(0);
    var firstY = cropBoxArray.getInt(1);
    var secondX = cropBoxArray.getInt(2);
    var secondY = cropBoxArray.getInt(3);

    var bounds = new PdfRectangle(firstX, firstY, secondX, secondY);

    return new CropBox(bounds);
}
|
||||
|
||||
private static MediaBox GetMediaBox(int number, PdfDictionary dictionary, PageTreeMembers pageTreeMembers, bool isLenientParsing)
|
||||
{
|
||||
MediaBox mediaBox;
|
||||
if (dictionary.TryGetItemOfType(CosName.MEDIA_BOX, out COSArray mediaboxArray))
|
||||
{
|
||||
@@ -67,35 +136,7 @@
|
||||
}
|
||||
}
|
||||
|
||||
if (dictionary.GetItemOrDefault(CosName.RESOURCES) is PdfDictionary resource)
|
||||
{
|
||||
resourceStore.LoadResourceDictionary(resource, reader, isLenientParsing);
|
||||
}
|
||||
|
||||
PageContent content = default(PageContent);
|
||||
|
||||
var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject;
|
||||
if (contentObject != null)
|
||||
{
|
||||
var contentStream = pdfObjectParser.Parse(contentObject.ToIndirectReference(), reader, false) as PdfRawStream;
|
||||
|
||||
if (contentStream == null)
|
||||
{
|
||||
throw new InvalidOperationException("Failed to parse the content for the page: " + number);
|
||||
}
|
||||
|
||||
var contents = contentStream.Decode(filterProvider);
|
||||
|
||||
var operations = pageContentParser.Parse(new ByteArrayInputBytes(contents));
|
||||
|
||||
var context = new ContentStreamProcessor(mediaBox.Bounds, resourceStore);
|
||||
|
||||
content = context.Process(operations);
|
||||
}
|
||||
|
||||
var page = new Page(number, mediaBox, content);
|
||||
|
||||
return page;
|
||||
return mediaBox;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -12,5 +12,10 @@
|
||||
// TODO: tree inheritance
|
||||
throw new NotImplementedException("Track inherited members");
|
||||
}
|
||||
|
||||
/// <summary>
/// Gets the crop box for these page tree members.
/// Currently this always returns null.
/// </summary>
/// <returns>Always null at present.</returns>
public CropBox GetCropBox() => null;
|
||||
}
|
||||
}
|
@@ -18,9 +18,15 @@
|
||||
|
||||
private readonly decimal[] value;
|
||||
|
||||
/// <summary>
|
||||
/// The scale for the X dimension.
|
||||
/// </summary>
|
||||
public decimal A => value[0];
|
||||
public decimal B => value[1];
|
||||
public decimal C => value[3];
|
||||
/// <summary>
|
||||
/// The scale for the Y dimension.
|
||||
/// </summary>
|
||||
public decimal D => value[4];
|
||||
public decimal E => value[6];
|
||||
public decimal F => value[7];
|
||||
|
@@ -518,6 +518,7 @@ namespace UglyToad.Pdf.Cos
|
||||
public static readonly CosName UNIX = new CosName("Unix");
|
||||
public static readonly CosName URI = new CosName("URI");
|
||||
public static readonly CosName URL = new CosName("URL");
|
||||
public static readonly CosName USER_UNIT = new CosName("UserUnit");
|
||||
// V
|
||||
public static readonly CosName V = new CosName("V");
|
||||
public static readonly CosName VERISIGN_PPKVS = new CosName("VeriSign.PPKVS");
|
||||
|
36
src/UglyToad.Pdf/Geometry/UserSpaceUnit.cs
Normal file
36
src/UglyToad.Pdf/Geometry/UserSpaceUnit.cs
Normal file
@@ -0,0 +1,36 @@
|
||||
namespace UglyToad.Pdf.Geometry
{
    using System;

    /// <summary>
    /// By default user space units correspond to 1/72nd of an inch (a typographic point).
    /// The UserUnit entry in a page dictionary can define the space units as a different multiple of 1/72 (1 point).
    /// </summary>
    public struct UserSpaceUnit
    {
        /// <summary>
        /// The default unit: one user space unit is a single point.
        /// </summary>
        public static readonly UserSpaceUnit Default = new UserSpaceUnit(1);

        /// <summary>
        /// The number of points (1/72nd of an inch) corresponding to a single unit in user space.
        /// </summary>
        public int PointMultiples { get; }

        /// <summary>
        /// Create a new unit specification for a page.
        /// </summary>
        /// <param name="pointMultiples">The number of points per user space unit, must be greater than zero.</param>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="pointMultiples"/> is zero or negative.</exception>
        public UserSpaceUnit(int pointMultiples)
        {
            if (pointMultiples <= 0)
            {
                // The single-string constructor treats its argument as the parameter name, not the message,
                // so pass the parameter name, the offending value and the message explicitly.
                throw new ArgumentOutOfRangeException(
                    nameof(pointMultiples),
                    pointMultiples,
                    "Cannot have a zero or negative value of point multiples.");
            }

            PointMultiples = pointMultiples;
        }

        /// <summary>
        /// Returns the number of point multiples as a string.
        /// </summary>
        public override string ToString()
        {
            return PointMultiples.ToString();
        }
    }
}
|
@@ -13,6 +13,7 @@
|
||||
internal class ContentStreamProcessor : IOperationContext
|
||||
{
|
||||
private readonly IResourceStore resourceStore;
|
||||
private readonly UserSpaceUnit userSpaceUnit;
|
||||
|
||||
private Stack<CurrentGraphicsState> graphicsStack = new Stack<CurrentGraphicsState>();
|
||||
|
||||
@@ -22,9 +23,10 @@
|
||||
|
||||
public List<Letter> Letters = new List<Letter>();
|
||||
|
||||
public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore)
|
||||
public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore, UserSpaceUnit userSpaceUnit)
|
||||
{
|
||||
this.resourceStore = resourceStore;
|
||||
this.userSpaceUnit = userSpaceUnit;
|
||||
graphicsStack.Push(new CurrentGraphicsState());
|
||||
}
|
||||
|
||||
@@ -77,9 +79,17 @@
|
||||
{
|
||||
var font = resourceStore.GetFont(GetCurrentState().FontState.FontName);
|
||||
|
||||
var fontSize = GetCurrentState().FontState.FontSize;
|
||||
var horizontalScaling = GetCurrentState().FontState.HorizontalScaling;
|
||||
var characterSpacing = GetCurrentState().FontState.CharacterSpacing;
|
||||
var currentState = GetCurrentState();
|
||||
|
||||
var fontSize = currentState.FontState.FontSize;
|
||||
var horizontalScaling = currentState.FontState.HorizontalScaling;
|
||||
var characterSpacing = currentState.FontState.CharacterSpacing;
|
||||
|
||||
var transformationMatrix = currentState.CurrentTransformationMatrix;
|
||||
|
||||
// TODO: this does not seem correct, produces the correct result for now but we need to revisit.
|
||||
// see: https://stackoverflow.com/questions/48010235/pdf-specification-get-font-size-in-points
|
||||
var pointSize = decimal.Round(fontSize * transformationMatrix.A, 2);
|
||||
|
||||
while (bytes.MoveNext())
|
||||
{
|
||||
@@ -102,7 +112,9 @@
|
||||
|
||||
var displacement = font.GetDisplacement(code);
|
||||
|
||||
ShowGlyph(renderingMatrix, font, code, unicode, displacement, fontSize);
|
||||
var width = (displacement.X * fontSize) * transformationMatrix.A;
|
||||
|
||||
ShowGlyph(renderingMatrix, font, code, unicode, width, fontSize, pointSize);
|
||||
|
||||
decimal tx, ty;
|
||||
if (font.IsVertical)
|
||||
@@ -122,11 +134,12 @@
|
||||
}
|
||||
}
|
||||
|
||||
private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font, int characterCode, string unicode, PdfVector displacement, decimal fontSize)
|
||||
private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font, int characterCode, string unicode, decimal width, decimal fontSize,
|
||||
decimal pointSize)
|
||||
{
|
||||
var location = new PdfPoint(renderingMatrix.E, renderingMatrix.F);
|
||||
|
||||
var letter = new Letter(unicode, location, displacement.X, fontSize, font.Name.Name);
|
||||
var letter = new Letter(unicode, location, width, fontSize, font.Name.Name, pointSize);
|
||||
|
||||
Letters.Add(letter);
|
||||
}
|
||||
|
@@ -1,7 +1,9 @@
|
||||
namespace UglyToad.Pdf.Graphics.Operations.TextState
|
||||
{
|
||||
using System;
|
||||
using Content;
|
||||
using Cos;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
internal class SetFontAndSize : IGraphicsStateOperation
|
||||
{
|
||||
@@ -9,13 +11,21 @@
|
||||
|
||||
public string Operator => Symbol;
|
||||
|
||||
/// <summary>
|
||||
/// The name of the font as defined in the resource dictionary.
|
||||
/// </summary>
|
||||
[NotNull]
|
||||
public CosName Font { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The font program defines glyphs for a standard size. This standard size is set so that each line of text will occupy 1 unit in user space.
|
||||
/// The size is the scale factor used to scale glyphs from the standard size to the display size rather than the font size in points.
|
||||
/// </summary>
|
||||
public decimal Size { get; }
|
||||
|
||||
public SetFontAndSize(CosName font, decimal size)
|
||||
{
|
||||
Font = font;
|
||||
Font = font ?? throw new ArgumentNullException(nameof(font));
|
||||
Size = size;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user