diff --git a/src/UglyToad.Pdf.Tests/Graphics/Operations/TextState/SetFontAndSizeTests.cs b/src/UglyToad.Pdf.Tests/Graphics/Operations/TextState/SetFontAndSizeTests.cs new file mode 100644 index 00000000..25e8f65a --- /dev/null +++ b/src/UglyToad.Pdf.Tests/Graphics/Operations/TextState/SetFontAndSizeTests.cs @@ -0,0 +1,70 @@ +namespace UglyToad.Pdf.Tests.Graphics.Operations.TextState +{ + using System; + using Pdf.Cos; + using Pdf.Graphics.Operations.TextState; + using Xunit; + + public class SetFontAndSizeTests + { + private static readonly CosName Font1Name = CosName.Create("Font1"); + + [Fact] + public void HasCorrectSymbol() + { + var symbol = SetFontAndSize.Symbol; + + Assert.Equal("Tf", symbol); + } + + [Fact] + public void SetsValues() + { + var setFontAndSize = new SetFontAndSize(Font1Name, 12.75m); + + Assert.Equal("Font1", setFontAndSize.Font.Name); + Assert.Equal(12.75m, setFontAndSize.Size); + } + + [Fact] + public void HasCorrectOperator() + { + var setFontAndSize = new SetFontAndSize(Font1Name, 12); + + Assert.Equal("Tf", setFontAndSize.Operator); + } + + [Fact] + public void NameNullThrows() + { + // ReSharper disable once ObjectCreationAsStatement + Action action = () => new SetFontAndSize(null, 6); + + Assert.Throws<ArgumentNullException>(action); + } + + [Fact] + public void StringRepresentationIsCorrect() + { + var setFontAndSize = new SetFontAndSize(Font1Name, 12.76m); + + Assert.Equal("/Font1 12.76 Tf", setFontAndSize.ToString()); + } + + [Fact] + public void RunSetsFontAndFontSize() + { + var setFontAndSize = new SetFontAndSize(Font1Name, 69.42m); + + var context = new TestOperationContext(); + var store = new TestResourceStore(); + + setFontAndSize.Run(context, store); + + var state = context.GetCurrentState(); + + Assert.Equal(69.42m, state.FontState.FontSize); + Assert.Equal(Font1Name, state.FontState.FontName); + } + } +} diff --git a/src/UglyToad.Pdf.Tests/Integration/Documents/Font Size Test - from libre office.pdf 
b/src/UglyToad.Pdf.Tests/Integration/Documents/Font Size Test - from libre office.pdf new file mode 100644 index 00000000..19776c66 Binary files /dev/null and b/src/UglyToad.Pdf.Tests/Integration/Documents/Font Size Test - from libre office.pdf differ diff --git a/src/UglyToad.Pdf.Tests/Integration/Documents/Font Size Text - from google chrome print pdf.pdf b/src/UglyToad.Pdf.Tests/Integration/Documents/Font Size Text - from google chrome print pdf.pdf new file mode 100644 index 00000000..0d2aaf68 Binary files /dev/null and b/src/UglyToad.Pdf.Tests/Integration/Documents/Font Size Text - from google chrome print pdf.pdf differ diff --git a/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs b/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs index d8d3c189..32309e5e 100644 --- a/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs +++ b/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs @@ -162,11 +162,7 @@ { var documentFolder = Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents")); - var files = Directory.GetFiles(documentFolder); - - var file = files[n]; - - return file; + return Path.Combine(documentFolder, "Single Page Simple - from google drive.pdf"); } } } diff --git a/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs b/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs index a2e30c16..70ea2064 100644 --- a/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs +++ b/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs @@ -92,6 +92,95 @@ namespace UglyToad.Pdf.Tests.Integration } } + [Fact] + public void LettersHavePdfBoxPositions() + { + var file = GetFilename(); + + var pdfBoxData = GetPdfBoxPositionData(); + var index = 0; + + using (var document = PdfDocument.Open(File.ReadAllBytes(file))) + { + var page = document.GetPage(1); + + foreach (var letter in page.Letters) + { + // Something a bit weird with how we or PdfBox handle hidden characters and spaces. 
+ if (IgnoredHiddenCharacters.Contains(letter.Value) || string.IsNullOrWhiteSpace(letter.Value)) + { + continue; + } + + var datum = pdfBoxData[index]; + + while (IgnoredHiddenCharacters.Contains(datum.Text)) + { + index++; + datum = pdfBoxData[index]; + } + + Assert.Equal(datum.Text, letter.Value); + Assert.Equal(datum.X, letter.Location.X, 2); + + var transformed = page.Height - letter.Location.Y; + Assert.Equal(datum.Y, transformed, 2); + + Assert.Equal(datum.Width, letter.Width, 2); + + Assert.Equal(datum.FontName, letter.FontName); + + // I think we have font size wrong for now, or right, but differently correct... + + index++; + } + } + } + + [Fact] + public void LettersHaveOtherProviderPositions() + { + var file = GetFilename(); + + var pdfBoxData = GetOtherPositionData1(); + var index = 0; + + using (var document = PdfDocument.Open(File.ReadAllBytes(file))) + { + var page = document.GetPage(1); + + foreach (var letter in page.Letters) + { + // Something a bit weird with how we or this provider handle hidden characters and spaces. + if (IgnoredHiddenCharacters.Contains(letter.Value) || string.IsNullOrWhiteSpace(letter.Value)) + { + continue; + } + + var datum = pdfBoxData[index]; + + while (IgnoredHiddenCharacters.Contains(datum.Text) || datum.Text == " ") + { + index++; + datum = pdfBoxData[index]; + } + + Assert.Equal(datum.Text, letter.Value); + Assert.Equal(datum.X, letter.Location.X, 2); + + var transformed = page.Height - letter.Location.Y; + Assert.Equal(datum.Y, transformed, 2); + + // Until we get width from glyphs we're a bit out. 
+ Assert.True(Math.Abs(datum.Width - letter.Width) < 0.03m); + + index++; + } + } + } + + + private static IReadOnlyList GetPdfBoxPositionData() { // X Y Width Letter FontSize Font diff --git a/src/UglyToad.Pdf.Tests/UglyToad.Pdf.Tests.csproj b/src/UglyToad.Pdf.Tests/UglyToad.Pdf.Tests.csproj index eaea7a91..44d0b0ce 100644 --- a/src/UglyToad.Pdf.Tests/UglyToad.Pdf.Tests.csproj +++ b/src/UglyToad.Pdf.Tests/UglyToad.Pdf.Tests.csproj @@ -10,6 +10,8 @@ + + @@ -20,6 +22,12 @@ PreserveNewest + + PreserveNewest + + + PreserveNewest + PreserveNewest diff --git a/src/UglyToad.Pdf/Content/CropBox.cs b/src/UglyToad.Pdf/Content/CropBox.cs new file mode 100644 index 00000000..f9635fd0 --- /dev/null +++ b/src/UglyToad.Pdf/Content/CropBox.cs @@ -0,0 +1,20 @@ +namespace UglyToad.Pdf.Content +{ + using System; + using Geometry; + using Util.JetBrains.Annotations; + + /// + /// Defines the visible region, contents expanding beyond the crop box should be clipped. + /// + public class CropBox + { + [NotNull] + public PdfRectangle Bounds { get; } + + public CropBox(PdfRectangle bounds) + { + Bounds = bounds ?? throw new ArgumentNullException(nameof(bounds)); + } + } +} \ No newline at end of file diff --git a/src/UglyToad.Pdf/Content/Letter.cs b/src/UglyToad.Pdf/Content/Letter.cs index 2fa398b5..c2a2de1a 100644 --- a/src/UglyToad.Pdf/Content/Letter.cs +++ b/src/UglyToad.Pdf/Content/Letter.cs @@ -4,23 +4,46 @@ public class Letter { + /// + /// The text for this letter or unicode character. + /// public string Value { get; } public PdfPoint Location { get; } + /// + /// The width of the letter. + /// public decimal Width { get; } - public decimal FontSize { get; } + /// + /// Size defined by the Tf operator prior to our possibly incorrect transformation. + /// + internal decimal FontSize { get; } + /// + /// The name of the font. 
+ /// public string FontName { get; } - public Letter(string value, PdfPoint location, decimal width, decimal fontSize, string fontName) + /// + /// The size of the font in points. + /// + public decimal PointSize { get; } + + internal Letter(string value, PdfPoint location, decimal width, decimal fontSize, string fontName, decimal pointSize) { Value = value; Location = location; Width = width; FontSize = fontSize; FontName = fontName; + PointSize = pointSize; + } + + public override string ToString() + { + return $"{Location} {Width} {Value} {FontName} {PointSize}"; } } } diff --git a/src/UglyToad.Pdf/Content/Page.cs b/src/UglyToad.Pdf/Content/Page.cs index bc098d89..133f0395 100644 --- a/src/UglyToad.Pdf/Content/Page.cs +++ b/src/UglyToad.Pdf/Content/Page.cs @@ -12,6 +12,8 @@ internal MediaBox MediaBox { get; } + internal CropBox CropBox { get; } + internal PageContent Content { get; } public IReadOnlyList Letters => Content?.Letters ?? new Letter[0]; @@ -26,7 +28,7 @@ /// public decimal Height { get; } - internal Page(int number, MediaBox mediaBox, PageContent content) + internal Page(int number, MediaBox mediaBox, CropBox cropBox, PageContent content) { if (number <= 0) { @@ -35,6 +37,7 @@ Number = number; MediaBox = mediaBox; + CropBox = cropBox; Content = content; Width = mediaBox.Bounds.Width; diff --git a/src/UglyToad.Pdf/Content/PageFactory.cs b/src/UglyToad.Pdf/Content/PageFactory.cs index 84a1dcf6..32e18ef4 100644 --- a/src/UglyToad.Pdf/Content/PageFactory.cs +++ b/src/UglyToad.Pdf/Content/PageFactory.cs @@ -40,6 +40,75 @@ throw new InvalidOperationException($"Page {number} had its type was specified as {type} rather than 'Page'."); } + MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers, isLenientParsing); + CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox); + + if (dictionary.GetItemOrDefault(CosName.RESOURCES) is PdfDictionary resource) + { + resourceStore.LoadResourceDictionary(resource, reader, isLenientParsing); + 
} + + UserSpaceUnit userSpaceUnit = GetUserSpaceUnits(dictionary); + + PageContent content = default(PageContent); + + var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject; + if (contentObject != null) + { + var contentStream = pdfObjectParser.Parse(contentObject.ToIndirectReference(), reader, false) as PdfRawStream; + + if (contentStream == null) + { + throw new InvalidOperationException("Failed to parse the content for the page: " + number); + } + + var contents = contentStream.Decode(filterProvider); + + var operations = pageContentParser.Parse(new ByteArrayInputBytes(contents)); + + var context = new ContentStreamProcessor(mediaBox.Bounds, resourceStore, userSpaceUnit); + + content = context.Process(operations); + } + + var page = new Page(number, mediaBox, cropBox, content); + + return page; + } + + private static UserSpaceUnit GetUserSpaceUnits(PdfDictionary dictionary) + { + var spaceUnits = UserSpaceUnit.Default; + if (dictionary.TryGetValue(CosName.USER_UNIT, out var userUnitCosBase) && userUnitCosBase is ICosNumber userUnitNumber) + { + spaceUnits = new UserSpaceUnit(userUnitNumber.AsInt()); + } + + return spaceUnits; + } + + private static CropBox GetCropBox(PdfDictionary dictionary, PageTreeMembers pageTreeMembers, MediaBox mediaBox) + { + CropBox cropBox; + if (dictionary.TryGetItemOfType(CosName.CROP_BOX, out COSArray cropBoxArray)) + { + var x1 = cropBoxArray.getInt(0); + var y1 = cropBoxArray.getInt(1); + var x2 = cropBoxArray.getInt(2); + var y2 = cropBoxArray.getInt(3); + + cropBox = new CropBox(new PdfRectangle(x1, y1, x2, y2)); + } + else + { + cropBox = pageTreeMembers.GetCropBox() ?? 
new CropBox(mediaBox.Bounds); + } + + return cropBox; + } + + private static MediaBox GetMediaBox(int number, PdfDictionary dictionary, PageTreeMembers pageTreeMembers, bool isLenientParsing) + { MediaBox mediaBox; if (dictionary.TryGetItemOfType(CosName.MEDIA_BOX, out COSArray mediaboxArray)) { @@ -67,35 +136,7 @@ } } - if (dictionary.GetItemOrDefault(CosName.RESOURCES) is PdfDictionary resource) - { - resourceStore.LoadResourceDictionary(resource, reader, isLenientParsing); - } - - PageContent content = default(PageContent); - - var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject; - if (contentObject != null) - { - var contentStream = pdfObjectParser.Parse(contentObject.ToIndirectReference(), reader, false) as PdfRawStream; - - if (contentStream == null) - { - throw new InvalidOperationException("Failed to parse the content for the page: " + number); - } - - var contents = contentStream.Decode(filterProvider); - - var operations = pageContentParser.Parse(new ByteArrayInputBytes(contents)); - - var context = new ContentStreamProcessor(mediaBox.Bounds, resourceStore); - - content = context.Process(operations); - } - - var page = new Page(number, mediaBox, content); - - return page; + return mediaBox; } } } diff --git a/src/UglyToad.Pdf/Content/PageTreeMembers.cs b/src/UglyToad.Pdf/Content/PageTreeMembers.cs index 5ce5c2f6..c53ace47 100644 --- a/src/UglyToad.Pdf/Content/PageTreeMembers.cs +++ b/src/UglyToad.Pdf/Content/PageTreeMembers.cs @@ -12,5 +12,10 @@ // TODO: tree inheritance throw new NotImplementedException("Track inherited members"); } + + public CropBox GetCropBox() + { + return null; + } } } \ No newline at end of file diff --git a/src/UglyToad.Pdf/Core/TransformationMatrix.cs b/src/UglyToad.Pdf/Core/TransformationMatrix.cs index 3fedd02d..8ff4d3ef 100644 --- a/src/UglyToad.Pdf/Core/TransformationMatrix.cs +++ b/src/UglyToad.Pdf/Core/TransformationMatrix.cs @@ -18,9 +18,15 @@ private readonly decimal[] value; + /// + /// The 
scale for the X dimension. + /// public decimal A => value[0]; public decimal B => value[1]; public decimal C => value[3]; + /// + /// The scale for the Y dimension. + /// public decimal D => value[4]; public decimal E => value[6]; public decimal F => value[7]; diff --git a/src/UglyToad.Pdf/Cos/CosName.cs b/src/UglyToad.Pdf/Cos/CosName.cs index 7eb00c72..1da1b4bb 100644 --- a/src/UglyToad.Pdf/Cos/CosName.cs +++ b/src/UglyToad.Pdf/Cos/CosName.cs @@ -518,6 +518,7 @@ namespace UglyToad.Pdf.Cos public static readonly CosName UNIX = new CosName("Unix"); public static readonly CosName URI = new CosName("URI"); public static readonly CosName URL = new CosName("URL"); + public static readonly CosName USER_UNIT = new CosName("UserUnit"); // V public static readonly CosName V = new CosName("V"); public static readonly CosName VERISIGN_PPKVS = new CosName("VeriSign.PPKVS"); diff --git a/src/UglyToad.Pdf/Geometry/UserSpaceUnit.cs b/src/UglyToad.Pdf/Geometry/UserSpaceUnit.cs new file mode 100644 index 00000000..de93793f --- /dev/null +++ b/src/UglyToad.Pdf/Geometry/UserSpaceUnit.cs @@ -0,0 +1,36 @@ +namespace UglyToad.Pdf.Geometry +{ + using System; + + /// + /// By default user space units correspond to 1/72nd of an inch (a typographic point). + /// The UserUnit entry in a page dictionary can define the space units as a different multiple of 1/72 (1 point). + /// + public struct UserSpaceUnit + { + public static readonly UserSpaceUnit Default = new UserSpaceUnit(1); + + /// + /// The number of points (1/72nd of an inch) corresponding to a single unit in user space. + /// + public int PointMultiples { get; } + + /// + /// Create a new unit specification for a page. 
+ /// + public UserSpaceUnit(int pointMultiples) + { + if (pointMultiples <= 0) + { + throw new ArgumentOutOfRangeException(nameof(pointMultiples), "Cannot have a zero or negative value of point multiples: " + pointMultiples); + } + + PointMultiples = pointMultiples; + } + + public override string ToString() + { + return PointMultiples.ToString(); + } + } +} diff --git a/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs b/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs index 15a27cf8..c4406bf3 100644 --- a/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs +++ b/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs @@ -13,6 +13,7 @@ internal class ContentStreamProcessor : IOperationContext { private readonly IResourceStore resourceStore; + private readonly UserSpaceUnit userSpaceUnit; private Stack graphicsStack = new Stack(); @@ -22,9 +23,10 @@ public List Letters = new List(); - public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore) + public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore, UserSpaceUnit userSpaceUnit) { this.resourceStore = resourceStore; + this.userSpaceUnit = userSpaceUnit; graphicsStack.Push(new CurrentGraphicsState()); } @@ -77,10 +79,18 @@ { var font = resourceStore.GetFont(GetCurrentState().FontState.FontName); - var fontSize = GetCurrentState().FontState.FontSize; - var horizontalScaling = GetCurrentState().FontState.HorizontalScaling; - var characterSpacing = GetCurrentState().FontState.CharacterSpacing; + var currentState = GetCurrentState(); + var fontSize = currentState.FontState.FontSize; + var horizontalScaling = currentState.FontState.HorizontalScaling; + var characterSpacing = currentState.FontState.CharacterSpacing; + + var transformationMatrix = currentState.CurrentTransformationMatrix; + + // TODO: this does not seem correct, produces the correct result for now but we need to revisit. 
+ // see: https://stackoverflow.com/questions/48010235/pdf-specification-get-font-size-in-points + var pointSize = decimal.Round(fontSize * transformationMatrix.A, 2); + while (bytes.MoveNext()) { var code = font.ReadCharacterCode(bytes, out int codeLength); @@ -102,7 +112,9 @@ var displacement = font.GetDisplacement(code); - ShowGlyph(renderingMatrix, font, code, unicode, displacement, fontSize); + var width = (displacement.X * fontSize) * transformationMatrix.A; + + ShowGlyph(renderingMatrix, font, code, unicode, width, fontSize, pointSize); decimal tx, ty; if (font.IsVertical) @@ -122,11 +134,12 @@ } } - private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font, int characterCode, string unicode, PdfVector displacement, decimal fontSize) + private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font, int characterCode, string unicode, decimal width, decimal fontSize, + decimal pointSize) { var location = new PdfPoint(renderingMatrix.E, renderingMatrix.F); - - var letter = new Letter(unicode, location, displacement.X, fontSize, font.Name.Name); + + var letter = new Letter(unicode, location, width, fontSize, font.Name.Name, pointSize); Letters.Add(letter); } diff --git a/src/UglyToad.Pdf/Graphics/Operations/TextState/SetFontAndSize.cs b/src/UglyToad.Pdf/Graphics/Operations/TextState/SetFontAndSize.cs index 6f1159ba..a309dbea 100644 --- a/src/UglyToad.Pdf/Graphics/Operations/TextState/SetFontAndSize.cs +++ b/src/UglyToad.Pdf/Graphics/Operations/TextState/SetFontAndSize.cs @@ -1,7 +1,9 @@ namespace UglyToad.Pdf.Graphics.Operations.TextState { + using System; using Content; using Cos; + using Util.JetBrains.Annotations; internal class SetFontAndSize : IGraphicsStateOperation { @@ -9,13 +11,21 @@ public string Operator => Symbol; + /// + /// The name of the font as defined in the resource dictionary. + /// + [NotNull] public CosName Font { get; } + /// + /// The font program defines glyphs for a standard size. 
This standard size is set so that each line of text will occupy 1 unit in user space. + /// The size is the scale factor used to scale glyphs from the standard size to the display size rather than the font size in points. + /// public decimal Size { get; } public SetFontAndSize(CosName font, decimal size) { - Font = font; + Font = font ?? throw new ArgumentNullException(nameof(font)); Size = size; }