Add hyperlink extraction: Page.GetHyperlinks() and the Hyperlink content type.

What this patch does:

  HyperlinkFactory (new, internal) walks the annotations of a page and keeps those of type
  Link whose /A action dictionary has /S equal to /URI and carries a /URI string. For each
  one it collects the page letters contained in the annotation rectangle after widening it
  by 0.5 to the left and to the right, groups them with DefaultWordExtractor and joins the
  words with single spaces to form the link text.

  Hyperlink (new, public) is the read-only result: bounds, letters, text, URI and the
  underlying annotation. Null text or URI become the empty string; null letters or a null
  annotation throw ArgumentNullException.

  Page now keeps the AnnotationProvider and IPdfTokenScanner it is constructed with so that
  GetHyperlinks() can delegate to HyperlinkFactory. PageFactory passes the scanner through.

  PdfPoint and PdfRectangle drop redundant 'this.' qualifiers and use inheritdoc on ToString().

  Tests: a new single page OpenOffice document with two links, a hyperlink test for the
  existing Swedish Touring Car Championship document, and Hyperlink added to the public API list.

diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/Single Page Hyperlinks - from open office.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/Single Page Hyperlinks - from open office.pdf
new file mode 100644
index 00000000..cc183201
Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/Single Page Hyperlinks - from open office.pdf differ
diff --git a/src/UglyToad.PdfPig.Tests/Integration/SinglePageHyperlinksOpenOffice.cs b/src/UglyToad.PdfPig.Tests/Integration/SinglePageHyperlinksOpenOffice.cs
new file mode 100644
index 00000000..e0737914
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Integration/SinglePageHyperlinksOpenOffice.cs
@@ -0,0 +1,52 @@
+namespace UglyToad.PdfPig.Tests.Integration
+{
+    using Xunit;
+
+    public class SinglePageHyperlinksOpenOffice
+    {
+        private static string GetFilename()
+        {
+            return IntegrationHelpers.GetDocumentPath("Single Page Hyperlinks - from open office.pdf");
+        }
+
+        [Fact]
+        public void GetsCorrectText()
+        {
+            using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
+            {
+                var page = document.GetPage(1);
+
+                Assert.Equal("https://duckduckgo.com/ a link aboveGitHub", page.Text);
+            }
+        }
+
+        [Fact]
+        public void GetsHyperlinks()
+        {
+            using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
+            {
+                var page = document.GetPage(1);
+
+                var links = page.GetHyperlinks();
+
+                Assert.Equal(2, links.Count);
+
+                var ddg = links[0];
+
+                Assert.Equal("https://duckduckgo.com/", ddg.Text);
+                Assert.Equal("https://duckduckgo.com/", ddg.Uri);
+                Assert.Equal("https://duckduckgo.com/ ".Length, ddg.Letters.Count);
+
+                Assert.NotNull(ddg.Annotation);
+
+                var github = links[1];
+
+                Assert.Equal("GitHub", github.Text);
+                Assert.Equal("https://github.com/", github.Uri);
+                Assert.Equal(6, github.Letters.Count);
+
+                Assert.NotNull(github.Annotation);
+            }
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig.Tests/Integration/SwedishTouringCarChampionshipTests.cs b/src/UglyToad.PdfPig.Tests/Integration/SwedishTouringCarChampionshipTests.cs
index ddda24b2..9af4cb79 100644
--- a/src/UglyToad.PdfPig.Tests/Integration/SwedishTouringCarChampionshipTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/SwedishTouringCarChampionshipTests.cs
@@ -55,5 +55,39 @@
                 Assert.Contains("Söderberg", page.Text);
             }
         }
+
+        [Fact]
+        public void GetsHyperlinks()
+        {
+            using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
+            {
+                var page = document.GetPage(1);
+
+                var links = page.GetHyperlinks();
+
+                Assert.Equal(4, links.Count);
+
+                var pageLink = links[0];
+
+                Assert.Equal("Swedish Touring Car Championship", pageLink.Text);
+                Assert.Equal("https://en.wikipedia.org/wiki/Swedish_Touring_Car_Championship", pageLink.Uri);
+
+                var year2005 = links[1];
+
+                Assert.Equal("2005", year2005.Text);
+                Assert.Equal("https://en.wikipedia.org/wiki/2005_Swedish_Touring_Car_Championship", year2005.Uri);
+
+                var year2007 = links[2];
+
+                Assert.Equal("2007", year2007.Text);
+                Assert.Equal("https://en.wikipedia.org/wiki/2007_Swedish_Touring_Car_Championship", year2007.Uri);
+
+                var fullLink = links[3];
+
+                Assert.Equal("The 2006 Swedish Touring Car Championship season was the 11th Swedish Touring Car Championship (STCC) season. " +
+                             "In total nine racing weekends at six different circuits were held; each", fullLink.Text);
+                Assert.Equal("https://en.wikipedia.org/wiki/Swedish_Touring_Car_Championship", fullLink.Uri);
+            }
+        }
     }
 }
diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
index 84ff6e61..4a8f7575 100644
--- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs
@@ -65,6 +65,7 @@
                 "UglyToad.PdfPig.Content.Catalog",
                 "UglyToad.PdfPig.Content.CropBox",
                 "UglyToad.PdfPig.Content.DocumentInformation",
+                "UglyToad.PdfPig.Content.Hyperlink",
                 "UglyToad.PdfPig.Content.InlineImage",
                 "UglyToad.PdfPig.Content.IPdfImage",
                 "UglyToad.PdfPig.Content.Letter",
diff --git a/src/UglyToad.PdfPig/Annotations/HyperlinkFactory.cs b/src/UglyToad.PdfPig/Annotations/HyperlinkFactory.cs
new file mode 100644
index 00000000..f9fa0526
--- /dev/null
+++ b/src/UglyToad.PdfPig/Annotations/HyperlinkFactory.cs
@@ -0,0 +1,65 @@
+namespace UglyToad.PdfPig.Annotations
+{
+    using System.Collections.Generic;
+    using System.Linq;
+    using Content;
+    using Geometry;
+    using Tokenization.Scanner;
+    using Tokens;
+    using Util;
+
+    internal static class HyperlinkFactory
+    {
+        public static IReadOnlyList<Hyperlink> GetHyperlinks(Page page, IPdfTokenScanner pdfScanner, AnnotationProvider annotationProvider)
+        {
+            var result = new List<Hyperlink>();
+
+            var annotations = annotationProvider.GetAnnotations();
+
+            foreach (var annotation in annotations)
+            {
+                if (annotation.Type != AnnotationType.Link)
+                {
+                    continue;
+                }
+
+                // Must be a link annotation with an action of type /URI.
+                if (!annotation.AnnotationDictionary.TryGet(NameToken.A, pdfScanner, out DictionaryToken actionDictionary)
+                    || !actionDictionary.TryGet(NameToken.S, pdfScanner, out NameToken actionType)
+                    || actionType != NameToken.Uri)
+                {
+                    continue;
+                }
+
+                // (Required) The uniform resource identifier to resolve, encoded in 7-bit ASCII.
+                if (!actionDictionary.TryGet(NameToken.Uri, pdfScanner, out IDataToken<string> uriStringToken))
+                {
+                    continue;
+                }
+
+                var bounds = annotation.Rectangle;
+
+                // Build in tolerance for letters close to the link region.
+                var tolerantBounds = new PdfRectangle(bounds.TopLeft.Translate(-0.5m, 0), bounds.BottomRight.Translate(0.5m, 0));
+
+                var linkLetters = new List<Letter>();
+
+                foreach (var letter in page.Letters)
+                {
+                    if (tolerantBounds.Contains(letter.Location, true))
+                    {
+                        linkLetters.Add(letter);
+                    }
+                }
+
+                var words = DefaultWordExtractor.Instance.GetWords(linkLetters);
+
+                var presentationText = string.Join(" ", words.Select(x => x.Text));
+
+                result.Add(new Hyperlink(bounds, linkLetters, presentationText, uriStringToken.Data, annotation));
+            }
+
+            return result;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Content/Hyperlink.cs b/src/UglyToad.PdfPig/Content/Hyperlink.cs
new file mode 100644
index 00000000..01895900
--- /dev/null
+++ b/src/UglyToad.PdfPig/Content/Hyperlink.cs
@@ -0,0 +1,57 @@
+namespace UglyToad.PdfPig.Content
+{
+    using System;
+    using System.Collections.Generic;
+    using Annotations;
+    using Geometry;
+
+    /// <summary>
+    /// A link to an external resource in a document, with full details of the
+    /// link annotation which references it.
+    /// </summary>
+    public class Hyperlink
+    {
+        /// <summary>
+        /// The area on the page which when clicked will open the hyperlink.
+        /// </summary>
+        public PdfRectangle Bounds { get; }
+
+        /// <summary>
+        /// The text in the link region (if any).
+        /// </summary>
+        public string Text { get; }
+
+        /// <summary>
+        /// The letters in the link region.
+        /// </summary>
+        public IReadOnlyList<Letter> Letters { get; }
+
+        /// <summary>
+        /// The URI the link directs to.
+        /// </summary>
+        public string Uri { get; }
+
+        /// <summary>
+        /// The underlying link annotation.
+        /// </summary>
+        public Annotation Annotation { get; }
+
+        /// <summary>
+        /// Create a new <see cref="Hyperlink"/>.
+        /// </summary>
+        public Hyperlink(PdfRectangle bounds, IReadOnlyList<Letter> letters, string text, string uri, Annotation annotation)
+        {
+            Bounds = bounds;
+            Text = text ?? string.Empty;
+            Letters = letters ?? throw new ArgumentNullException(nameof(letters));
+            Uri = uri ?? string.Empty;
+            Annotation = annotation ?? throw new ArgumentNullException(nameof(annotation));
+        }
+
+        /// <inheritdoc />
+        public override string ToString()
+        {
+            return $"Link: {Text} ({Uri})";
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Content/Page.cs b/src/UglyToad.PdfPig/Content/Page.cs
index b7843679..375229ef 100644
--- a/src/UglyToad.PdfPig/Content/Page.cs
+++ b/src/UglyToad.PdfPig/Content/Page.cs
@@ -9,12 +9,15 @@
     using Util;
     using Util.JetBrains.Annotations;
     using Geometry;
+    using Tokenization.Scanner;
 
     /// <summary>
     /// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>.
     /// </summary>
     public class Page
     {
+        private readonly AnnotationProvider annotationProvider;
+        private readonly IPdfTokenScanner pdfScanner;
         private readonly Lazy<string> textLazy;
 
         /// <summary>
@@ -78,13 +81,14 @@
         public Experimental ExperimentalAccess { get; }
 
         internal Page(int number, DictionaryToken dictionary, MediaBox mediaBox, CropBox cropBox, PageRotationDegrees rotation, PageContent content,
-            AnnotationProvider annotationProvider)
+            AnnotationProvider annotationProvider,
+            IPdfTokenScanner pdfScanner)
         {
             if (number <= 0)
             {
                 throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative.");
             }
-            
+
             Dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary));
 
             Number = number;
@@ -99,6 +103,8 @@
 
             Size = mediaBox.Bounds.GetPageSize();
             ExperimentalAccess = new Experimental(this, annotationProvider);
+            this.annotationProvider = annotationProvider;
+            this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
         }
 
         private static string GetText(PageContent content)
@@ -133,6 +139,15 @@
             return (wordExtractor ?? DefaultWordExtractor.Instance).GetWords(Letters);
         }
 
+        /// <summary>
+        /// Get the hyperlinks which link to external resources on the page.
+        /// These are based on the annotations on the page with a type of '/Link'.
+        /// </summary>
+        public IReadOnlyList<Hyperlink> GetHyperlinks()
+        {
+            return HyperlinkFactory.GetHyperlinks(this, pdfScanner, annotationProvider);
+        }
+
         /// <summary>
         /// Gets any images on the page.
         /// </summary>
diff --git a/src/UglyToad.PdfPig/Geometry/PdfPoint.cs b/src/UglyToad.PdfPig/Geometry/PdfPoint.cs
index 8fd13aed..9663e0ac 100644
--- a/src/UglyToad.PdfPig/Geometry/PdfPoint.cs
+++ b/src/UglyToad.PdfPig/Geometry/PdfPoint.cs
@@ -101,7 +101,7 @@
         {
             if (obj is PdfPoint point)
             {
-                return point.X == this.X && point.Y == this.Y;
+                return point.X == X && point.Y == Y;
             }
             return false;
         }
@@ -114,9 +114,7 @@
             return (X, Y).GetHashCode();
         }
 
-        /// <summary>
-        /// Get a string representation of this point.
-        /// </summary>
+        /// <inheritdoc />
         public override string ToString()
         {
             return $"(x:{X}, y:{Y})";
diff --git a/src/UglyToad.PdfPig/Geometry/PdfRectangle.cs b/src/UglyToad.PdfPig/Geometry/PdfRectangle.cs
index 5ccc8430..f50f5385 100644
--- a/src/UglyToad.PdfPig/Geometry/PdfRectangle.cs
+++ b/src/UglyToad.PdfPig/Geometry/PdfRectangle.cs
@@ -134,12 +134,10 @@
         /// <returns>A new rectangle shifted on the y axis by the given delta value.</returns>
         public PdfRectangle Translate(decimal dx, decimal dy)
         {
-            return new PdfRectangle(this.BottomLeft.Translate(dx, dy), this.TopRight.Translate(dx, dy));
+            return new PdfRectangle(BottomLeft.Translate(dx, dy), TopRight.Translate(dx, dy));
         }
 
-        /// <summary>
-        /// To string override.
-        /// </summary>
+        /// <inheritdoc />
         public override string ToString()
         {
             return $"[{TopLeft}, {Width}, {Height}]";
diff --git a/src/UglyToad.PdfPig/Parser/PageFactory.cs b/src/UglyToad.PdfPig/Parser/PageFactory.cs
index e3dc71c5..be0336fe 100644
--- a/src/UglyToad.PdfPig/Parser/PageFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PageFactory.cs
@@ -126,7 +126,9 @@
                 content = GetContent(bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
             }
 
-            var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, new AnnotationProvider(pdfScanner, dictionary, isLenientParsing));
+            var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content,
+                new AnnotationProvider(pdfScanner, dictionary, isLenientParsing),
+                pdfScanner);
 
             for (var i = 0; i < stackDepth; i++)
             {