add convenience method to retrieve all hyperlinks and their text from annotations on a page

This commit is contained in:
Eliot Jones
2019-12-18 11:41:02 +00:00
parent 777bf9b63d
commit 1fb416eee3
10 changed files with 233 additions and 11 deletions

View File

@@ -0,0 +1,52 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using Xunit;
/// <summary>
/// Integration tests run against the single page, OpenOffice-generated document containing hyperlinks.
/// </summary>
public class SinglePageHyperlinksOpenOffice
{
    private const string DocumentName = "Single Page Hyperlinks - from open office.pdf";

    // Resolves the on-disk location of the test document shared by every test in this class.
    private static string GetFilename() => IntegrationHelpers.GetDocumentPath(DocumentName);

    /// <summary>
    /// The text of page 1 matches the expected string exactly.
    /// </summary>
    [Fact]
    public void GetsCorrectText()
    {
        using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
        {
            var text = document.GetPage(1).Text;

            Assert.Equal("https://duckduckgo.com/ a link aboveGitHub", text);
        }
    }

    /// <summary>
    /// Page 1 exposes two hyperlinks, each with the expected text, URI, letters and annotation.
    /// </summary>
    [Fact]
    public void GetsHyperlinks()
    {
        using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
        {
            var hyperlinks = document.GetPage(1).GetHyperlinks();

            Assert.Equal(2, hyperlinks.Count);

            var duckDuckGo = hyperlinks[0];
            Assert.Equal("https://duckduckgo.com/", duckDuckGo.Text);
            Assert.Equal("https://duckduckgo.com/", duckDuckGo.Uri);
            // The expected letter count is one more than the length of Text: the URL plus a single space.
            // NOTE(review): presumably a trailing whitespace letter falls inside the link region - confirm.
            Assert.Equal("https://duckduckgo.com/ ".Length, duckDuckGo.Letters.Count);
            Assert.NotNull(duckDuckGo.Annotation);

            var gitHub = hyperlinks[1];
            Assert.Equal("GitHub", gitHub.Text);
            Assert.Equal("https://github.com/", gitHub.Uri);
            Assert.Equal(6, gitHub.Letters.Count);
            Assert.NotNull(gitHub.Annotation);
        }
    }
}
}

View File

@@ -55,5 +55,39 @@
Assert.Contains("Söderberg", page.Text);
}
}
/// <summary>
/// Page 1 exposes four hyperlinks, in order, each with the expected display text and target URI.
/// </summary>
[Fact]
public void GetsHyperlinks()
{
    // The first and last links on the page share this target.
    const string championshipUri = "https://en.wikipedia.org/wiki/Swedish_Touring_Car_Championship";

    using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var hyperlinks = document.GetPage(1).GetHyperlinks();

        Assert.Equal(4, hyperlinks.Count);

        var championship = hyperlinks[0];
        Assert.Equal("Swedish Touring Car Championship", championship.Text);
        Assert.Equal(championshipUri, championship.Uri);

        var season2005 = hyperlinks[1];
        Assert.Equal("2005", season2005.Text);
        Assert.Equal("https://en.wikipedia.org/wiki/2005_Swedish_Touring_Car_Championship", season2005.Uri);

        var season2007 = hyperlinks[2];
        Assert.Equal("2007", season2007.Text);
        Assert.Equal("https://en.wikipedia.org/wiki/2007_Swedish_Touring_Car_Championship", season2007.Uri);

        // The final link's text is a run of body text rather than a short label.
        var paragraph = hyperlinks[3];
        const string expectedParagraphText =
            "The 2006 Swedish Touring Car Championship season was the 11th Swedish Touring Car Championship (STCC) season. " +
            "In total nine racing weekends at six different circuits were held; each";
        Assert.Equal(expectedParagraphText, paragraph.Text);
        Assert.Equal(championshipUri, paragraph.Uri);
    }
}
}
}

View File

@@ -65,6 +65,7 @@
"UglyToad.PdfPig.Content.Catalog",
"UglyToad.PdfPig.Content.CropBox",
"UglyToad.PdfPig.Content.DocumentInformation",
"UglyToad.PdfPig.Content.Hyperlink",
"UglyToad.PdfPig.Content.InlineImage",
"UglyToad.PdfPig.Content.IPdfImage",
"UglyToad.PdfPig.Content.Letter",

View File

@@ -0,0 +1,65 @@
namespace UglyToad.PdfPig.Annotations
{
using System.Collections.Generic;
using System.Linq;
using Content;
using Geometry;
using Tokenization.Scanner;
using Tokens;
using Util;
/// <summary>
/// Builds <see cref="Hyperlink"/>s from the link annotations of a page.
/// </summary>
internal static class HyperlinkFactory
{
    /// <summary>
    /// Creates a <see cref="Hyperlink"/> for every annotation returned by the provider which is a
    /// link annotation with an action of type /URI and a URI entry. All other annotations are skipped.
    /// </summary>
    /// <param name="page">The page whose letters are matched against each link region.</param>
    /// <param name="pdfScanner">The scanner passed to each dictionary lookup.</param>
    /// <param name="annotationProvider">The source of the annotations to inspect.</param>
    /// <returns>The hyperlinks, in the order their annotations were returned by the provider.</returns>
    public static IReadOnlyList<Hyperlink> GetHyperlinks(Page page, IPdfTokenScanner pdfScanner, AnnotationProvider annotationProvider)
    {
        var hyperlinks = new List<Hyperlink>();

        foreach (var annotation in annotationProvider.GetAnnotations())
        {
            // Only link annotations can describe a hyperlink.
            if (annotation.Type != AnnotationType.Link)
            {
                continue;
            }

            // The link must carry an action dictionary (/A) ...
            if (!annotation.AnnotationDictionary.TryGet(NameToken.A, pdfScanner, out DictionaryToken action))
            {
                continue;
            }

            // ... whose action type (/S) is /URI.
            if (!action.TryGet(NameToken.S, pdfScanner, out NameToken actionType) || actionType != NameToken.Uri)
            {
                continue;
            }

            // (Required) The uniform resource identifier to resolve, encoded in 7-bit ASCII.
            if (!action.TryGet(NameToken.Uri, pdfScanner, out IDataToken<string> uriToken))
            {
                continue;
            }

            var linkArea = annotation.Rectangle;

            // Tolerance for letters close to the link region: the corners are offset by 0.5 in x only,
            // the y offset is zero.
            var searchArea = new PdfRectangle(linkArea.TopLeft.Translate(-0.5m, 0), linkArea.BottomRight.Translate(0.5m, 0));

            // Keep the letters, in page order, whose location lies in the widened region.
            var lettersInLink = page.Letters
                .Where(letter => searchArea.Contains(letter.Location, true))
                .ToList();

            // The display text is the words formed from those letters, separated by single spaces.
            var wordTexts = DefaultWordExtractor.Instance.GetWords(lettersInLink).Select(word => word.Text);
            var displayText = string.Join(" ", wordTexts);

            hyperlinks.Add(new Hyperlink(linkArea, lettersInLink, displayText, uriToken.Data, annotation));
        }

        return hyperlinks;
    }
}
}

View File

@@ -0,0 +1,57 @@
namespace UglyToad.PdfPig.Content
{
using System;
using System.Collections.Generic;
using Annotations;
using Geometry;
/// <summary>
/// A link annotation which references an external resource, together with
/// the page content found inside the link's clickable region.
/// </summary>
public class Hyperlink
{
    // Backing field for Uri; kept non-null by both the constructor and the property setter.
    private string uri;

    /// <summary>
    /// The area on the page which when clicked will open the hyperlink.
    /// </summary>
    public PdfRectangle Bounds { get; }

    /// <summary>
    /// The text in the link region (if any). This is an empty string if no text was provided.
    /// </summary>
    public string Text { get; }

    /// <summary>
    /// The letters in the link region.
    /// </summary>
    public IReadOnlyList<Letter> Letters { get; }

    /// <summary>
    /// The URI the link directs to. This is never <see langword="null"/>: assigning
    /// <see langword="null"/> stores an empty string, the same normalisation the constructor applies.
    /// </summary>
    public string Uri
    {
        get => uri;
        set => uri = value ?? string.Empty;
    }

    /// <summary>
    /// The underlying link annotation.
    /// </summary>
    public Annotation Annotation { get; }

    /// <summary>
    /// Create a new <see cref="Hyperlink"/>.
    /// </summary>
    /// <param name="bounds">The clickable area of the link on the page.</param>
    /// <param name="letters">The letters in the link region, must not be <see langword="null"/>.</param>
    /// <param name="text">The text in the link region, <see langword="null"/> is stored as an empty string.</param>
    /// <param name="uri">The URI the link directs to, <see langword="null"/> is stored as an empty string.</param>
    /// <param name="annotation">The underlying link annotation, must not be <see langword="null"/>.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="letters"/> or <paramref name="annotation"/> is <see langword="null"/>.
    /// </exception>
    public Hyperlink(PdfRectangle bounds, IReadOnlyList<Letter> letters, string text, string uri, Annotation annotation)
    {
        Bounds = bounds;
        Text = text ?? string.Empty;
        Letters = letters ?? throw new ArgumentNullException(nameof(letters));
        Uri = uri;
        Annotation = annotation ?? throw new ArgumentNullException(nameof(annotation));
    }

    /// <inheritdoc />
    public override string ToString()
    {
        return $"Link: {Text} ({Uri})";
    }
}
}

View File

@@ -9,12 +9,15 @@
using Util;
using Util.JetBrains.Annotations;
using Geometry;
using Tokenization.Scanner;
/// <summary>
/// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>.
/// </summary>
public class Page
{
private readonly AnnotationProvider annotationProvider;
private readonly IPdfTokenScanner pdfScanner;
private readonly Lazy<string> textLazy;
/// <summary>
@@ -78,13 +81,14 @@
public Experimental ExperimentalAccess { get; }
internal Page(int number, DictionaryToken dictionary, MediaBox mediaBox, CropBox cropBox, PageRotationDegrees rotation, PageContent content,
AnnotationProvider annotationProvider)
AnnotationProvider annotationProvider,
IPdfTokenScanner pdfScanner)
{
if (number <= 0)
{
throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative.");
}
Dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary));
Number = number;
@@ -99,6 +103,8 @@
Size = mediaBox.Bounds.GetPageSize();
ExperimentalAccess = new Experimental(this, annotationProvider);
this.annotationProvider = annotationProvider;
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
}
private static string GetText(PageContent content)
@@ -133,6 +139,15 @@
return (wordExtractor ?? DefaultWordExtractor.Instance).GetWords(Letters);
}
/// <summary>
/// Gets the hyperlinks on this page which point to external resources.
/// They are derived from the annotations on the page which have a type of '/Link'.
/// </summary>
/// <returns>The hyperlinks found for this page.</returns>
public IReadOnlyList<Hyperlink> GetHyperlinks()
    => HyperlinkFactory.GetHyperlinks(this, pdfScanner, annotationProvider);
/// <summary>
/// Gets any images on the page.
/// </summary>

View File

@@ -101,7 +101,7 @@
{
if (obj is PdfPoint point)
{
return point.X == this.X && point.Y == this.Y;
return point.X == X && point.Y == Y;
}
return false;
}
@@ -114,9 +114,7 @@
return (X, Y).GetHashCode();
}
/// <summary>
/// Get a string representation of this point.
/// </summary>
/// <inheritdoc />
public override string ToString()
{
return $"(x:{X}, y:{Y})";

View File

@@ -134,12 +134,10 @@
/// <returns>A new rectangle shifted on the y axis by the given delta value.</returns>
public PdfRectangle Translate(decimal dx, decimal dy)
{
return new PdfRectangle(this.BottomLeft.Translate(dx, dy), this.TopRight.Translate(dx, dy));
return new PdfRectangle(BottomLeft.Translate(dx, dy), TopRight.Translate(dx, dy));
}
/// <summary>
/// To string override.
/// </summary>
/// <inheritdoc />
public override string ToString()
{
return $"[{TopLeft}, {Width}, {Height}]";

View File

@@ -126,7 +126,9 @@
content = GetContent(bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
}
var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, new AnnotationProvider(pdfScanner, dictionary, isLenientParsing));
var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content,
new AnnotationProvider(pdfScanner, dictionary, isLenientParsing),
pdfScanner);
for (var i = 0; i < stackDepth; i++)
{