mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
add convenience method to retrieve all hyperlinks and their text from annotations on a page
This commit is contained in:
Binary file not shown.
@@ -0,0 +1,52 @@
|
||||
namespace UglyToad.PdfPig.Tests.Integration
{
    using Xunit;

    /// <summary>
    /// Integration tests which read the text and the hyperlinks from the
    /// "Single Page Hyperlinks - from open office.pdf" test document.
    /// </summary>
    public class SinglePageHyperlinksOpenOffice
    {
        /// <summary>
        /// Resolves the path of the test document via <see cref="IntegrationHelpers"/>.
        /// </summary>
        private static string GetFilename() =>
            IntegrationHelpers.GetDocumentPath("Single Page Hyperlinks - from open office.pdf");

        /// <summary>
        /// The text of the first page must match the expected string exactly.
        /// </summary>
        [Fact]
        public void GetsCorrectText()
        {
            using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
            {
                var pageText = pdf.GetPage(1).Text;

                Assert.Equal("https://duckduckgo.com/ a link aboveGitHub", pageText);
            }
        }

        /// <summary>
        /// The first page must expose exactly two hyperlinks, each with the expected
        /// text, URI, letter count and a non-null underlying annotation.
        /// </summary>
        [Fact]
        public void GetsHyperlinks()
        {
            using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
            {
                var hyperlinks = pdf.GetPage(1).GetHyperlinks();

                Assert.Equal(2, hyperlinks.Count);

                var duckDuckGo = hyperlinks[0];

                Assert.Equal("https://duckduckgo.com/", duckDuckGo.Text);
                Assert.Equal("https://duckduckgo.com/", duckDuckGo.Uri);

                // The expected letter count is one more than the text length: the literal below ends with a space.
                // NOTE(review): presumably the link region also covers a trailing space letter - confirm against the document.
                Assert.Equal("https://duckduckgo.com/ ".Length, duckDuckGo.Letters.Count);

                Assert.NotNull(duckDuckGo.Annotation);

                var gitHub = hyperlinks[1];

                Assert.Equal("GitHub", gitHub.Text);
                Assert.Equal("https://github.com/", gitHub.Uri);
                Assert.Equal(6, gitHub.Letters.Count);

                Assert.NotNull(gitHub.Annotation);
            }
        }
    }
}
|
||||
@@ -55,5 +55,39 @@
|
||||
Assert.Contains("Söderberg", page.Text);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// The first page must expose exactly four hyperlinks with the expected text and URI, in order.
/// </summary>
[Fact]
public void GetsHyperlinks()
{
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var links = pdf.GetPage(1).GetHyperlinks();

        Assert.Equal(4, links.Count);

        // Link to the championship page.
        Assert.Equal("Swedish Touring Car Championship", links[0].Text);
        Assert.Equal("https://en.wikipedia.org/wiki/Swedish_Touring_Car_Championship", links[0].Uri);

        // Link to the 2005 season.
        Assert.Equal("2005", links[1].Text);
        Assert.Equal("https://en.wikipedia.org/wiki/2005_Swedish_Touring_Car_Championship", links[1].Uri);

        // Link to the 2007 season.
        Assert.Equal("2007", links[2].Text);
        Assert.Equal("https://en.wikipedia.org/wiki/2007_Swedish_Touring_Car_Championship", links[2].Uri);

        // Link whose text spans a longer run of the page's text.
        const string expectedFullText =
            "The 2006 Swedish Touring Car Championship season was the 11th Swedish Touring Car Championship (STCC) season. " +
            "In total nine racing weekends at six different circuits were held; each";

        Assert.Equal(expectedFullText, links[3].Text);
        Assert.Equal("https://en.wikipedia.org/wiki/Swedish_Touring_Car_Championship", links[3].Uri);
    }
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,6 +65,7 @@
|
||||
"UglyToad.PdfPig.Content.Catalog",
|
||||
"UglyToad.PdfPig.Content.CropBox",
|
||||
"UglyToad.PdfPig.Content.DocumentInformation",
|
||||
"UglyToad.PdfPig.Content.Hyperlink",
|
||||
"UglyToad.PdfPig.Content.InlineImage",
|
||||
"UglyToad.PdfPig.Content.IPdfImage",
|
||||
"UglyToad.PdfPig.Content.Letter",
|
||||
|
||||
65
src/UglyToad.PdfPig/Annotations/HyperlinkFactory.cs
Normal file
65
src/UglyToad.PdfPig/Annotations/HyperlinkFactory.cs
Normal file
@@ -0,0 +1,65 @@
|
||||
namespace UglyToad.PdfPig.Annotations
{
    using System.Collections.Generic;
    using System.Linq;
    using Content;
    using Geometry;
    using Tokenization.Scanner;
    using Tokens;
    using Util;

    /// <summary>
    /// Builds <see cref="Hyperlink"/>s from the link annotations of a page.
    /// </summary>
    internal static class HyperlinkFactory
    {
        /// <summary>
        /// Creates a <see cref="Hyperlink"/> for every link annotation returned by the
        /// <paramref name="annotationProvider"/> which has an action of type /URI with a URI entry.
        /// </summary>
        /// <param name="page">The page whose letters are matched against each link region.</param>
        /// <param name="pdfScanner">The scanner passed to the dictionary lookups.</param>
        /// <param name="annotationProvider">The source of the annotations to inspect.</param>
        /// <returns>The hyperlinks, in the order their annotations were returned by the provider.</returns>
        public static IReadOnlyList<Hyperlink> GetHyperlinks(Page page, IPdfTokenScanner pdfScanner, AnnotationProvider annotationProvider)
        {
            var hyperlinks = new List<Hyperlink>();

            foreach (var annotation in annotationProvider.GetAnnotations())
            {
                if (annotation.Type != AnnotationType.Link)
                {
                    continue;
                }

                if (!TryGetUri(annotation.AnnotationDictionary, pdfScanner, out var uri))
                {
                    continue;
                }

                var region = annotation.Rectangle;

                // Widen the region by 0.5 on the left and on the right so letters close to the link region are included.
                var widenedTopLeft = region.TopLeft.Translate(-0.5m, 0);
                var widenedBottomRight = region.BottomRight.Translate(0.5m, 0);
                var widenedRegion = new PdfRectangle(widenedTopLeft, widenedBottomRight);

                var lettersInRegion = GetLettersInRegion(page, widenedRegion);

                // The presentation text is the words formed from the letters in the region, separated by single spaces.
                var words = DefaultWordExtractor.Instance.GetWords(lettersInRegion);
                var text = string.Join(" ", words.Select(x => x.Text));

                hyperlinks.Add(new Hyperlink(region, lettersInRegion, text, uri, annotation));
            }

            return hyperlinks;
        }

        /// <summary>
        /// Reads the URI from an annotation dictionary whose action (/A) has a type (/S) of /URI.
        /// </summary>
        /// <returns><see langword="false"/> if the action, its type or its URI entry cannot be retrieved, or the action is not of type /URI.</returns>
        private static bool TryGetUri(DictionaryToken annotationDictionary, IPdfTokenScanner pdfScanner, out string uri)
        {
            uri = null;

            if (!annotationDictionary.TryGet(NameToken.A, pdfScanner, out DictionaryToken action))
            {
                return false;
            }

            if (!action.TryGet(NameToken.S, pdfScanner, out NameToken actionKind))
            {
                return false;
            }

            if (actionKind != NameToken.Uri)
            {
                return false;
            }

            // (Required) The uniform resource identifier to resolve, encoded in 7-bit ASCII.
            if (!action.TryGet(NameToken.Uri, pdfScanner, out IDataToken<string> uriToken))
            {
                return false;
            }

            uri = uriToken.Data;

            return true;
        }

        /// <summary>
        /// Collects the letters of the page whose location is contained in the given region, in page order.
        /// </summary>
        private static List<Letter> GetLettersInRegion(Page page, PdfRectangle region)
        {
            return page.Letters
                .Where(candidate => region.Contains(candidate.Location, true))
                .ToList();
        }
    }
}
|
||||
57
src/UglyToad.PdfPig/Content/Hyperlink.cs
Normal file
57
src/UglyToad.PdfPig/Content/Hyperlink.cs
Normal file
@@ -0,0 +1,57 @@
|
||||
namespace UglyToad.PdfPig.Content
{
    using System;
    using System.Collections.Generic;
    using Annotations;
    using Geometry;

    /// <summary>
    /// A link to an external resource in a document, carrying the full details
    /// of the link annotation which references that resource.
    /// </summary>
    public class Hyperlink
    {
        /// <summary>
        /// The region of the page which opens the hyperlink when clicked.
        /// </summary>
        public PdfRectangle Bounds { get; }

        /// <summary>
        /// The text found in the link region, empty if none was provided.
        /// </summary>
        public string Text { get; }

        /// <summary>
        /// The letters found in the link region.
        /// </summary>
        public IReadOnlyList<Letter> Letters { get; }

        /// <summary>
        /// The URI this link directs to, empty if none was provided on construction.
        /// </summary>
        // NOTE(review): this is the only property with a public setter - confirm that is intended.
        public string Uri { get; set; }

        /// <summary>
        /// The link annotation this hyperlink was created from.
        /// </summary>
        public Annotation Annotation { get; }

        /// <summary>
        /// Create a new <see cref="Hyperlink"/>.
        /// </summary>
        /// <param name="bounds">The clickable region of the link.</param>
        /// <param name="letters">The letters in the link region, must not be <see langword="null"/>.</param>
        /// <param name="text">The text in the link region, <see langword="null"/> is stored as an empty string.</param>
        /// <param name="uri">The target of the link, <see langword="null"/> is stored as an empty string.</param>
        /// <param name="annotation">The underlying link annotation, must not be <see langword="null"/>.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="letters"/> or <paramref name="annotation"/> is <see langword="null"/>.
        /// </exception>
        public Hyperlink(PdfRectangle bounds, IReadOnlyList<Letter> letters, string text, string uri, Annotation annotation)
        {
            // Validate in the same order as the assignments below: letters first, then annotation.
            if (letters == null)
            {
                throw new ArgumentNullException(nameof(letters));
            }

            if (annotation == null)
            {
                throw new ArgumentNullException(nameof(annotation));
            }

            Bounds = bounds;
            Text = text ?? string.Empty;
            Letters = letters;
            Uri = uri ?? string.Empty;
            Annotation = annotation;
        }

        /// <inheritdoc />
        public override string ToString() => $"Link: {Text} ({Uri})";
    }
}
|
||||
@@ -9,12 +9,15 @@
|
||||
using Util;
|
||||
using Util.JetBrains.Annotations;
|
||||
using Geometry;
|
||||
using Tokenization.Scanner;
|
||||
|
||||
/// <summary>
|
||||
/// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>.
|
||||
/// </summary>
|
||||
public class Page
|
||||
{
|
||||
private readonly AnnotationProvider annotationProvider;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
private readonly Lazy<string> textLazy;
|
||||
|
||||
/// <summary>
|
||||
@@ -78,13 +81,14 @@
|
||||
public Experimental ExperimentalAccess { get; }
|
||||
|
||||
internal Page(int number, DictionaryToken dictionary, MediaBox mediaBox, CropBox cropBox, PageRotationDegrees rotation, PageContent content,
|
||||
AnnotationProvider annotationProvider)
|
||||
AnnotationProvider annotationProvider,
|
||||
IPdfTokenScanner pdfScanner)
|
||||
{
|
||||
if (number <= 0)
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(number), "Page number cannot be 0 or negative.");
|
||||
}
|
||||
|
||||
|
||||
Dictionary = dictionary ?? throw new ArgumentNullException(nameof(dictionary));
|
||||
|
||||
Number = number;
|
||||
@@ -99,6 +103,8 @@
|
||||
|
||||
Size = mediaBox.Bounds.GetPageSize();
|
||||
ExperimentalAccess = new Experimental(this, annotationProvider);
|
||||
this.annotationProvider = annotationProvider;
|
||||
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
|
||||
}
|
||||
|
||||
private static string GetText(PageContent content)
|
||||
@@ -133,6 +139,15 @@
|
||||
return (wordExtractor ?? DefaultWordExtractor.Instance).GetWords(Letters);
|
||||
}
|
||||
|
||||
/// <summary>
/// Gets the hyperlinks on this page which link to external resources.
/// The hyperlinks are derived from the page's annotations with a type of '/Link'.
/// </summary>
/// <returns>The hyperlinks produced for this page by <see cref="HyperlinkFactory"/>.</returns>
public IReadOnlyList<Hyperlink> GetHyperlinks() => HyperlinkFactory.GetHyperlinks(this, pdfScanner, annotationProvider);
|
||||
|
||||
/// <summary>
|
||||
/// Gets any images on the page.
|
||||
/// </summary>
|
||||
|
||||
@@ -101,7 +101,7 @@
|
||||
{
|
||||
if (obj is PdfPoint point)
|
||||
{
|
||||
return point.X == this.X && point.Y == this.Y;
|
||||
return point.X == X && point.Y == Y;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -114,9 +114,7 @@
|
||||
return (X, Y).GetHashCode();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get a string representation of this point.
|
||||
/// </summary>
|
||||
/// <inheritdoc />
|
||||
public override string ToString()
|
||||
{
|
||||
return $"(x:{X}, y:{Y})";
|
||||
|
||||
@@ -134,12 +134,10 @@
|
||||
/// <returns>A new rectangle shifted on the y axis by the given delta value.</returns>
|
||||
public PdfRectangle Translate(decimal dx, decimal dy)
|
||||
{
|
||||
return new PdfRectangle(this.BottomLeft.Translate(dx, dy), this.TopRight.Translate(dx, dy));
|
||||
return new PdfRectangle(BottomLeft.Translate(dx, dy), TopRight.Translate(dx, dy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// To string override.
|
||||
/// </summary>
|
||||
/// <inheritdoc />
|
||||
public override string ToString()
|
||||
{
|
||||
return $"[{TopLeft}, {Width}, {Height}]";
|
||||
|
||||
@@ -126,7 +126,9 @@
|
||||
content = GetContent(bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
|
||||
}
|
||||
|
||||
var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, new AnnotationProvider(pdfScanner, dictionary, isLenientParsing));
|
||||
var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content,
|
||||
new AnnotationProvider(pdfScanner, dictionary, isLenientParsing),
|
||||
pdfScanner);
|
||||
|
||||
for (var i = 0; i < stackDepth; i++)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user