mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-14 10:55:04 +08:00
#15 add classes to extract words and initial tests
This commit is contained in:
@@ -29,6 +29,61 @@
|
||||
}
|
||||
}
|
||||
|
||||
// Verifies that the default word extraction on page 1 yields exactly the expected words, in order.
[Fact]
public void Page1HasCorrectWords()
{
    using (var pdf = PdfDocument.Open(GetFilename()))
    {
        var firstPage = pdf.GetPage(1);

        var extractedWords = firstPage.GetWords().ToList();

        // Expected text of each word, in the order the extractor should produce them.
        var expectedWords = new List<string>
        {
            "European", "Comission",
            "Farmer's", "Hand", "Book", "on", "Pig", "Production",
            "(For", "the", "small", "holders", "at", "village", "level)",
            "GCP/NEP/065/EC",
            "Food", "and", "Agriculture", "Organization", "of", "the", "United", "Nations"
        };

        Assert.Equal(expectedWords, extractedWords.Select(word => word.Text));
    }
}
|
||||
|
||||
// Verifies that the words extracted from page 4 match the WordsPage4 text once that text
// is split on line breaks and spaces.
[Fact]
public void Page4HasCorrectWords()
{
    var separators = new[] {"\r", "\r\n", "\n", " "};
    var expectedWords = WordsPage4.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    using (var pdf = PdfDocument.Open(GetFilename()))
    {
        var extractedWords = pdf.GetPage(4).GetWords().ToList();

        Assert.Equal(expectedWords, extractedWords.Select(word => word.Text));
    }
}
|
||||
|
||||
[Fact]
|
||||
public void CanReadPage9()
|
||||
{
|
||||
@@ -243,5 +298,27 @@
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Expected text for page 4. Page4HasCorrectWords splits this on "\r", "\r\n", "\n" and " "
// (dropping empty entries) and compares the result with the extracted words, so the wording
// here is test data and must not be "corrected".
private const string WordsPage4 = @"Disclaimer
|
||||
The designations employed end the presentation of the material in this information
|
||||
product do not imply the expression of any opinion whatsoever on the part of the
|
||||
Food and Agriculture Organization of the United Nations (FAO) concerning the
|
||||
legal or development status of any country, territory, city or area of its authorities,
|
||||
or concerning the delimitation of its frontiers or boundaries. The mention of
|
||||
specific companies or products of manufacturers, whether or not these have been
|
||||
patented, does not imply that these have been endorsed or recommended by FAO
|
||||
in preference to others of similar nature that are not mentioned.
|
||||
The views expressed in this publication are those of the author(s) and do not
|
||||
necessarily reflects the views of FAO.
|
||||
All rights reserved. Reproduction and dissemination of materials in this information
|
||||
product for educational or other non-commercial purposes are authorized without
|
||||
any prior written permission from the copyright holders provided the source is
|
||||
fully acknowledged. Reproduction in this information product for resale or other
|
||||
commercial purposes is prohibited without written permission of the copyright
|
||||
holders. Applications for such permission should be addressed to: Chief, Electronic
|
||||
Publishing Policy and Support Branch Communication Division, FAO, Viale delle
|
||||
Terme di Caracalla, 00153 Rome, Italy or by e-mail to: copyright@fao.org
|
||||
FAO 2009
|
||||
design&print: wps, eMail: printnepal@gmail.com";
|
||||
}
|
||||
}
|
@@ -38,6 +38,7 @@
|
||||
"UglyToad.PdfPig.Content.Letter",
|
||||
"UglyToad.PdfPig.Content.Page",
|
||||
"UglyToad.PdfPig.Content.PageSize",
|
||||
"UglyToad.PdfPig.Content.Word",
|
||||
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
|
||||
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
|
||||
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
|
||||
@@ -54,7 +55,8 @@
|
||||
"UglyToad.PdfPig.Tokens.NumericToken",
|
||||
"UglyToad.PdfPig.Tokens.ObjectToken",
|
||||
"UglyToad.PdfPig.Tokens.StreamToken",
|
||||
"UglyToad.PdfPig.Tokens.StringToken"
|
||||
"UglyToad.PdfPig.Tokens.StringToken",
|
||||
"UglyToad.PdfPig.Util.IWordExtractor"
|
||||
};
|
||||
|
||||
foreach (var publicTypeName in publicTypeNames)
|
||||
|
@@ -3,6 +3,7 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Util;
|
||||
|
||||
/// <summary>
|
||||
/// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>.
|
||||
@@ -73,5 +74,20 @@
|
||||
|
||||
return string.Join(string.Empty, content.Letters.Select(x => x.Value));
|
||||
}
|
||||
|
||||
/// <summary>
/// Gets the words for this page using the default <see cref="IWordExtractor"/>.
/// </summary>
/// <returns>The words on this page, as produced by the default extractor.</returns>
public IEnumerable<Word> GetWords()
{
    return GetWords(DefaultWordExtractor.Instance);
}
|
||||
/// <summary>
/// Gets the words for this page using the supplied <see cref="IWordExtractor"/>.
/// </summary>
/// <param name="wordExtractor">
/// The word extractor used to generate words from this page's letters.
/// When <see langword="null"/> the default extractor is used instead.
/// </param>
/// <returns>The words on this page.</returns>
public IEnumerable<Word> GetWords(IWordExtractor wordExtractor)
{
    var extractor = wordExtractor ?? DefaultWordExtractor.Instance;

    return extractor.GetWords(Letters);
}
|
||||
}
|
||||
}
|
61
src/UglyToad.PdfPig/Content/Word.cs
Normal file
61
src/UglyToad.PdfPig/Content/Word.cs
Normal file
@@ -0,0 +1,61 @@
|
||||
namespace UglyToad.PdfPig.Content
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Geometry;
|
||||
|
||||
/// <summary>
/// A word on a page, built from one or more <see cref="Letter"/>s.
/// </summary>
public class Word
{
    /// <summary>
    /// The text of the word: the values of its letters concatenated in the order supplied.
    /// </summary>
    public string Text { get; }

    /// <summary>
    /// The rectangle for the word. It is built from the smallest letter origin X and Y,
    /// the largest (origin X + width) and the largest glyph rectangle top of the letters.
    /// </summary>
    public PdfRectangle BoundingBox { get; }

    /// <summary>
    /// The name of the font for the word, taken from its first letter.
    /// </summary>
    public string FontName { get; }

    /// <summary>
    /// Create a new <see cref="Word"/>.
    /// </summary>
    /// <param name="letters">The letters contained in the word; must contain at least one letter.</param>
    /// <exception cref="ArgumentNullException"><paramref name="letters"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="letters"/> is empty.</exception>
    public Word(IReadOnlyList<Letter> letters)
    {
        if (letters == null)
        {
            throw new ArgumentNullException(nameof(letters));
        }

        if (letters.Count == 0)
        {
            throw new ArgumentException("Empty letters provided.", nameof(letters));
        }

        Text = string.Concat(letters.Select(letter => letter.Value));

        // Seed the extents from the first letter, then widen them with the remaining letters.
        var first = letters[0];
        var left = first.Origin.X;
        var bottom = first.Origin.Y;
        var right = first.Origin.X + first.Width;
        var top = first.GlyphRectangle.Top;

        for (var i = 1; i < letters.Count; i++)
        {
            var current = letters[i];

            left = Math.Min(left, current.Origin.X);
            bottom = Math.Min(bottom, current.Origin.Y);
            right = Math.Max(right, current.Origin.X + current.Width);
            top = Math.Max(top, current.GlyphRectangle.Top);
        }

        BoundingBox = new PdfRectangle(left, bottom, right, top);
        FontName = first.FontName;
    }

    /// <inheritdoc />
    public override string ToString() => Text;
}
|
||||
}
|
107
src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs
Normal file
107
src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs
Normal file
@@ -0,0 +1,107 @@
|
||||
namespace UglyToad.PdfPig.Util
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Content;
|
||||
|
||||
/// <summary>
/// The built-in <see cref="IWordExtractor"/>. It orders the letters by descending origin Y and then
/// ascending origin X, and groups consecutive letters into words. A new word starts on whitespace,
/// on a large horizontal gap, when the next letter lies to the left of the previous one, or when the
/// font name or font size changes.
/// </summary>
internal class DefaultWordExtractor : IWordExtractor
{
    /// <summary>
    /// The single shared instance; the constructor is private so no other instance can be created.
    /// </summary>
    public static IWordExtractor Instance { get; } = new DefaultWordExtractor();

    private DefaultWordExtractor()
    {
    }

    /// <summary>
    /// Generate words from the input set of letters. The result is produced lazily.
    /// </summary>
    /// <param name="letters">The letters to generate words for.</param>
    /// <returns>The words found; whitespace letters are never part of a word.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="letters"/> is null.</exception>
    public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
    {
        // Validate here rather than inside the iterator so that a null argument is reported
        // at the call site instead of on first enumeration.
        if (letters == null)
        {
            throw new ArgumentNullException(nameof(letters));
        }

        return GetWordsIterator(letters);
    }

    // Lazily walks the ordered letters and yields a word each time a word boundary is detected.
    private static IEnumerable<Word> GetWordsIterator(IReadOnlyList<Letter> letters)
    {
        var lettersOrder = letters.OrderByDescending(x => x.Origin.Y)
            .ThenBy(x => x.Origin.X);

        var lettersSoFar = new List<Letter>(10);

        var y = default(decimal?);
        var lastX = default(decimal?);
        var lastLetter = default(Letter);
        foreach (var letter in lettersOrder)
        {
            // Both trackers are seeded from the very first letter seen, even if it is whitespace.
            if (!y.HasValue)
            {
                y = letter.Origin.Y;
            }

            if (!lastX.HasValue)
            {
                lastX = letter.Origin.X;
            }

            // Skip leading whitespace until the first non-whitespace letter starts the first word.
            if (lastLetter == null)
            {
                if (string.IsNullOrWhiteSpace(letter.Value))
                {
                    continue;
                }

                lettersSoFar.Add(letter);
                lastLetter = letter;
                continue;
            }

            // NOTE(review): the letters are ordered by descending Y and 'y' is only reassigned inside
            // this branch, so this condition does not look reachable; line changes appear to be
            // picked up by 'nextToLeft' below instead. Confirm whether a downward comparison was intended.
            if (letter.Origin.Y > y.Value + 0.5m)
            {
                if (lettersSoFar.Count > 0)
                {
                    yield return GenerateWord(lettersSoFar);
                    lettersSoFar.Clear();
                }

                if (!string.IsNullOrWhiteSpace(letter.Value))
                {
                    lettersSoFar.Add(letter);
                }

                y = letter.Origin.Y;
                lastX = letter.Origin.X;
                lastLetter = letter;

                continue;
            }

            // Horizontal distance between the end of the previous letter and the start of this one.
            var gap = letter.Origin.X - (lastLetter.Origin.X + lastLetter.Width);
            // This letter starts more than 1 unit to the left of where the previous letter started.
            var nextToLeft = letter.Origin.X < lastX.Value - 1;
            // The gap exceeds 90% of the wider of the two letters.
            var nextBigSpace = gap > Math.Max(lastLetter.Width, letter.Width) * 0.9m;
            var nextIsWhiteSpace = string.IsNullOrWhiteSpace(letter.Value);
            // A font name change only splits the word when there is also a small gap.
            var nextFontDiffers = !string.Equals(letter.FontName, lastLetter.FontName, StringComparison.OrdinalIgnoreCase) && gap > letter.Width * 0.1m;
            var nextFontSizeDiffers = Math.Abs(letter.FontSize - lastLetter.FontSize) > 0.1m;

            if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers)
            {
                if (lettersSoFar.Count > 0)
                {
                    yield return GenerateWord(lettersSoFar);
                    lettersSoFar.Clear();
                }
            }

            if (!string.IsNullOrWhiteSpace(letter.Value))
            {
                lettersSoFar.Add(letter);
            }

            lastLetter = letter;

            lastX = letter.Origin.X;
        }

        // Emit whatever is left once the letters run out.
        if (lettersSoFar.Count > 0)
        {
            yield return GenerateWord(lettersSoFar);
        }
    }

    // Builds the word immediately; the caller clears and reuses the list afterwards.
    private static Word GenerateWord(List<Letter> letters)
    {
        return new Word(letters);
    }
}
|
||||
}
|
18
src/UglyToad.PdfPig/Util/IWordExtractor.cs
Normal file
18
src/UglyToad.PdfPig/Util/IWordExtractor.cs
Normal file
@@ -0,0 +1,18 @@
|
||||
namespace UglyToad.PdfPig.Util
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using Content;
|
||||
|
||||
/// <summary>
/// A strategy for grouping the letters of a page into words.
/// </summary>
public interface IWordExtractor
{
    /// <summary>
    /// Produce the words made up by the supplied letters.
    /// </summary>
    /// <param name="letters">The letters to group into words.</param>
    /// <returns>The words this strategy finds in <paramref name="letters"/>.</returns>
    IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters);
}
|
||||
}
|
Reference in New Issue
Block a user