#15 add classes to extract words and initial tests

This commit is contained in:
Eliot Jones
2018-11-24 20:51:27 +00:00
parent c56da9a25a
commit 17909f8565
6 changed files with 282 additions and 1 deletions

View File

@@ -29,6 +29,61 @@
}
}
[Fact]
public void Page1HasCorrectWords()
{
    // The words expected from page 1, in the order the extractor should produce them.
    var expected = new[]
    {
        "European", "Comission",
        "Farmer's", "Hand", "Book", "on", "Pig", "Production",
        "(For", "the", "small", "holders", "at", "village", "level)",
        "GCP/NEP/065/EC",
        "Food", "and", "Agriculture", "Organization", "of", "the", "United", "Nations"
    };

    using (var document = PdfDocument.Open(GetFilename()))
    {
        var actual = document.GetPage(1)
            .GetWords()
            .Select(word => word.Text)
            .ToList();

        Assert.Equal(expected, actual);
    }
}
[Fact]
public void Page4HasCorrectWords()
{
    // The expected words are the reference text broken up on line breaks and spaces.
    var separators = new[] {"\r", "\r\n", "\n", " "};
    var expected = WordsPage4.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    using (var document = PdfDocument.Open(GetFilename()))
    {
        var actual = document.GetPage(4)
            .GetWords()
            .Select(word => word.Text)
            .ToList();

        Assert.Equal(expected, actual);
    }
}
[Fact]
public void CanReadPage9()
{
@@ -243,5 +298,27 @@
return result;
}
// Reference text for page 4 of the test document. Page4HasCorrectWords splits this on
// line breaks and spaces to build the expected word sequence, so the content (including
// its spelling and punctuation) must not be edited unless the expected extractor output changes.
private const string WordsPage4 = @"Disclaimer
The designations employed end the presentation of the material in this information
product do not imply the expression of any opinion whatsoever on the part of the
Food and Agriculture Organization of the United Nations (FAO) concerning the
legal or development status of any country, territory, city or area of its authorities,
or concerning the delimitation of its frontiers or boundaries. The mention of
specific companies or products of manufacturers, whether or not these have been
patented, does not imply that these have been endorsed or recommended by FAO
in preference to others of similar nature that are not mentioned.
The views expressed in this publication are those of the author(s) and do not
necessarily reflects the views of FAO.
All rights reserved. Reproduction and dissemination of materials in this information
product for educational or other non-commercial purposes are authorized without
any prior written permission from the copyright holders provided the source is
fully acknowledged. Reproduction in this information product for resale or other
commercial purposes is prohibited without written permission of the copyright
holders. Applications for such permission should be addressed to: Chief, Electronic
Publishing Policy and Support Branch Communication Division, FAO, Viale delle
Terme di Caracalla, 00153 Rome, Italy or by e-mail to: copyright@fao.org
FAO 2009
design&print: wps, eMail: printnepal@gmail.com";
}
}

View File

@@ -38,6 +38,7 @@
"UglyToad.PdfPig.Content.Letter",
"UglyToad.PdfPig.Content.Page",
"UglyToad.PdfPig.Content.PageSize",
"UglyToad.PdfPig.Content.Word",
"UglyToad.PdfPig.CrossReference.CrossReferenceTable",
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
@@ -54,7 +55,8 @@
"UglyToad.PdfPig.Tokens.NumericToken",
"UglyToad.PdfPig.Tokens.ObjectToken",
"UglyToad.PdfPig.Tokens.StreamToken",
"UglyToad.PdfPig.Tokens.StringToken"
"UglyToad.PdfPig.Tokens.StringToken",
"UglyToad.PdfPig.Util.IWordExtractor"
};
foreach (var publicTypeName in publicTypeNames)

View File

@@ -3,6 +3,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Util;
/// <summary>
/// Contains the content and provides access to methods of a single page in the <see cref="PdfDocument"/>.
@@ -73,5 +74,20 @@
return string.Join(string.Empty, content.Letters.Select(x => x.Value));
}
/// <summary>
/// Get the words for this page using the default <see cref="IWordExtractor"/>.
/// </summary>
/// <returns>The words on this page.</returns>
public IEnumerable<Word> GetWords()
{
    return GetWords(DefaultWordExtractor.Instance);
}
/// <summary>
/// Get the words for this page using the supplied <see cref="IWordExtractor"/>.
/// </summary>
/// <param name="wordExtractor">
/// The word extractor used to generate words. When <see langword="null"/> the default extractor is used instead.
/// </param>
/// <returns>The words on this page.</returns>
public IEnumerable<Word> GetWords(IWordExtractor wordExtractor)
{
    var extractor = wordExtractor ?? DefaultWordExtractor.Instance;

    return extractor.GetWords(Letters);
}
}
}

View File

@@ -0,0 +1,61 @@
namespace UglyToad.PdfPig.Content
{
using System;
using System.Collections.Generic;
using System.Linq;
using Geometry;
/// <summary>
/// A word, built from a non-empty run of letters.
/// </summary>
public class Word
{
    /// <summary>
    /// The text of the word: the values of its letters joined in order.
    /// </summary>
    public string Text { get; }

    /// <summary>
    /// The rectangle calculated from the extents of the letters in the word.
    /// </summary>
    public PdfRectangle BoundingBox { get; }

    /// <summary>
    /// The name of the font for the word, taken from its first letter.
    /// </summary>
    public string FontName { get; }

    /// <summary>
    /// Create a new <see cref="Word"/>.
    /// </summary>
    /// <param name="letters">The letters contained in the word, must contain at least one letter.</param>
    /// <exception cref="ArgumentNullException"><paramref name="letters"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="letters"/> is empty.</exception>
    public Word(IReadOnlyList<Letter> letters)
    {
        if (letters == null)
        {
            throw new ArgumentNullException(nameof(letters));
        }

        if (letters.Count == 0)
        {
            throw new ArgumentException("Empty letters provided.", nameof(letters));
        }

        Text = string.Concat(letters.Select(letter => letter.Value));

        // Seed the extents from the first letter then widen them with each remaining letter.
        var first = letters[0];
        var left = first.Origin.X;
        var bottom = first.Origin.Y;
        var right = first.Origin.X + first.Width;
        var top = first.GlyphRectangle.Top;

        for (var i = 1; i < letters.Count; i++)
        {
            var current = letters[i];

            if (current.Origin.X < left)
            {
                left = current.Origin.X;
            }

            if (current.Origin.Y < bottom)
            {
                bottom = current.Origin.Y;
            }

            var currentRight = current.Origin.X + current.Width;
            if (currentRight > right)
            {
                right = currentRight;
            }

            if (current.GlyphRectangle.Top > top)
            {
                top = current.GlyphRectangle.Top;
            }
        }

        BoundingBox = new PdfRectangle(left, bottom, right, top);
        FontName = first.FontName;
    }

    /// <inheritdoc />
    public override string ToString() => Text;
}
}

View File

@@ -0,0 +1,107 @@
namespace UglyToad.PdfPig.Util
{
using System;
using System.Collections.Generic;
using System.Linq;
using Content;
/// <summary>
/// The default <see cref="IWordExtractor"/>. Groups letters into words based on their
/// position, whitespace, font name and font size.
/// </summary>
internal class DefaultWordExtractor : IWordExtractor
{
    /// <summary>
    /// Generate words from the input set of letters.
    /// </summary>
    /// <remarks>
    /// Letters are visited in order of descending Y then ascending X. A new word is started when
    /// the next letter is more than 0.5 below the current line, lies to the left of the previous letter,
    /// follows a large gap, is whitespace, or differs in font name (with a gap) or font size.
    /// Whitespace letters are never included in a word. Evaluation is lazy.
    /// </remarks>
    /// <param name="letters">The letters to generate words for.</param>
    /// <returns>The words found, each containing at least one non-whitespace letter.</returns>
    public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters)
    {
        // Descending Y then ascending X; presumably this reads each line left to right from the
        // top of the page down, which holds when Y increases up the page.
        var lettersOrder = letters.OrderByDescending(x => x.Origin.Y)
            .ThenBy(x => x.Origin.X);

        var lettersSoFar = new List<Letter>(10);

        var y = default(decimal?);
        var lastX = default(decimal?);
        var lastLetter = default(Letter);

        foreach (var letter in lettersOrder)
        {
            if (lastLetter == null)
            {
                // Skip leading whitespace, nothing has been collected yet.
                if (string.IsNullOrWhiteSpace(letter.Value))
                {
                    continue;
                }

                // Take the reference position from the first letter actually kept, rather than from
                // a skipped whitespace letter which may sit on a different line.
                y = letter.Origin.Y;
                lastX = letter.Origin.X;

                lettersSoFar.Add(letter);
                lastLetter = letter;
                continue;
            }

            // The letters are sorted by descending Y so a new line is indicated by Y dropping
            // below the current line's Y by more than the tolerance; Y never increases here.
            if (letter.Origin.Y < y.Value - 0.5m)
            {
                if (lettersSoFar.Count > 0)
                {
                    yield return GenerateWord(lettersSoFar);
                    lettersSoFar.Clear();
                }

                if (!string.IsNullOrWhiteSpace(letter.Value))
                {
                    lettersSoFar.Add(letter);
                }

                y = letter.Origin.Y;
                lastX = letter.Origin.X;
                lastLetter = letter;

                continue;
            }

            // Horizontal distance between the end of the previous letter and the start of this one.
            var gap = letter.Origin.X - (lastLetter.Origin.X + lastLetter.Width);
            var nextToLeft = letter.Origin.X < lastX.Value - 1;
            var nextBigSpace = gap > Math.Max(lastLetter.Width, letter.Width) * 0.9m;
            var nextIsWhiteSpace = string.IsNullOrWhiteSpace(letter.Value);
            // A change of font only splits the word when there is also a small gap between the letters.
            var nextFontDiffers = !string.Equals(letter.FontName, lastLetter.FontName, StringComparison.OrdinalIgnoreCase) && gap > letter.Width * 0.1m;
            var nextFontSizeDiffers = Math.Abs(letter.FontSize - lastLetter.FontSize) > 0.1m;

            if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers)
            {
                if (lettersSoFar.Count > 0)
                {
                    yield return GenerateWord(lettersSoFar);
                    lettersSoFar.Clear();
                }
            }

            if (!string.IsNullOrWhiteSpace(letter.Value))
            {
                lettersSoFar.Add(letter);
            }

            lastLetter = letter;
            lastX = letter.Origin.X;
        }

        // Flush whatever remains after the final letter.
        if (lettersSoFar.Count > 0)
        {
            yield return GenerateWord(lettersSoFar);
        }
    }

    /// <summary>
    /// Create a <see cref="Word"/> from the letters collected so far.
    /// </summary>
    private static Word GenerateWord(List<Letter> letters)
    {
        return new Word(letters);
    }

    /// <summary>
    /// The shared instance of the default word extractor; the constructor is private so no others are created.
    /// </summary>
    public static IWordExtractor Instance { get; } = new DefaultWordExtractor();

    private DefaultWordExtractor()
    {
    }
}
}

View File

@@ -0,0 +1,18 @@
namespace UglyToad.PdfPig.Util
{
using System.Collections.Generic;
using Content;
/// <summary>
/// An approach used to generate words from a set of letters.
/// An implementation can be supplied to the page's GetWords method to replace the default approach.
/// </summary>
public interface IWordExtractor
{
/// <summary>
/// Generate words from the input set of letters.
/// </summary>
/// <param name="letters">The letters to generate words for.</param>
/// <returns>An enumerable of words from this approach.</returns>
IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters);
}
}