mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
Improve KdTree memory allocation using ArraySegments
This commit is contained in:
@@ -0,0 +1,58 @@
|
|||||||
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
    using System;
    using System.Collections.Generic;

    /// <summary>
    /// Useful <see cref="ArraySegment{T}"/> extensions.
    /// <para>
    /// Every method operates on a view over the segment's underlying array: no element is copied,
    /// and <see cref="Sort{T}(ArraySegment{T}, IComparer{T})"/> reorders the underlying array in place.
    /// </para>
    /// </summary>
    public static class ArraySegmentExtensions
    {
        /// <summary>
        /// Returns a specified number of contiguous elements from the start of a sequence.
        /// </summary>
        /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
        /// <param name="source">An <see cref="ArraySegment{T}"/> to return elements from.</param>
        /// <param name="count">The number of elements to return. Values below zero are treated as zero and
        /// values above the number of elements in <paramref name="source"/> are treated as that number, so the
        /// result never reaches outside of <paramref name="source"/>.</param>
        /// <returns>An <see cref="ArraySegment{T}"/> over the same underlying array that contains the specified
        /// number of elements from the start of the input sequence.</returns>
        /// <exception cref="ArgumentNullException">The underlying array of <paramref name="source"/> is null.</exception>
        public static ArraySegment<T> Take<T>(this ArraySegment<T> source, int count)
        {
            // Clamp to [0, source.Count]: without this, a count larger than the segment would
            // silently expose elements of the underlying array that lie after the segment.
            int length = Math.Max(0, Math.Min(count, source.Count));
            return new ArraySegment<T>(source.Array, source.Offset, length);
        }

        /// <summary>
        /// Bypasses a specified number of elements in a sequence and then returns the remaining elements.
        /// </summary>
        /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
        /// <param name="source">An <see cref="ArraySegment{T}"/> to return elements from.</param>
        /// <param name="count">The number of elements to skip before returning the remaining elements. Values
        /// below zero are treated as zero (nothing is skipped) and values above the number of elements in
        /// <paramref name="source"/> yield an empty segment.</param>
        /// <returns>An <see cref="ArraySegment{T}"/> over the same underlying array that contains the elements
        /// that occur after the specified index in the input sequence.</returns>
        /// <exception cref="ArgumentNullException">The underlying array of <paramref name="source"/> is null.</exception>
        public static ArraySegment<T> Skip<T>(this ArraySegment<T> source, int count)
        {
            // Clamp to [0, source.Count]: a negative count must not move the start before the
            // segment, and an oversized count must produce an empty segment instead of throwing.
            int skipped = Math.Max(0, Math.Min(count, source.Count));
            return new ArraySegment<T>(source.Array, source.Offset + skipped, source.Count - skipped);
        }

        /// <summary>
        /// Sorts the elements in a <see cref="ArraySegment{T}"/> using the specified <see cref="IComparer{T}"/>.
        /// <para>The sort is done in place on the underlying array; only the range covered by
        /// <paramref name="source"/> is reordered.</para>
        /// </summary>
        /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
        /// <param name="source">The <see cref="ArraySegment{T}"/> to sort.</param>
        /// <param name="comparer">The implementation to use when comparing elements.</param>
        public static void Sort<T>(this ArraySegment<T> source, IComparer<T> comparer)
        {
            Array.Sort(source.Array, source.Offset, source.Count, comparer);
        }

        /// <summary>
        /// Returns the element at a specified index in a sequence.
        /// </summary>
        /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
        /// <param name="source">The <see cref="ArraySegment{T}"/> to get the element from.</param>
        /// <param name="index">The index of the element to retrieve, relative to the start of
        /// <paramref name="source"/>. It is not checked against the number of elements in
        /// <paramref name="source"/>; only the bounds of the underlying array apply.</param>
        /// <returns>The element at the specified position in the <paramref name="source"/> sequence.</returns>
        public static T GetAt<T>(this ArraySegment<T> source, int index)
        {
            return source.Array[source.Offset + index];
        }
    }
}
|
||||||
@@ -54,6 +54,9 @@
|
|||||||
/// <typeparam name="T"></typeparam>
|
/// <typeparam name="T"></typeparam>
|
||||||
public class KdTree<T>
|
public class KdTree<T>
|
||||||
{
|
{
|
||||||
|
private readonly KdTreeComparerY kdTreeComparerY = new KdTreeComparerY();
|
||||||
|
private readonly KdTreeComparerX kdTreeComparerX = new KdTreeComparerX();
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// The root of the tree.
|
/// The root of the tree.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@@ -77,40 +80,49 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
Count = elements.Count;
|
Count = elements.Count;
|
||||||
Root = BuildTree(Enumerable.Range(0, elements.Count).Zip(elements, (e, p) => (e, elementsPointFunc(p), p)).ToArray(), 0);
|
|
||||||
|
KdTreeElement<T>[] array = new KdTreeElement<T>[Count];
|
||||||
|
|
||||||
|
for (int i = 0; i < Count; i++)
|
||||||
|
{
|
||||||
|
var el = elements[i];
|
||||||
|
array[i] = new KdTreeElement<T>(i, elementsPointFunc(el), el);
|
||||||
|
}
|
||||||
|
|
||||||
|
Root = BuildTree(new ArraySegment<KdTreeElement<T>>(array));
|
||||||
}
|
}
|
||||||
|
|
||||||
private KdTreeNode<T> BuildTree((int, PdfPoint, T)[] P, int depth)
|
private KdTreeNode<T> BuildTree(ArraySegment<KdTreeElement<T>> P, int depth = 0)
|
||||||
{
|
{
|
||||||
if (P.Length == 0)
|
if (P.Count == 0)
|
||||||
{
|
{
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
else if (P.Length == 1)
|
else if (P.Count == 1)
|
||||||
{
|
{
|
||||||
return new KdTreeLeaf<T>(P[0], depth);
|
return new KdTreeLeaf<T>(P.GetAt(0), depth);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (depth % 2 == 0)
|
if (depth % 2 == 0)
|
||||||
{
|
{
|
||||||
Array.Sort(P, (p0, p1) => p0.Item2.X.CompareTo(p1.Item2.X));
|
P.Sort(kdTreeComparerX);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
Array.Sort(P, (p0, p1) => p0.Item2.Y.CompareTo(p1.Item2.Y));
|
P.Sort(kdTreeComparerY);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (P.Length == 2)
|
if (P.Count == 2)
|
||||||
{
|
{
|
||||||
return new KdTreeNode<T>(new KdTreeLeaf<T>(P[0], depth + 1), null, P[1], depth);
|
return new KdTreeNode<T>(new KdTreeLeaf<T>(P.GetAt(0), depth + 1), null, P.GetAt(1), depth);
|
||||||
}
|
}
|
||||||
|
|
||||||
int median = P.Length / 2;
|
int median = P.Count / 2;
|
||||||
|
|
||||||
KdTreeNode<T> vLeft = BuildTree(P.Take(median).ToArray(), depth + 1);
|
KdTreeNode<T> vLeft = BuildTree(P.Take(median), depth + 1);
|
||||||
KdTreeNode<T> vRight = BuildTree(P.Skip(median + 1).ToArray(), depth + 1);
|
KdTreeNode<T> vRight = BuildTree(P.Skip(median + 1), depth + 1);
|
||||||
|
|
||||||
return new KdTreeNode<T>(vLeft, vRight, P[median], depth);
|
return new KdTreeNode<T>(vLeft, vRight, P.GetAt(median), depth);
|
||||||
}
|
}
|
||||||
|
|
||||||
#region NN
|
#region NN
|
||||||
@@ -216,7 +228,7 @@
|
|||||||
{
|
{
|
||||||
var kdTreeNodes = new KNearestNeighboursQueue(k);
|
var kdTreeNodes = new KNearestNeighboursQueue(k);
|
||||||
FindNearestNeighbours(Root, pivot, k, pivotPointFunc, distanceMeasure, kdTreeNodes);
|
FindNearestNeighbours(Root, pivot, k, pivotPointFunc, distanceMeasure, kdTreeNodes);
|
||||||
return kdTreeNodes.SelectMany(n => n.Value.Select(e => (e.Element, e.Index, n.Key))).ToList();
|
return kdTreeNodes.SelectMany(n => n.Value.Select(e => (e.Element, e.Index, n.Key))).ToArray();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static (KdTreeNode<T>, double) FindNearestNeighbours(KdTreeNode<T> node, T pivot, int k,
|
private static (KdTreeNode<T>, double) FindNearestNeighbours(KdTreeNode<T> node, T pivot, int k,
|
||||||
@@ -350,6 +362,38 @@
|
|||||||
}
|
}
|
||||||
#endregion
|
#endregion
|
||||||
|
|
||||||
|
/// <summary>
/// Immutable record of one input element of the tree: its position in the input list,
/// the point computed for it and the element itself.
/// </summary>
/// <typeparam name="R">The type of the wrapped element.</typeparam>
internal readonly struct KdTreeElement<R>
{
    /// <summary>
    /// The index given to the element when it was created.
    /// </summary>
    public int Index { get; }

    /// <summary>
    /// The point associated with the element; the tree comparers order elements by its coordinates.
    /// </summary>
    public PdfPoint Value { get; }

    /// <summary>
    /// The wrapped element.
    /// </summary>
    public R Element { get; }

    /// <summary>
    /// Creates a new element from its index, its point and the wrapped value.
    /// </summary>
    internal KdTreeElement(int index, PdfPoint point, R value)
        => (Index, Value, Element) = (index, point, value);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Orders <see cref="KdTreeElement{R}"/> values by the Y coordinate of their point.
/// </summary>
private sealed class KdTreeComparerY : IComparer<KdTreeElement<T>>
{
    /// <summary>
    /// Compares the Y coordinates of the two elements' points.
    /// </summary>
    public int Compare(KdTreeElement<T> p0, KdTreeElement<T> p1)
        => p0.Value.Y.CompareTo(p1.Value.Y);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Orders <see cref="KdTreeElement{R}"/> values by the X coordinate of their point.
/// </summary>
private sealed class KdTreeComparerX : IComparer<KdTreeElement<T>>
{
    /// <summary>
    /// Compares the X coordinates of the two elements' points.
    /// </summary>
    public int Compare(KdTreeElement<T> p0, KdTreeElement<T> p1)
        => p0.Value.X.CompareTo(p1.Value.X);
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// K-D tree leaf.
|
/// K-D tree leaf.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@@ -361,7 +405,7 @@
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public override bool IsLeaf => true;
|
public override bool IsLeaf => true;
|
||||||
|
|
||||||
internal KdTreeLeaf((int, PdfPoint, Q) point, int depth)
|
internal KdTreeLeaf(KdTreeElement<Q> point, int depth)
|
||||||
: base(null, null, point, depth)
|
: base(null, null, point, depth)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
@@ -423,15 +467,15 @@
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public int Index { get; }
|
public int Index { get; }
|
||||||
|
|
||||||
internal KdTreeNode(KdTreeNode<Q> leftChild, KdTreeNode<Q> rightChild, (int, PdfPoint, Q) point, int depth)
|
internal KdTreeNode(KdTreeNode<Q> leftChild, KdTreeNode<Q> rightChild, KdTreeElement<Q> point, int depth)
|
||||||
{
|
{
|
||||||
LeftChild = leftChild;
|
LeftChild = leftChild;
|
||||||
RightChild = rightChild;
|
RightChild = rightChild;
|
||||||
Value = point.Item2;
|
Value = point.Value;
|
||||||
Element = point.Item3;
|
Element = point.Element;
|
||||||
Depth = depth;
|
Depth = depth;
|
||||||
IsAxisCutX = depth % 2 == 0;
|
IsAxisCutX = depth % 2 == 0;
|
||||||
Index = point.Item1;
|
Index = point.Index;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -447,7 +491,11 @@
|
|||||||
|
|
||||||
private void RecursiveGetLeaves(KdTreeNode<Q> leaf, ref List<KdTreeLeaf<Q>> leaves)
|
private void RecursiveGetLeaves(KdTreeNode<Q> leaf, ref List<KdTreeLeaf<Q>> leaves)
|
||||||
{
|
{
|
||||||
if (leaf == null) return;
|
if (leaf == null)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (leaf is KdTreeLeaf<Q> lLeaf)
|
if (leaf is KdTreeLeaf<Q> lLeaf)
|
||||||
{
|
{
|
||||||
leaves.Add(lLeaf);
|
leaves.Add(lLeaf);
|
||||||
|
|||||||
186
src/UglyToad.PdfPig.Tests/Dla/ArraySegmentExtensionsTests.cs
Normal file
186
src/UglyToad.PdfPig.Tests/Dla/ArraySegmentExtensionsTests.cs
Normal file
@@ -0,0 +1,186 @@
|
|||||||
|
namespace UglyToad.PdfPig.Tests.Dla
{
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using UglyToad.PdfPig.DocumentLayoutAnalysis;
    using Xunit;

    /// <summary>
    /// Tests for the <see cref="ArraySegmentExtensions"/> methods (Take, Skip, Sort and GetAt).
    /// </summary>
    public class ArraySegmentExtensionsTests
    {
        [Fact]
        public void TakeGetAt()
        {
            ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
            Assert.Equal(10, array.Count);

            // Take first 5
            ArraySegment<int> first5 = array.Take(5);
            AssertSegment(first5, 0, 1, 2, 3, 4);

            // Take first 2 of first 5
            ArraySegment<int> first2OfFirst5 = first5.Take(2);
            AssertSegment(first2OfFirst5, 0, 1);
        }

        [Fact]
        public void SkipGetAt()
        {
            ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
            Assert.Equal(10, array.Count);

            // Skip first 5, i.e. keep the last 5
            ArraySegment<int> last5 = array.Skip(5);
            AssertSegment(last5, 5, 6, 7, 8, 9);

            // Skip first 2 of last 5
            ArraySegment<int> last3 = last5.Skip(2);
            AssertSegment(last3, 7, 8, 9);
        }

        [Fact]
        public void SkipTakeGetAt()
        {
            ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
            Assert.Equal(10, array.Count);

            // Skip first 5, i.e. keep the last 5
            ArraySegment<int> last5 = array.Skip(5);
            AssertSegment(last5, 5, 6, 7, 8, 9);

            // Take first 2 of last 5.
            // Only indexes 0 and 1 belong to this segment: index 2 is outside of it and must not be asserted.
            ArraySegment<int> first2OfLast5 = last5.Take(2);
            AssertSegment(first2OfLast5, 5, 6);
        }

        [Fact]
        public void TakeSkipGetAt()
        {
            ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
            Assert.Equal(10, array.Count);

            // Take first 5
            ArraySegment<int> first5 = array.Take(5);
            AssertSegment(first5, 0, 1, 2, 3, 4);

            // Skip first 2 of first 5
            ArraySegment<int> last3OfFirst5 = first5.Skip(2);
            AssertSegment(last3OfFirst5, 2, 3, 4);
        }

        [Fact]
        public void Sort()
        {
            IntInverseComparer intInverseComparer = new IntInverseComparer();
            IntComparer intComparer = new IntComparer();

            int[] originalArray = Enumerable.Range(0, 10).ToArray();

            ArraySegment<int> array = new ArraySegment<int>(originalArray);
            Assert.Equal(10, array.Count);

            // Sort the whole segment in descending order
            array.Sort(intInverseComparer);
            AssertSegment(array, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

            ArraySegment<int> skip1Take7 = array.Skip(1).Take(7);
            AssertSegment(skip1Take7, 8, 7, 6, 5, 4, 3, 2);

            // Sort only the inner segment in ascending order
            skip1Take7.Sort(intComparer);
            AssertSegment(skip1Take7, 2, 3, 4, 5, 6, 7, 8);

            // The outer segment and the original array share the same storage, so both see
            // the inner sort while the elements outside of the inner segment are untouched.
            AssertSegment(array, 9, 2, 3, 4, 5, 6, 7, 8, 1, 0);
            Assert.Equal(new[] { 9, 2, 3, 4, 5, 6, 7, 8, 1, 0 }, originalArray);
        }

        /// <summary>
        /// Asserts that the segment has exactly the expected elements, read one by one through GetAt.
        /// </summary>
        private static void AssertSegment(ArraySegment<int> segment, params int[] expected)
        {
            Assert.Equal(expected.Length, segment.Count);

            for (int i = 0; i < expected.Length; i++)
            {
                Assert.Equal(expected[i], segment.GetAt(i));
            }
        }

        /// <summary>
        /// Orders integers from largest to smallest.
        /// </summary>
        private class IntInverseComparer : IComparer<int>
        {
            public int Compare(int x, int y)
            {
                return -x.CompareTo(y);
            }
        }

        /// <summary>
        /// Orders integers from smallest to largest.
        /// </summary>
        private class IntComparer : IComparer<int>
        {
            public int Compare(int x, int y)
            {
                return x.CompareTo(y);
            }
        }
    }
}
|
||||||
BIN
src/UglyToad.PdfPig.Tests/Dla/Documents/2559 words.pdf
Normal file
BIN
src/UglyToad.PdfPig.Tests/Dla/Documents/2559 words.pdf
Normal file
Binary file not shown.
@@ -0,0 +1,25 @@
|
|||||||
|
namespace UglyToad.PdfPig.Tests.Dla
{
    using System.Linq;
    using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
    using Xunit;

    /// <summary>
    /// Tests for <see cref="NearestNeighbourWordExtractor"/>.
    /// </summary>
    public class NearestNeighbourWordExtractorTests
    {
        [Fact]
        public void Words2559Doc()
        {
            // The expected figure is the word count reported by Microsoft Word for this document: 2559.
            const int expectedWordCount = 2559;

            using (var document = PdfDocument.Open(DlaHelper.GetDocumentPath("2559 words.pdf")))
            {
                var firstPage = document.GetPage(1);
                var extractedWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);

                // Words made only of white space are not counted.
                int nonBlankWordCount = extractedWords.Count(word => !string.IsNullOrEmpty(word.Text.Trim()));

                Assert.Equal(expectedWordCount, nonBlankWordCount);
            }
        }
    }
}
|
||||||
@@ -114,6 +114,9 @@
|
|||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
<None Update="Dla\Documents\2559 words.pdf">
|
||||||
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
|
</None>
|
||||||
<None Update="Dla\Documents\90 180 270 rotated.pdf">
|
<None Update="Dla\Documents\90 180 270 rotated.pdf">
|
||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
</None>
|
</None>
|
||||||
|
|||||||
Reference in New Issue
Block a user