Improve KdTree memory allocation using ArraySegments

This commit is contained in:
BobLd 2023-06-26 21:01:33 +01:00
parent 76fc9808fc
commit 8a82500427
6 changed files with 340 additions and 20 deletions

View File

@ -0,0 +1,58 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
using System;
using System.Collections.Generic;
/// <summary>
/// Useful <see cref="ArraySegment{T}"/> extensions.
/// <para>
/// Every method operates on the array that backs the segment; no elements are copied.
/// </para>
/// </summary>
public static class ArraySegmentExtensions
{
    /// <summary>
    /// Returns a specified number of contiguous elements from the start of a segment.
    /// </summary>
    /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
    /// <param name="source">An <see cref="ArraySegment{T}"/> to return elements from.</param>
    /// <param name="count">The number of elements to return. Must be between 0 and the number of elements in <paramref name="source"/>.</param>
    /// <returns>An <see cref="ArraySegment{T}"/> over the same array that contains the first <paramref name="count"/> elements of <paramref name="source"/>.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative or greater than the number of elements in <paramref name="source"/>.</exception>
    public static ArraySegment<T> Take<T>(this ArraySegment<T> source, int count)
    {
        // Without this check a count larger than the segment (but still inside the
        // backing array) would silently expose elements that are not part of the source.
        if (count < 0 || count > source.Count)
        {
            throw new ArgumentOutOfRangeException(nameof(count), count, "The count must be between 0 and the number of elements in the segment.");
        }

        return new ArraySegment<T>(source.Array, source.Offset, count);
    }

    /// <summary>
    /// Bypasses a specified number of elements in a segment and then returns the remaining elements.
    /// </summary>
    /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
    /// <param name="source">An <see cref="ArraySegment{T}"/> to return elements from.</param>
    /// <param name="count">The number of elements to skip before returning the remaining elements. Must be between 0 and the number of elements in <paramref name="source"/>.</param>
    /// <returns>An <see cref="ArraySegment{T}"/> over the same array that contains the elements that occur after the first <paramref name="count"/> elements of <paramref name="source"/>.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative or greater than the number of elements in <paramref name="source"/>.</exception>
    public static ArraySegment<T> Skip<T>(this ArraySegment<T> source, int count)
    {
        // A negative count would otherwise move the start of the result in front of the source segment.
        if (count < 0 || count > source.Count)
        {
            throw new ArgumentOutOfRangeException(nameof(count), count, "The count must be between 0 and the number of elements in the segment.");
        }

        return new ArraySegment<T>(source.Array, source.Offset + count, source.Count - count);
    }

    /// <summary>
    /// Sorts, in place, the range of the underlying array covered by an <see cref="ArraySegment{T}"/> using the specified <see cref="IComparer{T}"/>.
    /// Elements of the underlying array that lie outside the segment are not moved.
    /// </summary>
    /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
    /// <param name="source">The <see cref="ArraySegment{T}"/> to sort.</param>
    /// <param name="comparer">The implementation to use when comparing elements.</param>
    public static void Sort<T>(this ArraySegment<T> source, IComparer<T> comparer)
    {
        Array.Sort(source.Array, source.Offset, source.Count, comparer);
    }

    /// <summary>
    /// Returns the element at a specified index in a segment.
    /// </summary>
    /// <remarks>
    /// The index is only added to the segment's offset; it is not validated against the number of elements in <paramref name="source"/>.
    /// </remarks>
    /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
    /// <param name="source">The <see cref="ArraySegment{T}"/> to get the element from.</param>
    /// <param name="index">The index of the element to retrieve, relative to the start of <paramref name="source"/>.</param>
    /// <returns>The element at the specified position in the <paramref name="source"/> segment.</returns>
    public static T GetAt<T>(this ArraySegment<T> source, int index)
    {
        return source.Array[source.Offset + index];
    }
}
}

View File

@ -54,6 +54,9 @@
/// <typeparam name="T"></typeparam>
public class KdTree<T>
{
private readonly KdTreeComparerY kdTreeComparerY = new KdTreeComparerY();
private readonly KdTreeComparerX kdTreeComparerX = new KdTreeComparerX();
/// <summary>
/// The root of the tree.
/// </summary>
@ -77,40 +80,49 @@
}
Count = elements.Count;
Root = BuildTree(Enumerable.Range(0, elements.Count).Zip(elements, (e, p) => (e, elementsPointFunc(p), p)).ToArray(), 0);
KdTreeElement<T>[] array = new KdTreeElement<T>[Count];
for (int i = 0; i < Count; i++)
{
var el = elements[i];
array[i] = new KdTreeElement<T>(i, elementsPointFunc(el), el);
}
Root = BuildTree(new ArraySegment<KdTreeElement<T>>(array));
}
// NOTE(review): this hunk shows each removed line directly followed by its replacement (no +/- markers).
/// <summary>
/// Recursively builds the sub-tree for the elements in <paramref name="P"/>.
/// Elements are ordered by X when <paramref name="depth"/> is even and by Y when it is odd;
/// the element at the median position becomes the node and the elements on either side of it
/// are used to build the left and right sub-trees at <c>depth + 1</c>.
/// </summary>
/// <param name="P">The elements to build the sub-tree from. The array-segment version is sorted in place.</param>
/// <param name="depth">The depth of the node being built.</param>
/// <returns>The node for <paramref name="P"/>, or <c>null</c> when <paramref name="P"/> is empty.</returns>
private KdTreeNode<T> BuildTree((int, PdfPoint, T)[] P, int depth)
private KdTreeNode<T> BuildTree(ArraySegment<KdTreeElement<T>> P, int depth = 0)
{
if (P.Length == 0)
if (P.Count == 0)
{
return null;
}
else if (P.Length == 1)
else if (P.Count == 1)
{
return new KdTreeLeaf<T>(P[0], depth);
return new KdTreeLeaf<T>(P.GetAt(0), depth);
}
// Alternate the ordering axis with the depth: X on even depths, Y on odd depths.
if (depth % 2 == 0)
{
Array.Sort(P, (p0, p1) => p0.Item2.X.CompareTo(p1.Item2.X));
P.Sort(kdTreeComparerX);
}
else
{
Array.Sort(P, (p0, p1) => p0.Item2.Y.CompareTo(p1.Item2.Y));
P.Sort(kdTreeComparerY);
}
// Two elements: the second one becomes the node, the first its left leaf; there is no right child.
if (P.Length == 2)
if (P.Count == 2)
{
return new KdTreeNode<T>(new KdTreeLeaf<T>(P[0], depth + 1), null, P[1], depth);
return new KdTreeNode<T>(new KdTreeLeaf<T>(P.GetAt(0), depth + 1), null, P.GetAt(1), depth);
}
int median = P.Length / 2;
int median = P.Count / 2;
// The element at the median index is kept for this node, so the right part starts at median + 1.
// The replaced lines copied each half with ToArray(); the replacement lines pass sub-segments instead.
KdTreeNode<T> vLeft = BuildTree(P.Take(median).ToArray(), depth + 1);
KdTreeNode<T> vRight = BuildTree(P.Skip(median + 1).ToArray(), depth + 1);
KdTreeNode<T> vLeft = BuildTree(P.Take(median), depth + 1);
KdTreeNode<T> vRight = BuildTree(P.Skip(median + 1), depth + 1);
return new KdTreeNode<T>(vLeft, vRight, P[median], depth);
return new KdTreeNode<T>(vLeft, vRight, P.GetAt(median), depth);
}
#region NN
@ -216,7 +228,7 @@
{
var kdTreeNodes = new KNearestNeighboursQueue(k);
FindNearestNeighbours(Root, pivot, k, pivotPointFunc, distanceMeasure, kdTreeNodes);
return kdTreeNodes.SelectMany(n => n.Value.Select(e => (e.Element, e.Index, n.Key))).ToList();
return kdTreeNodes.SelectMany(n => n.Value.Select(e => (e.Element, e.Index, n.Key))).ToArray();
}
private static (KdTreeNode<T>, double) FindNearestNeighbours(KdTreeNode<T> node, T pivot, int k,
@ -350,6 +362,38 @@
}
#endregion
/// <summary>
/// Immutable triple used while building the tree: an index, the point computed for an element,
/// and the element itself.
/// </summary>
/// <typeparam name="R">The type of the element.</typeparam>
internal readonly struct KdTreeElement<R>
{
    /// <summary>
    /// The index given to the element when it was created.
    /// </summary>
    public int Index { get; }

    /// <summary>
    /// The point associated with the element.
    /// </summary>
    public PdfPoint Value { get; }

    /// <summary>
    /// The element itself.
    /// </summary>
    public R Element { get; }

    /// <summary>
    /// Creates a new element.
    /// </summary>
    /// <param name="index">Stored in <see cref="Index"/>.</param>
    /// <param name="point">Stored in <see cref="Value"/>.</param>
    /// <param name="value">Stored in <see cref="Element"/>.</param>
    internal KdTreeElement(int index, PdfPoint point, R value)
    {
        Element = value;
        Value = point;
        Index = index;
    }
}
/// <summary>
/// Compares two <see cref="KdTreeElement{R}"/> by the Y coordinate of their point.
/// </summary>
private sealed class KdTreeComparerY : IComparer<KdTreeElement<T>>
{
    /// <inheritdoc/>
    public int Compare(KdTreeElement<T> p0, KdTreeElement<T> p1) => p0.Value.Y.CompareTo(p1.Value.Y);
}
/// <summary>
/// Compares two <see cref="KdTreeElement{R}"/> by the X coordinate of their point.
/// </summary>
private sealed class KdTreeComparerX : IComparer<KdTreeElement<T>>
{
    /// <inheritdoc/>
    public int Compare(KdTreeElement<T> p0, KdTreeElement<T> p1) => p0.Value.X.CompareTo(p1.Value.X);
}
/// <summary>
/// K-D tree leaf.
/// </summary>
@ -361,7 +405,7 @@
/// </summary>
public override bool IsLeaf => true;
// NOTE(review): the first signature line is the removed one, the second is its replacement.
/// <summary>
/// Creates a leaf for <paramref name="point"/> at the given <paramref name="depth"/>.
/// Both children passed to the base constructor are <c>null</c>.
/// </summary>
internal KdTreeLeaf((int, PdfPoint, Q) point, int depth)
internal KdTreeLeaf(KdTreeElement<Q> point, int depth)
: base(null, null, point, depth)
{ }
@ -423,15 +467,15 @@
/// </summary>
public int Index { get; }
// NOTE(review): this hunk shows each removed line directly followed by its replacement (no +/- markers).
/// <summary>
/// Creates a node holding <paramref name="point"/> with the given children.
/// </summary>
/// <param name="leftChild">Stored in <c>LeftChild</c>.</param>
/// <param name="rightChild">Stored in <c>RightChild</c>.</param>
/// <param name="point">Supplies the node's <c>Index</c>, <c>Value</c> and <c>Element</c>.</param>
/// <param name="depth">Stored in <c>Depth</c>; also determines <c>IsAxisCutX</c>.</param>
internal KdTreeNode(KdTreeNode<Q> leftChild, KdTreeNode<Q> rightChild, (int, PdfPoint, Q) point, int depth)
internal KdTreeNode(KdTreeNode<Q> leftChild, KdTreeNode<Q> rightChild, KdTreeElement<Q> point, int depth)
{
LeftChild = leftChild;
RightChild = rightChild;
// The positional tuple items (Item1 = index, Item2 = point, Item3 = element) are replaced by named members.
Value = point.Item2;
Element = point.Item3;
Value = point.Value;
Element = point.Element;
Depth = depth;
// Even depths are flagged as X-axis cuts.
IsAxisCutX = depth % 2 == 0;
Index = point.Item1;
Index = point.Index;
}
/// <summary>
@ -447,7 +491,11 @@
private void RecursiveGetLeaves(KdTreeNode<Q> leaf, ref List<KdTreeLeaf<Q>> leaves)
{
if (leaf == null) return;
if (leaf == null)
{
return;
}
if (leaf is KdTreeLeaf<Q> lLeaf)
{
leaves.Add(lLeaf);

View File

@ -0,0 +1,186 @@
namespace UglyToad.PdfPig.Tests.Dla
{
using System;
using System.Collections.Generic;
using System.Linq;
using UglyToad.PdfPig.DocumentLayoutAnalysis;
using Xunit;
/// <summary>
/// Tests for the <see cref="ArraySegmentExtensions"/> Take, Skip, Sort and GetAt methods.
/// </summary>
public class ArraySegmentExtensionsTests
{
    [Fact]
    public void TakeGetAt()
    {
        ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
        Assert.Equal(10, array.Count);

        // Take first 5
        ArraySegment<int> first5 = array.Take(5);
        AssertSegment(first5, 0, 1, 2, 3, 4);

        // Take first 2 of first 5
        ArraySegment<int> first2OfFirst5 = first5.Take(2);
        AssertSegment(first2OfFirst5, 0, 1);
    }

    [Fact]
    public void SkipGetAt()
    {
        ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
        Assert.Equal(10, array.Count);

        // Skip first 5, leaving the last 5
        ArraySegment<int> last5 = array.Skip(5);
        AssertSegment(last5, 5, 6, 7, 8, 9);

        // Skip first 2 of last 5
        ArraySegment<int> last3 = last5.Skip(2);
        AssertSegment(last3, 7, 8, 9);
    }

    [Fact]
    public void SkipTakeGetAt()
    {
        ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
        Assert.Equal(10, array.Count);

        // Skip first 5, leaving the last 5
        ArraySegment<int> last5 = array.Skip(5);
        AssertSegment(last5, 5, 6, 7, 8, 9);

        // Take first 2 of last 5.
        // Only indices 0 and 1 are asserted: index 2 is outside this 2-element segment.
        ArraySegment<int> first2OfLast5 = last5.Take(2);
        AssertSegment(first2OfLast5, 5, 6);
    }

    [Fact]
    public void TakeSkipGetAt()
    {
        ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
        Assert.Equal(10, array.Count);

        // Take first 5
        ArraySegment<int> first5 = array.Take(5);
        AssertSegment(first5, 0, 1, 2, 3, 4);

        // Skip first 2 of first 5
        ArraySegment<int> last3OfFirst5 = first5.Skip(2);
        AssertSegment(last3OfFirst5, 2, 3, 4);
    }

    [Fact]
    public void Sort()
    {
        IntInverseComparer intInverseComparer = new IntInverseComparer();
        IntComparer intComparer = new IntComparer();

        int[] originalArray = Enumerable.Range(0, 10).ToArray();
        ArraySegment<int> array = new ArraySegment<int>(originalArray);
        Assert.Equal(10, array.Count);

        // Sort the whole segment in descending order
        array.Sort(intInverseComparer);
        AssertSegment(array, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        ArraySegment<int> skip1Take7 = array.Skip(1).Take(7);
        AssertSegment(skip1Take7, 8, 7, 6, 5, 4, 3, 2);

        // Sort only the inner 7 elements in ascending order
        skip1Take7.Sort(intComparer);
        AssertSegment(skip1Take7, 2, 3, 4, 5, 6, 7, 8);

        // The outer segment and the backing array see the re-ordered inner range;
        // the elements outside that range (first one and last two) are untouched.
        AssertSegment(array, 9, 2, 3, 4, 5, 6, 7, 8, 1, 0);
        Assert.Equal(new[] { 9, 2, 3, 4, 5, 6, 7, 8, 1, 0 }, originalArray);
    }

    /// <summary>
    /// Asserts that <paramref name="segment"/> has exactly the <paramref name="expected"/> elements,
    /// reading each one through <c>GetAt</c>.
    /// </summary>
    private static void AssertSegment(ArraySegment<int> segment, params int[] expected)
    {
        Assert.Equal(expected.Length, segment.Count);
        for (int i = 0; i < expected.Length; i++)
        {
            Assert.Equal(expected[i], segment.GetAt(i));
        }
    }

    /// <summary>
    /// Orders integers from largest to smallest.
    /// </summary>
    private sealed class IntInverseComparer : IComparer<int>
    {
        public int Compare(int x, int y)
        {
            // Swapping the operands reverses the order without negating a comparison result.
            return y.CompareTo(x);
        }
    }

    /// <summary>
    /// Orders integers from smallest to largest.
    /// </summary>
    private sealed class IntComparer : IComparer<int>
    {
        public int Compare(int x, int y)
        {
            return x.CompareTo(y);
        }
    }
}
}

Binary file not shown.

View File

@ -0,0 +1,25 @@
namespace UglyToad.PdfPig.Tests.Dla
{
using System.Linq;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
using Xunit;
/// <summary>
/// Tests for <see cref="NearestNeighbourWordExtractor"/>.
/// </summary>
public class NearestNeighbourWordExtractorTests
{
    [Fact]
    public void Words2559Doc()
    {
        // Microsoft Word count of words = 2559
        const int expectedWordCount = 2559;

        var path = DlaHelper.GetDocumentPath("2559 words.pdf");
        using (var document = PdfDocument.Open(path))
        {
            var letters = document.GetPage(1).Letters;
            var extracted = NearestNeighbourWordExtractor.Instance.GetWords(letters).ToArray();

            // Words whose text is empty once trimmed are not counted.
            int nonBlankCount = extracted.Count(w => !string.IsNullOrEmpty(w.Text.Trim()));

            Assert.Equal(expectedWordCount, nonBlankCount);
        }
    }
}
}

View File

@ -114,6 +114,9 @@
</ItemGroup>
<ItemGroup>
<None Update="Dla\Documents\2559 words.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Dla\Documents\90 180 270 rotated.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>