mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-08-20 09:37:44 +08:00
Improve KdTree memory allocation using ArraySegments
This commit is contained in:
parent
76fc9808fc
commit
8a82500427
@ -0,0 +1,58 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// Useful <see cref="ArraySegment{T}"/> extensions.
/// </summary>
/// <remarks>
/// None of these helpers copies data. <c>Take</c> and <c>Skip</c> return new views over the array that
/// backs the source segment, and <c>Sort</c> reorders the elements of that backing array in place.
/// </remarks>
public static class ArraySegmentExtensions
{
    /// <summary>
    /// Returns a specified number of contiguous elements from the start of a segment.
    /// </summary>
    /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
    /// <param name="source">An <see cref="ArraySegment{T}"/> to return elements from.</param>
    /// <param name="count">The number of elements to return. Must be between 0 and the number of elements in <paramref name="source"/>.</param>
    /// <returns>An <see cref="ArraySegment{T}"/> over the same backing array that contains the first <paramref name="count"/> elements of <paramref name="source"/>.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative or greater than the number of elements in <paramref name="source"/>.</exception>
    public static ArraySegment<T> Take<T>(this ArraySegment<T> source, int count)
    {
        // Without this check a count larger than the segment (but still inside the backing array)
        // would silently produce a view that reaches past the end of the source segment.
        if (count < 0 || count > source.Count)
        {
            throw new ArgumentOutOfRangeException(nameof(count), count,
                "The number of elements to take must be between 0 and the number of elements in the segment.");
        }

        return new ArraySegment<T>(source.Array, source.Offset, count);
    }

    /// <summary>
    /// Bypasses a specified number of elements in a segment and then returns the remaining elements.
    /// </summary>
    /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
    /// <param name="source">An <see cref="ArraySegment{T}"/> to return elements from.</param>
    /// <param name="count">The number of elements to skip before returning the remaining elements. Must be between 0 and the number of elements in <paramref name="source"/>.</param>
    /// <returns>An <see cref="ArraySegment{T}"/> over the same backing array that contains the elements of <paramref name="source"/> located after the first <paramref name="count"/> elements.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative or greater than the number of elements in <paramref name="source"/>.</exception>
    public static ArraySegment<T> Skip<T>(this ArraySegment<T> source, int count)
    {
        // Without this check a negative count could move the offset backwards and produce a view
        // that starts before the source segment.
        if (count < 0 || count > source.Count)
        {
            throw new ArgumentOutOfRangeException(nameof(count), count,
                "The number of elements to skip must be between 0 and the number of elements in the segment.");
        }

        return new ArraySegment<T>(source.Array, source.Offset + count, source.Count - count);
    }

    /// <summary>
    /// Sorts the elements in a <see cref="ArraySegment{T}"/> using the specified <see cref="IComparer{T}"/>.
    /// </summary>
    /// <remarks>
    /// The sort is done in place on the backing array, restricted to the range covered by
    /// <paramref name="source"/>; every other segment that views the same range observes the new order.
    /// </remarks>
    /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
    /// <param name="source">The <see cref="ArraySegment{T}"/> to sort.</param>
    /// <param name="comparer">The implementation to use when comparing elements.</param>
    public static void Sort<T>(this ArraySegment<T> source, IComparer<T> comparer)
    {
        Array.Sort(source.Array, source.Offset, source.Count, comparer);
    }

    /// <summary>
    /// Returns the element at a specified index in a segment.
    /// </summary>
    /// <remarks>
    /// <paramref name="index"/> is added to the segment's offset and used to index the backing array directly.
    /// It is not validated against the number of elements in <paramref name="source"/>, so an index outside the
    /// segment that still falls inside the backing array returns that array element.
    /// </remarks>
    /// <typeparam name="T">The type of the elements of <paramref name="source"/>.</typeparam>
    /// <param name="source">The <see cref="ArraySegment{T}"/> to get the element from.</param>
    /// <param name="index">The zero-based index of the element to retrieve, relative to the start of <paramref name="source"/>.</param>
    /// <returns>The element at the specified position in the <paramref name="source"/> segment.</returns>
    public static T GetAt<T>(this ArraySegment<T> source, int index)
    {
        return source.Array[source.Offset + index];
    }
}
|
||||
}
|
||||
@ -54,6 +54,9 @@
|
||||
/// <typeparam name="T"></typeparam>
|
||||
public class KdTree<T>
|
||||
{
|
||||
private readonly KdTreeComparerY kdTreeComparerY = new KdTreeComparerY();
|
||||
private readonly KdTreeComparerX kdTreeComparerX = new KdTreeComparerX();
|
||||
|
||||
/// <summary>
|
||||
/// The root of the tree.
|
||||
/// </summary>
|
||||
@ -77,40 +80,49 @@
|
||||
}
|
||||
|
||||
Count = elements.Count;
|
||||
Root = BuildTree(Enumerable.Range(0, elements.Count).Zip(elements, (e, p) => (e, elementsPointFunc(p), p)).ToArray(), 0);
|
||||
|
||||
KdTreeElement<T>[] array = new KdTreeElement<T>[Count];
|
||||
|
||||
for (int i = 0; i < Count; i++)
|
||||
{
|
||||
var el = elements[i];
|
||||
array[i] = new KdTreeElement<T>(i, elementsPointFunc(el), el);
|
||||
}
|
||||
|
||||
Root = BuildTree(new ArraySegment<KdTreeElement<T>>(array));
|
||||
}
|
||||
|
||||
private KdTreeNode<T> BuildTree((int, PdfPoint, T)[] P, int depth)
|
||||
private KdTreeNode<T> BuildTree(ArraySegment<KdTreeElement<T>> P, int depth = 0)
|
||||
{
|
||||
if (P.Length == 0)
|
||||
if (P.Count == 0)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
else if (P.Length == 1)
|
||||
else if (P.Count == 1)
|
||||
{
|
||||
return new KdTreeLeaf<T>(P[0], depth);
|
||||
return new KdTreeLeaf<T>(P.GetAt(0), depth);
|
||||
}
|
||||
|
||||
if (depth % 2 == 0)
|
||||
{
|
||||
Array.Sort(P, (p0, p1) => p0.Item2.X.CompareTo(p1.Item2.X));
|
||||
P.Sort(kdTreeComparerX);
|
||||
}
|
||||
else
|
||||
{
|
||||
Array.Sort(P, (p0, p1) => p0.Item2.Y.CompareTo(p1.Item2.Y));
|
||||
P.Sort(kdTreeComparerY);
|
||||
}
|
||||
|
||||
if (P.Length == 2)
|
||||
if (P.Count == 2)
|
||||
{
|
||||
return new KdTreeNode<T>(new KdTreeLeaf<T>(P[0], depth + 1), null, P[1], depth);
|
||||
return new KdTreeNode<T>(new KdTreeLeaf<T>(P.GetAt(0), depth + 1), null, P.GetAt(1), depth);
|
||||
}
|
||||
|
||||
int median = P.Length / 2;
|
||||
int median = P.Count / 2;
|
||||
|
||||
KdTreeNode<T> vLeft = BuildTree(P.Take(median).ToArray(), depth + 1);
|
||||
KdTreeNode<T> vRight = BuildTree(P.Skip(median + 1).ToArray(), depth + 1);
|
||||
KdTreeNode<T> vLeft = BuildTree(P.Take(median), depth + 1);
|
||||
KdTreeNode<T> vRight = BuildTree(P.Skip(median + 1), depth + 1);
|
||||
|
||||
return new KdTreeNode<T>(vLeft, vRight, P[median], depth);
|
||||
return new KdTreeNode<T>(vLeft, vRight, P.GetAt(median), depth);
|
||||
}
|
||||
|
||||
#region NN
|
||||
@ -216,7 +228,7 @@
|
||||
{
|
||||
var kdTreeNodes = new KNearestNeighboursQueue(k);
|
||||
FindNearestNeighbours(Root, pivot, k, pivotPointFunc, distanceMeasure, kdTreeNodes);
|
||||
return kdTreeNodes.SelectMany(n => n.Value.Select(e => (e.Element, e.Index, n.Key))).ToList();
|
||||
return kdTreeNodes.SelectMany(n => n.Value.Select(e => (e.Element, e.Index, n.Key))).ToArray();
|
||||
}
|
||||
|
||||
private static (KdTreeNode<T>, double) FindNearestNeighbours(KdTreeNode<T> node, T pivot, int k,
|
||||
@ -350,6 +362,38 @@
|
||||
}
|
||||
#endregion
|
||||
|
||||
/// <summary>
/// Immutable value bundling an element with the point computed for it and the index it was given
/// when the tree was built.
/// </summary>
/// <typeparam name="R">The type of the wrapped element.</typeparam>
internal readonly struct KdTreeElement<R>
{
    /// <summary>
    /// The index assigned to the element.
    /// </summary>
    public int Index { get; }

    /// <summary>
    /// The point associated with the element.
    /// </summary>
    public PdfPoint Value { get; }

    /// <summary>
    /// The wrapped element.
    /// </summary>
    public R Element { get; }

    /// <summary>
    /// Creates a new <see cref="KdTreeElement{R}"/>.
    /// </summary>
    /// <param name="index">The index assigned to the element.</param>
    /// <param name="point">The point associated with the element.</param>
    /// <param name="value">The element itself.</param>
    internal KdTreeElement(int index, PdfPoint point, R value)
    {
        Element = value;
        Value = point;
        Index = index;
    }
}
|
||||
|
||||
/// <summary>
/// Orders <see cref="KdTreeElement{R}"/> values by the Y coordinate of their point.
/// </summary>
private sealed class KdTreeComparerY : IComparer<KdTreeElement<T>>
{
    /// <summary>
    /// Compares the Y coordinates of the points of <paramref name="p0"/> and <paramref name="p1"/>.
    /// </summary>
    public int Compare(KdTreeElement<T> p0, KdTreeElement<T> p1)
        => p0.Value.Y.CompareTo(p1.Value.Y);
}
|
||||
|
||||
/// <summary>
/// Orders <see cref="KdTreeElement{R}"/> values by the X coordinate of their point.
/// </summary>
private sealed class KdTreeComparerX : IComparer<KdTreeElement<T>>
{
    /// <summary>
    /// Compares the X coordinates of the points of <paramref name="p0"/> and <paramref name="p1"/>.
    /// </summary>
    public int Compare(KdTreeElement<T> p0, KdTreeElement<T> p1)
        => p0.Value.X.CompareTo(p1.Value.X);
}
|
||||
|
||||
/// <summary>
|
||||
/// K-D tree leaf.
|
||||
/// </summary>
|
||||
@ -361,7 +405,7 @@
|
||||
/// </summary>
|
||||
public override bool IsLeaf => true;
|
||||
|
||||
internal KdTreeLeaf((int, PdfPoint, Q) point, int depth)
|
||||
internal KdTreeLeaf(KdTreeElement<Q> point, int depth)
|
||||
: base(null, null, point, depth)
|
||||
{ }
|
||||
|
||||
@ -423,15 +467,15 @@
|
||||
/// </summary>
|
||||
public int Index { get; }
|
||||
|
||||
internal KdTreeNode(KdTreeNode<Q> leftChild, KdTreeNode<Q> rightChild, (int, PdfPoint, Q) point, int depth)
|
||||
internal KdTreeNode(KdTreeNode<Q> leftChild, KdTreeNode<Q> rightChild, KdTreeElement<Q> point, int depth)
|
||||
{
|
||||
LeftChild = leftChild;
|
||||
RightChild = rightChild;
|
||||
Value = point.Item2;
|
||||
Element = point.Item3;
|
||||
Value = point.Value;
|
||||
Element = point.Element;
|
||||
Depth = depth;
|
||||
IsAxisCutX = depth % 2 == 0;
|
||||
Index = point.Item1;
|
||||
Index = point.Index;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@ -447,7 +491,11 @@
|
||||
|
||||
private void RecursiveGetLeaves(KdTreeNode<Q> leaf, ref List<KdTreeLeaf<Q>> leaves)
|
||||
{
|
||||
if (leaf == null) return;
|
||||
if (leaf == null)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (leaf is KdTreeLeaf<Q> lLeaf)
|
||||
{
|
||||
leaves.Add(lLeaf);
|
||||
|
||||
186
src/UglyToad.PdfPig.Tests/Dla/ArraySegmentExtensionsTests.cs
Normal file
186
src/UglyToad.PdfPig.Tests/Dla/ArraySegmentExtensionsTests.cs
Normal file
@ -0,0 +1,186 @@
|
||||
namespace UglyToad.PdfPig.Tests.Dla
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis;
|
||||
using Xunit;
|
||||
|
||||
/// <summary>
/// Tests for <see cref="ArraySegmentExtensions"/>: slicing with Take/Skip, indexed access with GetAt
/// and in-place sorting of a segment.
/// </summary>
public class ArraySegmentExtensionsTests
{
    [Fact]
    public void TakeGetAt()
    {
        ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
        Assert.Equal(10, array.Count);

        // Take first 5
        ArraySegment<int> firstFive = array.Take(5);
        AssertSegment(firstFive, 0, 1, 2, 3, 4);

        // Take first 2 of first 5
        ArraySegment<int> firstTwoOfFirstFive = firstFive.Take(2);
        AssertSegment(firstTwoOfFirstFive, 0, 1);
    }

    [Fact]
    public void SkipGetAt()
    {
        ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
        Assert.Equal(10, array.Count);

        // Skip first 5, leaving the last 5
        ArraySegment<int> lastFive = array.Skip(5);
        AssertSegment(lastFive, 5, 6, 7, 8, 9);

        // Skip first 2 of last 5, leaving the last 3
        ArraySegment<int> lastThree = lastFive.Skip(2);
        AssertSegment(lastThree, 7, 8, 9);
    }

    [Fact]
    public void SkipTakeGetAt()
    {
        ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
        Assert.Equal(10, array.Count);

        // Skip first 5, leaving the last 5
        ArraySegment<int> lastFive = array.Skip(5);
        AssertSegment(lastFive, 5, 6, 7, 8, 9);

        // Take first 2 of last 5. Only indices 0 and 1 belong to this segment, so nothing
        // beyond them is asserted.
        ArraySegment<int> firstTwoOfLastFive = lastFive.Take(2);
        AssertSegment(firstTwoOfLastFive, 5, 6);
    }

    [Fact]
    public void TakeSkipGetAt()
    {
        ArraySegment<int> array = new ArraySegment<int>(Enumerable.Range(0, 10).ToArray());
        Assert.Equal(10, array.Count);

        // Take first 5
        ArraySegment<int> firstFive = array.Take(5);
        AssertSegment(firstFive, 0, 1, 2, 3, 4);

        // Skip first 2 of first 5, leaving 3
        ArraySegment<int> lastThreeOfFirstFive = firstFive.Skip(2);
        AssertSegment(lastThreeOfFirstFive, 2, 3, 4);
    }

    [Fact]
    public void Sort()
    {
        IntInverseComparer intInverseComparer = new IntInverseComparer();
        IntComparer intComparer = new IntComparer();

        int[] originalArray = Enumerable.Range(0, 10).ToArray();

        ArraySegment<int> array = new ArraySegment<int>(originalArray);
        Assert.Equal(10, array.Count);

        // Sorting the whole segment in descending order
        array.Sort(intInverseComparer);
        AssertSegment(array, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        ArraySegment<int> skip1Take7 = array.Skip(1).Take(7);
        AssertSegment(skip1Take7, 8, 7, 6, 5, 4, 3, 2);

        // Sorting the sub-segment in ascending order must only reorder its own range
        skip1Take7.Sort(intComparer);
        AssertSegment(skip1Take7, 2, 3, 4, 5, 6, 7, 8);

        // The enclosing segment and the backing array both observe the in-place sort
        AssertSegment(array, 9, 2, 3, 4, 5, 6, 7, 8, 1, 0);
        Assert.Equal(new[] { 9, 2, 3, 4, 5, 6, 7, 8, 1, 0 }, originalArray);
    }

    /// <summary>
    /// Asserts that <paramref name="segment"/> has exactly the expected number of elements and that
    /// GetAt returns the expected value at every index of the segment.
    /// </summary>
    private static void AssertSegment(ArraySegment<int> segment, params int[] expected)
    {
        Assert.Equal(expected.Length, segment.Count);

        for (int i = 0; i < expected.Length; i++)
        {
            Assert.Equal(expected[i], segment.GetAt(i));
        }
    }

    /// <summary>
    /// Compares integers in descending order.
    /// </summary>
    private class IntInverseComparer : IComparer<int>
    {
        public int Compare(int x, int y)
        {
            return -x.CompareTo(y);
        }
    }

    /// <summary>
    /// Compares integers in ascending order.
    /// </summary>
    private class IntComparer : IComparer<int>
    {
        public int Compare(int x, int y)
        {
            return x.CompareTo(y);
        }
    }
}
|
||||
}
|
||||
BIN
src/UglyToad.PdfPig.Tests/Dla/Documents/2559 words.pdf
Normal file
BIN
src/UglyToad.PdfPig.Tests/Dla/Documents/2559 words.pdf
Normal file
Binary file not shown.
@ -0,0 +1,25 @@
|
||||
namespace UglyToad.PdfPig.Tests.Dla
|
||||
{
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
||||
using Xunit;
|
||||
|
||||
/// <summary>
/// Checks the nearest neighbour word extractor against a document with a known word count.
/// </summary>
public class NearestNeighbourWordExtractorTests
{
    [Fact]
    public void Words2559Doc()
    {
        // Microsoft Word count of words = 2559
        const int expectedWordCount = 2559;

        var path = DlaHelper.GetDocumentPath("2559 words.pdf");

        using (var pdf = PdfDocument.Open(path))
        {
            var firstPage = pdf.GetPage(1);
            var extracted = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters).ToArray();

            // Words that are empty once trimmed are not counted.
            int nonBlankCount = extracted.Count(w => !string.IsNullOrEmpty(w.Text.Trim()));

            Assert.Equal(expectedWordCount, nonBlankCount);
        }
    }
}
|
||||
}
|
||||
@ -114,6 +114,9 @@
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<None Update="Dla\Documents\2559 words.pdf">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="Dla\Documents\90 180 270 rotated.pdf">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
|
||||
Loading…
Reference in New Issue
Block a user