Performance improvements and .NET 9 support (#1116)

* Refactor letter handling by orientation for efficiency

Improved the processing of letters based on their text orientation by preallocating separate lists for each orientation (horizontal, rotate270, rotate180, rotate90, and other). This change reduces multiple calls to `GetWords` and minimizes enumerations and allocations, enhancing performance and readability. Each letter is now added to the appropriate list in a single iteration over the `letters` collection.

* Update target frameworks to include net9.0

Expanded compatibility in `UglyToad.PdfPig.csproj` by adding
`net9.0` to the list of target frameworks, alongside existing
versions.

* Add .NET 9.0 support and refactor key components

Updated project files for UglyToad.PdfPig to target .NET 9.0, enhancing compatibility with the latest framework features.

Refactored `GetBlocks` in `DocstrumBoundingBoxes.cs` for improved input handling and performance.

Significantly optimized `NearestNeighbourWordExtractor.cs` by replacing multiple lists with an array of buckets and implementing parallel processing for better efficiency.

Consistent updates across `Fonts`, `Tests`, `Tokenization`, and `Tokens` project files to include .NET 9.0 support.

* Improve null checks and optimize list handling

- Updated null check for `words` in `DocstrumBoundingBoxes.cs` for better readability and performance.
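- Replaced the unconditional `ToList()` with a cast to `IReadOnlyList<Word>` that falls back to `ToArray()`, so input that is already a list is not copied and the sequence is enumerated at most once.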
- Added `results.TrimExcess()` in `NearestNeighbourWordExtractor.cs` to optimize memory usage.

---------

Co-authored-by: Chuck Beasley <CBeasley@kilpatricktownsend.com>
This commit is contained in:
Chuck B. 2025-08-01 17:24:16 -04:00 committed by GitHub
parent 83d6fc6cc2
commit 1ed9e017f4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 55 additions and 33 deletions

View File

@ -1,6 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>

View File

@ -48,12 +48,19 @@
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{
if (words?.Any() != true)
if (words is null)
{
return Array.Empty<TextBlock>();
}
return GetBlocks(words.ToList(),
// Avoid multiple enumeration and unnecessary ToArray() if already a list
var wordList = words as IReadOnlyList<Word> ?? words.ToArray();
if (wordList.Count == 0)
{
return Array.Empty<TextBlock>();
}
return GetBlocks(wordList,
options.WithinLineBounds, options.WithinLineMultiplier, options.WithinLineBinSize,
options.BetweenLineBounds, options.BetweenLineMultiplier, options.BetweenLineBinSize,
options.AngularDifferenceBounds,

View File

@ -1,6 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>

View File

@ -51,34 +51,49 @@
if (options.GroupByOrientation)
{
// axis aligned
List<Word> words = GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism);
var buckets = new List<Letter>[5];
for (int i = 0; i < buckets.Length; i++) buckets[i] = new List<Letter>();
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
foreach (var l in letters)
{
switch (l.TextOrientation)
{
case TextOrientation.Horizontal: buckets[0].Add(l); break;
case TextOrientation.Rotate270: buckets[1].Add(l); break;
case TextOrientation.Rotate180: buckets[2].Add(l); break;
case TextOrientation.Rotate90: buckets[3].Add(l); break;
default: buckets[4].Add(l); break;
}
}
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
// Shared result list; List<T> is not thread-safe, so the parallel writers below add to it under a lock.
var results = new List<Word>(letters.Count); // Pre-allocate for performance
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
// Limit parallelism to avoid oversubscription.
var parallelOptions = new System.Threading.Tasks.ParallelOptions
{
MaxDegreeOfParallelism = options.MaxDegreeOfParallelism > 0 ? options.MaxDegreeOfParallelism : Environment.ProcessorCount
};
// not axis aligned
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
options.MaximumDistance, options.DistanceMeasure, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
return words;
// Use partitioner for better load balancing and avoid ConcurrentBag overhead
System.Threading.Tasks.Parallel.ForEach(
System.Collections.Concurrent.Partitioner.Create(0, buckets.Length),
parallelOptions,
range =>
{
for (int i = range.Item1; i < range.Item2; i++)
{
if (buckets[i].Count == 0) continue;
var measure = (i == 4) ? options.DistanceMeasure : options.DistanceMeasureAA;
var words = GetWords(buckets[i], options.MaximumDistance, measure, options.FilterPivot, options.Filter, options.MaxDegreeOfParallelism);
lock (results)
{
results.AddRange(words);
}
}
});
results.TrimExcess();
return results;
}
else
{

View File

@ -1,6 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>

View File

@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>net471;net8.0</TargetFrameworks>
<TargetFrameworks>net471;net8.0;net9.0</TargetFrameworks>
<IsTestProject>true</IsTestProject>
<IsPackable>false</IsPackable>
<DebugType>full</DebugType>

View File

@ -1,6 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>

View File

@ -1,6 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>

View File

@ -1,6 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>