mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-08-20 04:31:25 +08:00
Performance improvements and .Net 9 support (#1116)
* Refactor letter handling by orientation for efficiency

  Improved the processing of letters based on their text orientation by preallocating separate lists for each orientation (horizontal, rotate270, rotate180, rotate90, and other). This change reduces multiple calls to `GetWords` and minimizes enumerations and allocations, enhancing performance and readability. Each letter is now added to the appropriate list in a single iteration over the `letters` collection.

* Update target frameworks to include net9.0

  Expanded compatibility in `UglyToad.PdfPig.csproj` by adding `net9.0` to the list of target frameworks, alongside existing versions.

* Add .NET 9.0 support and refactor key components

  Updated project files for UglyToad.PdfPig to target .NET 9.0, enhancing compatibility with the latest framework features. Refactored `GetBlocks` in `DocstrumBoundingBoxes.cs` for improved input handling and performance. Significantly optimized `NearestNeighbourWordExtractor.cs` by replacing multiple lists with an array of buckets and implementing parallel processing for better efficiency. Consistent updates across `Fonts`, `Tests`, `Tokenization`, and `Tokens` project files to include .NET 9.0 support.

* Improve null checks and optimize list handling

  - Updated null check for `words` in `DocstrumBoundingBoxes.cs` for better readability and performance.
  - Changed from `ToList()` to `ToArray()` to avoid unnecessary enumeration.
  - Added `results.TrimExcess()` in `NearestNeighbourWordExtractor.cs` to optimize memory usage.

---------

Co-authored-by: Chuck Beasley <CBeasley@kilpatricktownsend.com>
This commit is contained in:
parent
83d6fc6cc2
commit
1ed9e017f4
@ -1,6 +1,6 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
|
@ -48,12 +48,19 @@
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
|
||||
{
|
||||
if (words?.Any() != true)
|
||||
if (words is null)
|
||||
{
|
||||
return Array.Empty<TextBlock>();
|
||||
}
|
||||
|
||||
return GetBlocks(words.ToList(),
|
||||
// Avoid multiple enumeration and unnecessary ToArray() if already a list
|
||||
var wordList = words as IReadOnlyList<Word> ?? words.ToArray();
|
||||
if (wordList.Count == 0)
|
||||
{
|
||||
return Array.Empty<TextBlock>();
|
||||
}
|
||||
|
||||
return GetBlocks(wordList,
|
||||
options.WithinLineBounds, options.WithinLineMultiplier, options.WithinLineBinSize,
|
||||
options.BetweenLineBounds, options.BetweenLineMultiplier, options.BetweenLineBinSize,
|
||||
options.AngularDifferenceBounds,
|
||||
|
@ -1,6 +1,6 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
|
@ -51,34 +51,49 @@
|
||||
|
||||
if (options.GroupByOrientation)
|
||||
{
|
||||
// axis aligned
|
||||
List<Word> words = GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism);
|
||||
var buckets = new List<Letter>[5];
|
||||
for (int i = 0; i < buckets.Length; i++) buckets[i] = new List<Letter>();
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
foreach (var l in letters)
|
||||
{
|
||||
switch (l.TextOrientation)
|
||||
{
|
||||
case TextOrientation.Horizontal: buckets[0].Add(l); break;
|
||||
case TextOrientation.Rotate270: buckets[1].Add(l); break;
|
||||
case TextOrientation.Rotate180: buckets[2].Add(l); break;
|
||||
case TextOrientation.Rotate90: buckets[3].Add(l); break;
|
||||
default: buckets[4].Add(l); break;
|
||||
}
|
||||
}
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
// Shared result list: List<Word> is not thread-safe, so each worker's AddRange below is guarded by lock (results).
|
||||
var results = new List<Word>(letters.Count); // Pre-allocate for performance
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
// Limit parallelism to avoid oversubscription.
|
||||
var parallelOptions = new System.Threading.Tasks.ParallelOptions
|
||||
{
|
||||
MaxDegreeOfParallelism = options.MaxDegreeOfParallelism > 0 ? options.MaxDegreeOfParallelism : Environment.ProcessorCount
|
||||
};
|
||||
|
||||
// not axis aligned
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasure, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
|
||||
return words;
|
||||
// Use partitioner for better load balancing and avoid ConcurrentBag overhead
|
||||
System.Threading.Tasks.Parallel.ForEach(
|
||||
System.Collections.Concurrent.Partitioner.Create(0, buckets.Length),
|
||||
parallelOptions,
|
||||
range =>
|
||||
{
|
||||
for (int i = range.Item1; i < range.Item2; i++)
|
||||
{
|
||||
if (buckets[i].Count == 0) continue;
|
||||
var measure = (i == 4) ? options.DistanceMeasure : options.DistanceMeasureAA;
|
||||
var words = GetWords(buckets[i], options.MaximumDistance, measure, options.FilterPivot, options.Filter, options.MaxDegreeOfParallelism);
|
||||
lock (results)
|
||||
{
|
||||
results.AddRange(words);
|
||||
}
|
||||
}
|
||||
});
|
||||
results.TrimExcess();
|
||||
return results;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1,6 +1,6 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
|
@ -1,7 +1,7 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>net471;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>net471;net8.0;net9.0</TargetFrameworks>
|
||||
<IsTestProject>true</IsTestProject>
|
||||
<IsPackable>false</IsPackable>
|
||||
<DebugType>full</DebugType>
|
||||
|
@ -1,6 +1,6 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
|
@ -1,6 +1,6 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
|
@ -1,6 +1,6 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
|
Loading…
Reference in New Issue
Block a user