mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-08-20 09:00:07 +08:00
Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3650e27432 | ||
|
|
a43b968ea9 | ||
|
|
1031dcc221 | ||
|
|
0f641774e6 | ||
|
|
a3edc926c8 | ||
|
|
f1923fcbcd | ||
|
|
7ff58893af | ||
|
|
bee6f13888 | ||
|
|
e6dd2d15c2 | ||
|
|
7dd5d68be3 | ||
|
|
bdf3b8e2b4 | ||
|
|
c8dff885bd | ||
|
|
0b228c57b7 | ||
|
|
ef21227b3c | ||
|
|
b9f2230a0a | ||
|
|
b6950a5fb0 | ||
|
|
1ed9e017f4 | ||
|
|
83d6fc6cc2 | ||
|
|
febfa4d4b3 | ||
|
|
0ebbe0540d | ||
|
|
52c0635273 | ||
|
|
b6bd0a3169 |
5
.github/workflows/build_and_test_macos.yml
vendored
5
.github/workflows/build_and_test_macos.yml
vendored
@ -19,10 +19,11 @@ jobs:
|
||||
2.1.x
|
||||
6.0.x
|
||||
8.0.x
|
||||
9.0.x
|
||||
|
||||
# Build the release build
|
||||
- name: Build the solution
|
||||
run: dotnet build -c Release src/UglyToad.PdfPig.sln
|
||||
run: dotnet build -c Release src/UglyToad.PdfPig.sln -f net8.0
|
||||
|
||||
- name: Run the tests
|
||||
run: dotnet test -c Release src/UglyToad.PdfPig.sln
|
||||
run: dotnet test -c Release src/UglyToad.PdfPig.sln -f net8.0
|
||||
|
||||
67
.github/workflows/nightly_release.yml
vendored
67
.github/workflows/nightly_release.yml
vendored
@ -5,39 +5,55 @@ on:
|
||||
- cron: "0 0 * * *"
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
tests:
|
||||
uses: ./.github/workflows/run_integration_tests.yml
|
||||
check_date:
|
||||
runs-on: ubuntu-latest
|
||||
name: Check latest commit
|
||||
outputs:
|
||||
should_run: ${{ steps.should_run.outputs.should_run }}
|
||||
steps:
|
||||
- uses: actions/checkout@master
|
||||
- name: print latest_commit
|
||||
run: echo ${{ github.sha }}
|
||||
permissions:
|
||||
contents: write # Grant write permission for tagging
|
||||
|
||||
jobs:
|
||||
check_publish_needed:
|
||||
runs-on: ubuntu-latest
|
||||
name: Check if this commit has already been published
|
||||
outputs:
|
||||
should_run: ${{ steps.check.outputs.should_run }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Fetch tags
|
||||
run: git fetch --tags
|
||||
|
||||
- id: check
|
||||
run: |
|
||||
latest_commit=$(git rev-parse nightly-latest || echo "")
|
||||
echo "Latest published commit: $latest_commit"
|
||||
if [ "$latest_commit" = "${{ github.sha }}" ]; then
|
||||
echo "No new commit since last publish."
|
||||
echo "should_run=false" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "New commit detected."
|
||||
echo "should_run=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
tests:
|
||||
needs: check_publish_needed
|
||||
if: ${{ needs.check_publish_needed.outputs.should_run == 'true' }}
|
||||
uses: ./.github/workflows/run_integration_tests.yml
|
||||
|
||||
- id: should_run
|
||||
continue-on-error: true
|
||||
name: check latest commit is less than a day ago
|
||||
if: ${{ github.event_name == 'schedule' }}
|
||||
run: test -z $(git rev-list --after="24 hours" ${{ github.sha }}) && echo "::set-output name=should_run::false"
|
||||
build_and_publish_nightly:
|
||||
needs: [check_date, tests]
|
||||
if: ${{ needs.check_date.outputs.should_run != 'false' }}
|
||||
needs: [check_publish_needed, tests]
|
||||
if: ${{ needs.check_publish_needed.outputs.should_run == 'true' }}
|
||||
runs-on: windows-2022
|
||||
name: build_and_publish_nightly
|
||||
steps:
|
||||
- uses: actions/checkout@master
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Set up dotnet core
|
||||
uses: actions/setup-dotnet@v2
|
||||
uses: actions/setup-dotnet@v4
|
||||
with:
|
||||
dotnet-version: |
|
||||
2.1.x
|
||||
6.0.x
|
||||
8.0.x
|
||||
9.0.x
|
||||
|
||||
- name: Add msbuild to PATH
|
||||
uses: microsoft/setup-msbuild@v1.0.2
|
||||
@ -54,5 +70,10 @@ jobs:
|
||||
|
||||
- name: Publish Nuget to GitHub registry
|
||||
run: dotnet nuget push **/*.nupkg --api-key ${{secrets.NUGET_API_KEY}} --source https://api.nuget.org/v3/index.json
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Tag latest nightly commit
|
||||
run: |
|
||||
git config user.name "github-actions"
|
||||
git config user.email "github-actions@github.com"
|
||||
git tag -f nightly-latest ${{ github.sha }}
|
||||
git push origin nightly-latest --force
|
||||
|
||||
4
.github/workflows/run_common_crawl_tests.yml
vendored
4
.github/workflows/run_common_crawl_tests.yml
vendored
@ -15,7 +15,9 @@ jobs:
|
||||
- name: Set up dotnet core
|
||||
uses: actions/setup-dotnet@v3
|
||||
with:
|
||||
dotnet-version: "8.0.x"
|
||||
dotnet-version: |
|
||||
8.0.x
|
||||
9.0.x
|
||||
|
||||
- name: Restore corpus cache 0000, 0001
|
||||
id: restore-corpus
|
||||
|
||||
4
.github/workflows/run_integration_tests.yml
vendored
4
.github/workflows/run_integration_tests.yml
vendored
@ -15,7 +15,9 @@ jobs:
|
||||
- name: Set up dotnet core
|
||||
uses: actions/setup-dotnet@v3
|
||||
with:
|
||||
dotnet-version: "8.0.x"
|
||||
dotnet-version: |
|
||||
8.0.x
|
||||
9.0.x
|
||||
|
||||
- name: Restore cached part 1
|
||||
id: restore-cache-p1
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
using System;
|
||||
using System.Buffers;
|
||||
using System.Buffers;
|
||||
|
||||
namespace UglyToad.PdfPig.Core;
|
||||
|
||||
|
||||
@ -71,7 +71,7 @@
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override bool Equals(object obj)
|
||||
public override bool Equals(object? obj)
|
||||
{
|
||||
return obj is IndirectReference other && Equals(other);
|
||||
}
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
/// <summary>
|
||||
/// Convert the string to bytes using the ISO 8859-1 encoding.
|
||||
/// </summary>
|
||||
public static byte[] StringAsLatin1Bytes(string s)
|
||||
public static byte[]? StringAsLatin1Bytes(string? s)
|
||||
{
|
||||
if (s == null)
|
||||
{
|
||||
|
||||
@ -264,7 +264,7 @@
|
||||
/// Try to convert raw bytes to a PdfDocEncoding encoded string. If unsupported characters are encountered
|
||||
/// meaning we cannot safely round-trip the value to bytes this will instead return false.
|
||||
/// </summary>
|
||||
public static bool TryConvertBytesToString(ReadOnlySpan<byte> bytes, out string result)
|
||||
public static bool TryConvertBytesToString(ReadOnlySpan<byte> bytes, out string? result)
|
||||
{
|
||||
result = null;
|
||||
if (bytes.Length == 0)
|
||||
|
||||
@ -70,7 +70,7 @@
|
||||
/// <summary>
|
||||
/// Returns a value indicating whether this <see cref="PdfLine"/> is equal to a specified <see cref="PdfLine"/> .
|
||||
/// </summary>
|
||||
public override bool Equals(object obj)
|
||||
public override bool Equals(object? obj)
|
||||
{
|
||||
return obj is PdfLine other && Equals(other);
|
||||
}
|
||||
|
||||
@ -83,7 +83,7 @@
|
||||
/// <summary>
|
||||
/// Returns a value indicating whether this <see cref="PdfPoint"/> is equal to a specified <see cref="PdfPoint"/> .
|
||||
/// </summary>
|
||||
public override bool Equals(object obj)
|
||||
public override bool Equals(object? obj)
|
||||
{
|
||||
return obj is PdfPoint other && Equals(other);
|
||||
}
|
||||
|
||||
@ -177,7 +177,7 @@
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override bool Equals(object obj)
|
||||
public override bool Equals(object? obj)
|
||||
{
|
||||
return obj is PdfRectangle other && Equals(other);
|
||||
}
|
||||
|
||||
@ -241,8 +241,8 @@
|
||||
public bool IsClosed()
|
||||
{
|
||||
var filteredCount = 0;
|
||||
IPathCommand last = null;
|
||||
IPathCommand first = null;
|
||||
IPathCommand? last = null;
|
||||
IPathCommand? first = null;
|
||||
for (int i = Commands.Count - 1; i >= 0; i--)
|
||||
{
|
||||
var cmd = Commands[i];
|
||||
@ -376,14 +376,14 @@
|
||||
/// Gets a <see cref="PdfRectangle"/> which entirely contains the geometry of the defined path.
|
||||
/// </summary>
|
||||
/// <returns>For paths which don't define any geometry this returns <see langword="null"/>.</returns>
|
||||
public static PdfRectangle? GetBoundingRectangle(IReadOnlyList<PdfSubpath> path)
|
||||
public static PdfRectangle? GetBoundingRectangle(IReadOnlyList<PdfSubpath>? path)
|
||||
{
|
||||
if (path == null || path.Count == 0)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var bboxes = path.Select(x => x.GetBoundingRectangle()).Where(x => x.HasValue).Select(x => x.Value).ToList();
|
||||
var bboxes = path.Select(x => x.GetBoundingRectangle()).Where(x => x.HasValue).Select(x => x!.Value).ToList();
|
||||
if (bboxes.Count == 0)
|
||||
{
|
||||
return null;
|
||||
@ -433,7 +433,7 @@
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override bool Equals(object obj)
|
||||
public override bool Equals(object? obj)
|
||||
{
|
||||
return (obj is Close);
|
||||
}
|
||||
@ -479,7 +479,7 @@
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override bool Equals(object obj)
|
||||
public override bool Equals(object? obj)
|
||||
{
|
||||
if (obj is Move move)
|
||||
{
|
||||
@ -545,7 +545,7 @@
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override bool Equals(object obj)
|
||||
public override bool Equals(object? obj)
|
||||
{
|
||||
if (obj is Line line)
|
||||
{
|
||||
@ -651,7 +651,7 @@
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override bool Equals(object obj)
|
||||
public override bool Equals(object? obj)
|
||||
{
|
||||
if (obj is QuadraticBezierCurve curve)
|
||||
{
|
||||
@ -809,7 +809,7 @@
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override bool Equals(object obj)
|
||||
public override bool Equals(object? obj)
|
||||
{
|
||||
if (obj is CubicBezierCurve curve)
|
||||
{
|
||||
@ -944,7 +944,7 @@
|
||||
/// <summary>
|
||||
/// Compares two <see cref="PdfSubpath"/>s for equality. Paths will only be considered equal if the commands which construct the paths are in the same order.
|
||||
/// </summary>
|
||||
public override bool Equals(object obj)
|
||||
public override bool Equals(object? obj)
|
||||
{
|
||||
if (!(obj is PdfSubpath path) || Commands.Count != path.Commands.Count)
|
||||
{
|
||||
|
||||
@ -463,7 +463,7 @@
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public override bool Equals(object obj)
|
||||
public override bool Equals(object? obj)
|
||||
{
|
||||
return obj is TransformationMatrix other && Equals(other);
|
||||
}
|
||||
|
||||
@ -1,12 +1,14 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.11</Version>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<SignAssembly>true</SignAssembly>
|
||||
<AssemblyOriginatorKeyFile>..\pdfpig.snk</AssemblyOriginatorKeyFile>
|
||||
<Nullable>enable</Nullable>
|
||||
<WarningsAsErrors>nullable</WarningsAsErrors>
|
||||
</PropertyGroup>
|
||||
<ItemGroup Condition="'$(TargetFramework)'=='net462'">
|
||||
<PackageReference Include="System.ValueTuple" Version="4.5.0" />
|
||||
|
||||
@ -89,7 +89,7 @@ namespace UglyToad.PdfPig.Core
|
||||
/// <inheritdoc />
|
||||
public override bool TryGetSecond(out B b)
|
||||
{
|
||||
b = default(B);
|
||||
b = default!;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -135,7 +135,7 @@ namespace UglyToad.PdfPig.Core
|
||||
/// <inheritdoc />
|
||||
public override bool TryGetFirst(out A a)
|
||||
{
|
||||
a = default(A);
|
||||
a = default!;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@ -128,13 +128,13 @@
|
||||
throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pagesTextBlocks));
|
||||
}
|
||||
|
||||
ConcurrentDictionary<int, List<TextBlock>> pageDecorations = new ConcurrentDictionary<int, List<TextBlock>>();
|
||||
ConcurrentDictionary<int, OrderedSet<TextBlock>> pageDecorations = new ConcurrentDictionary<int, OrderedSet<TextBlock>>();
|
||||
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
Parallel.For(0, pagesTextBlocks.Count, parallelOptions, p =>
|
||||
{
|
||||
if (!pageDecorations.TryAdd(p, new List<TextBlock>()))
|
||||
if (!pageDecorations.TryAdd(p, new OrderedSet<TextBlock>()))
|
||||
{
|
||||
throw new ArgumentException("Cannot add element with index " + p + " in ConcurrentDictionary.");
|
||||
}
|
||||
@ -165,7 +165,7 @@
|
||||
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
|
||||
if (score >= similarityThreshold)
|
||||
{
|
||||
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
|
||||
pageDecorations[p].TryAdd(current);
|
||||
}
|
||||
}
|
||||
|
||||
@ -180,7 +180,7 @@
|
||||
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
|
||||
if (score >= similarityThreshold)
|
||||
{
|
||||
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
|
||||
pageDecorations[p].TryAdd(current);
|
||||
}
|
||||
}
|
||||
|
||||
@ -195,7 +195,7 @@
|
||||
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
|
||||
if (score >= similarityThreshold)
|
||||
{
|
||||
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
|
||||
pageDecorations[p].TryAdd(current);
|
||||
}
|
||||
}
|
||||
|
||||
@ -210,12 +210,12 @@
|
||||
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
|
||||
if (score >= similarityThreshold)
|
||||
{
|
||||
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
|
||||
pageDecorations[p].TryAdd(current);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return pageDecorations.OrderBy(x => x.Key).Select(x => x.Value).ToList();
|
||||
return pageDecorations.OrderBy(x => x.Key).Select(x => x.Value.GetList()).ToList();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
56
src/UglyToad.PdfPig.DocumentLayoutAnalysis/OrderedSet.cs
Normal file
56
src/UglyToad.PdfPig.DocumentLayoutAnalysis/OrderedSet.cs
Normal file
@ -0,0 +1,56 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
    using System;
    using System.Collections;
    using System.Collections.Generic;
    using System.Text;

    /// <summary>
    /// A collection of unique items that also records the order in which the items were first added.
    /// Membership is tracked by a <see cref="HashSet{T}"/> and insertion order by a <see cref="List{T}"/>;
    /// both are updated together by <see cref="TryAdd"/> and <see cref="Clear"/>.
    /// </summary>
    /// <typeparam name="T">The type of the items stored in the set.</typeparam>
    internal class OrderedSet<T>
    {
        private readonly HashSet<T> _set;
        private readonly List<T> _list;

        /// <summary>
        /// Create an empty set that compares items with <see cref="EqualityComparer{T}.Default"/>.
        /// </summary>
        public OrderedSet() : this(EqualityComparer<T>.Default)
        {
        }

        /// <summary>
        /// Create an empty set that compares items with the supplied comparer.
        /// </summary>
        /// <param name="comparer">The comparer handed to the underlying <see cref="HashSet{T}"/>.</param>
        public OrderedSet(IEqualityComparer<T> comparer)
        {
            _set = new HashSet<T>(comparer);
            _list = new List<T>();
        }

        /// <summary>
        /// The number of distinct items currently stored.
        /// </summary>
        public int Count => _set.Count;

        /// <summary>
        /// Add the item if it is not already present.
        /// </summary>
        /// <param name="item">The item to add.</param>
        /// <returns><see langword="true"/> if the item was added, <see langword="false"/> if an equal item was already present.</returns>
        public bool TryAdd(T item)
        {
            // HashSet<T>.Add already reports whether the item was new, so a separate Contains lookup is not needed.
            if (!_set.Add(item))
            {
                return false;
            }

            _list.Add(item);

            return true;
        }

        /// <summary>
        /// Remove every item.
        /// </summary>
        public void Clear()
        {
            _list.Clear();
            _set.Clear();
        }

        /// <summary>
        /// Check whether an equal item has been added.
        /// </summary>
        /// <param name="item">The item to look for.</param>
        /// <returns><see langword="true"/> if an equal item is present.</returns>
        public bool Contains(T item)
        {
            // Delegate directly so the answer always agrees with TryAdd, including for a null item
            // (TryAdd stores null, so a separate null check here would report a stored item as missing).
            return _set.Contains(item);
        }

        /// <summary>
        /// Copy the items, in insertion order, into <paramref name="array"/> starting at <paramref name="arrayIndex"/>.
        /// </summary>
        /// <param name="array">The destination array.</param>
        /// <param name="arrayIndex">The index in <paramref name="array"/> at which copying begins.</param>
        public void CopyTo(T[] array, int arrayIndex)
        {
            _list.CopyTo(array, arrayIndex);
        }

        /// <summary>
        /// Get the items in insertion order.
        /// </summary>
        /// <returns>
        /// The backing list itself, not a copy. Modifying the returned list directly does not update
        /// the membership set, so callers should treat it as read-only.
        /// </returns>
        public List<T> GetList()
        {
            return _list;
        }
    }
}
|
||||
@ -48,12 +48,19 @@
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
|
||||
{
|
||||
if (words?.Any() != true)
|
||||
if (words is null)
|
||||
{
|
||||
return Array.Empty<TextBlock>();
|
||||
}
|
||||
|
||||
return GetBlocks(words.ToList(),
|
||||
// Avoid multiple enumeration and unnecessary ToArray() if already a list
|
||||
var wordList = words as IReadOnlyList<Word> ?? words.ToArray();
|
||||
if (wordList.Count == 0)
|
||||
{
|
||||
return Array.Empty<TextBlock>();
|
||||
}
|
||||
|
||||
return GetBlocks(wordList,
|
||||
options.WithinLineBounds, options.WithinLineMultiplier, options.WithinLineBinSize,
|
||||
options.BetweenLineBounds, options.BetweenLineMultiplier, options.BetweenLineBinSize,
|
||||
options.AngularDifferenceBounds,
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.11</Version>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<SignAssembly>true</SignAssembly>
|
||||
|
||||
@ -51,34 +51,49 @@
|
||||
|
||||
if (options.GroupByOrientation)
|
||||
{
|
||||
// axis aligned
|
||||
List<Word> words = GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism);
|
||||
var buckets = new List<Letter>[5];
|
||||
for (int i = 0; i < buckets.Length; i++) buckets[i] = new List<Letter>();
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
foreach (var l in letters)
|
||||
{
|
||||
switch (l.TextOrientation)
|
||||
{
|
||||
case TextOrientation.Horizontal: buckets[0].Add(l); break;
|
||||
case TextOrientation.Rotate270: buckets[1].Add(l); break;
|
||||
case TextOrientation.Rotate180: buckets[2].Add(l); break;
|
||||
case TextOrientation.Rotate90: buckets[3].Add(l); break;
|
||||
default: buckets[4].Add(l); break;
|
||||
}
|
||||
}
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
// Use a thread-safe collection to avoid lock contention.
|
||||
var results = new List<Word>(letters.Count); // Pre-allocate for performance
|
||||
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
// Limit parallelism to avoid oversubscription.
|
||||
var parallelOptions = new System.Threading.Tasks.ParallelOptions
|
||||
{
|
||||
MaxDegreeOfParallelism = options.MaxDegreeOfParallelism > 0 ? options.MaxDegreeOfParallelism : Environment.ProcessorCount
|
||||
};
|
||||
|
||||
// not axis aligned
|
||||
words.AddRange(GetWords(
|
||||
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
|
||||
options.MaximumDistance, options.DistanceMeasure, options.FilterPivot,
|
||||
options.Filter, options.MaxDegreeOfParallelism));
|
||||
|
||||
return words;
|
||||
// Use partitioner for better load balancing and avoid ConcurrentBag overhead
|
||||
System.Threading.Tasks.Parallel.ForEach(
|
||||
System.Collections.Concurrent.Partitioner.Create(0, buckets.Length),
|
||||
parallelOptions,
|
||||
range =>
|
||||
{
|
||||
for (int i = range.Item1; i < range.Item2; i++)
|
||||
{
|
||||
if (buckets[i].Count == 0) continue;
|
||||
var measure = (i == 4) ? options.DistanceMeasure : options.DistanceMeasureAA;
|
||||
var words = GetWords(buckets[i], options.MaximumDistance, measure, options.FilterPivot, options.Filter, options.MaxDegreeOfParallelism);
|
||||
lock (results)
|
||||
{
|
||||
results.AddRange(words);
|
||||
}
|
||||
}
|
||||
});
|
||||
results.TrimExcess();
|
||||
return results;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.11</Version>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<SignAssembly>true</SignAssembly>
|
||||
|
||||
@ -32,4 +32,18 @@ public class AccentedCharactersInBookmarksTests
|
||||
},
|
||||
nodes);
|
||||
}
|
||||
|
||||
[Fact]
public void CanReadContainerBookmarksCorrectly()
{
    // Reads the bookmarks of the same document twice: first with container nodes disallowed,
    // then allowed. The expected counts below are the ones this test was written against.
    var path = IntegrationHelpers.GetDocumentPath("dotnet-ai.pdf");

    using var document = PdfDocument.Open(path);

    var isFound = document.TryGetBookmarks(out var bookmarks, allowContainerNode: false);
    Assert.True(isFound);
    // Assert.Equal reports the actual count on failure, unlike Assert.True(count == 3).
    Assert.Equal(3, bookmarks.Roots.Count);

    isFound = document.TryGetBookmarks(out bookmarks, allowContainerNode: true);
    Assert.True(isFound);
    Assert.True(bookmarks.Roots.Count > 3, $"Expected more than 3 root bookmarks but found {bookmarks.Roots.Count}.");
}
|
||||
}
|
||||
BIN
src/UglyToad.PdfPig.Tests/Integration/Documents/dotnet-ai.pdf
Normal file
BIN
src/UglyToad.PdfPig.Tests/Integration/Documents/dotnet-ai.pdf
Normal file
Binary file not shown.
@ -7,6 +7,15 @@
|
||||
|
||||
public class GithubIssuesTests
|
||||
{
|
||||
[Fact]
public void Issue1122()
{
    // Opening this document with lenient parsing is expected to fail with a format exception
    // whose message starts with the "maximum search depth" text, rather than succeed.
    var documentPath = IntegrationHelpers.GetSpecificTestDocumentPath("StackOverflow_Issue_1122.pdf");
    var lenientOptions = new ParsingOptions() { UseLenientParsing = true };

    void OpenDocument() => PdfDocument.Open(documentPath, lenientOptions);

    var thrown = Assert.Throws<PdfDocumentFormatException>(OpenDocument);

    Assert.StartsWith("Reached maximum search depth while getting indirect reference.", thrown.Message);
}
|
||||
|
||||
[Fact]
|
||||
public void Issue1096()
|
||||
{
|
||||
|
||||
Binary file not shown.
@ -260,6 +260,7 @@
|
||||
"UglyToad.PdfPig.Outline.DocumentBookmarkNode",
|
||||
"UglyToad.PdfPig.Outline.EmbeddedBookmarkNode",
|
||||
"UglyToad.PdfPig.Outline.ExternalBookmarkNode",
|
||||
"UglyToad.PdfPig.Outline.ContainerBookmarkNode",
|
||||
"UglyToad.PdfPig.Outline.UriBookmarkNode",
|
||||
"UglyToad.PdfPig.Outline.Destinations.ExplicitDestination",
|
||||
"UglyToad.PdfPig.Outline.Destinations.ExplicitDestinationCoordinates",
|
||||
|
||||
@ -261,6 +261,47 @@ endobj";
|
||||
Assert.Equal(3, tokens.OfType<DictionaryToken>().Count());
|
||||
}
|
||||
|
||||
[Fact]
public void Document006324Test()
{
    // Content stream containing malformed numeric operands (for example "-0. .61" and "572. .836").
    // The test makes no assertions on the tokens: it passes as long as the scanner
    // reads to the end of the input without throwing.
    const string content =
        """
        q
        1 0 0 1 248.6304 572.546 cm
        0 0 m
        0.021 -0.007 l
        3 -0.003 -0.01 0 0 0 c
        f
        Q
        q
        1 0 0 1 2489394 57249855 cm
        0 0 m
        -0.046 -0.001 -0.609 0.029 -0.286 -0.014 c
        -02.61 -0.067 -0.286 -0. .61 -0 0 c
        f
        Q
        q
        1 0 0 1 24862464 572. .836 cm
        0 0 m
        0.936 -0.029 l
        0.038 -0.021 0.55 -0.014 0 0 c
        f
        Q
        """;

    var scanned = new List<IToken>();

    var inputBytes = StringBytesTestConverter.Convert(content, false).Bytes;
    var scanner = new CoreTokenScanner(inputBytes, true, isStream: true);

    for (var hasToken = scanner.MoveNext(); hasToken; hasToken = scanner.MoveNext())
    {
        scanned.Add(scanner.CurrentToken);
    }
}
|
||||
|
||||
private static void AssertCorrectToken<T, TData>(IToken token, TData expected) where T : IDataToken<TData>
|
||||
{
|
||||
var cast = Assert.IsType<T>(token);
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>net471;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>net471;net8.0;net9.0</TargetFrameworks>
|
||||
<IsTestProject>true</IsTestProject>
|
||||
<IsPackable>false</IsPackable>
|
||||
<DebugType>full</DebugType>
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.11</Version>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<SignAssembly>true</SignAssembly>
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.11</Version>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<SignAssembly>true</SignAssembly>
|
||||
|
||||
@ -25,7 +25,7 @@
|
||||
/// <summary>
|
||||
/// Extract bookmarks, if any.
|
||||
/// </summary>
|
||||
public Bookmarks? GetBookmarks(Catalog catalog)
|
||||
public Bookmarks? GetBookmarks(Catalog catalog,bool allowContainerNode = false)
|
||||
{
|
||||
if (!catalog.CatalogDictionary.TryGet(NameToken.Outlines, pdfScanner, out DictionaryToken? outlinesDictionary))
|
||||
{
|
||||
@ -47,7 +47,7 @@
|
||||
|
||||
while (next != null)
|
||||
{
|
||||
ReadBookmarksRecursively(next, 0, false, seen, catalog.NamedDestinations, roots);
|
||||
ReadBookmarksRecursively(next, 0, false, seen, catalog.NamedDestinations, roots, allowContainerNode);
|
||||
|
||||
if (!next.TryGet(NameToken.Next, out IndirectReferenceToken nextReference)
|
||||
|| !seen.Add(nextReference.Data))
|
||||
@ -65,8 +65,7 @@
|
||||
/// Extract bookmarks recursively.
|
||||
/// </summary>
|
||||
private void ReadBookmarksRecursively(DictionaryToken nodeDictionary, int level, bool readSiblings, HashSet<IndirectReference> seen,
|
||||
NamedDestinations namedDestinations,
|
||||
List<BookmarkNode> list)
|
||||
NamedDestinations namedDestinations, List<BookmarkNode> list, bool allowContainerNode = false)
|
||||
{
|
||||
// 12.3 Document-Level Navigation
|
||||
|
||||
@ -80,7 +79,7 @@
|
||||
var children = new List<BookmarkNode>();
|
||||
if (nodeDictionary.TryGet(NameToken.First, pdfScanner, out DictionaryToken? firstChild))
|
||||
{
|
||||
ReadBookmarksRecursively(firstChild, level + 1, true, seen, namedDestinations, children);
|
||||
ReadBookmarksRecursively(firstChild, level + 1, true, seen, namedDestinations, children, allowContainerNode);
|
||||
}
|
||||
|
||||
BookmarkNode bookmark;
|
||||
@ -108,6 +107,11 @@
|
||||
return;
|
||||
}
|
||||
}
|
||||
else if(allowContainerNode)
|
||||
{
|
||||
bookmark = new ContainerBookmarkNode(title, level, children);
|
||||
log.Warn($"No /Dest(ination) or /A(ction) entry found for bookmark node: {nodeDictionary}.");
|
||||
}
|
||||
else
|
||||
{
|
||||
log.Error($"No /Dest(ination) or /A(ction) entry found for bookmark node: {nodeDictionary}.");
|
||||
@ -138,7 +142,7 @@
|
||||
break;
|
||||
}
|
||||
|
||||
ReadBookmarksRecursively(current, level, false, seen, namedDestinations, list);
|
||||
ReadBookmarksRecursively(current, level, false, seen, namedDestinations, list, allowContainerNode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
16
src/UglyToad.PdfPig/Outline/ContainerBookmarkNode.cs
Normal file
16
src/UglyToad.PdfPig/Outline/ContainerBookmarkNode.cs
Normal file
@ -0,0 +1,16 @@
|
||||
namespace UglyToad.PdfPig.Outline;
|
||||
|
||||
/// <summary>
/// Represents a pure container bookmark node: it has a title and child nodes but no destination or action.
/// <para>This is used to handle the common "grouping" bookmarks in PDFs.</para>
/// </summary>
/// <param name="title">The title of the node, passed unchanged to the <see cref="BookmarkNode"/> constructor.</param>
/// <param name="level">The level of the node, passed unchanged to the <see cref="BookmarkNode"/> constructor.</param>
/// <param name="children">The child nodes, passed unchanged to the <see cref="BookmarkNode"/> constructor.</param>
public class ContainerBookmarkNode(string title, int level, IReadOnlyList<BookmarkNode> children)
    : BookmarkNode(title, level, children)
{
}
|
||||
@ -19,7 +19,8 @@
|
||||
throw new ArgumentNullException(nameof(dictionary));
|
||||
}
|
||||
|
||||
if (dictionary.TryGet(NameToken.Type, out var type) && !ReferenceEquals(type, NameToken.Catalog))
|
||||
if (dictionary.TryGet(NameToken.Type, out var type) && !ReferenceEquals(type, NameToken.Catalog)
|
||||
&& !isLenientParsing)
|
||||
{
|
||||
throw new PdfDocumentFormatException($"The type of the catalog dictionary was not Catalog: {dictionary}.");
|
||||
}
|
||||
|
||||
@ -255,14 +255,14 @@
|
||||
/// Gets the bookmarks if this document contains some.
|
||||
/// </summary>
|
||||
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
|
||||
public bool TryGetBookmarks([NotNullWhen(true)] out Bookmarks? bookmarks)
|
||||
public bool TryGetBookmarks([NotNullWhen(true)] out Bookmarks? bookmarks, bool allowContainerNode = false)
|
||||
{
|
||||
if (isDisposed)
|
||||
{
|
||||
throw new ObjectDisposedException("Cannot access the bookmarks after the document is disposed.");
|
||||
}
|
||||
|
||||
bookmarks = bookmarksProvider.GetBookmarks(Structure.Catalog);
|
||||
bookmarks = bookmarksProvider.GetBookmarks(Structure.Catalog, allowContainerNode);
|
||||
|
||||
return bookmarks != null;
|
||||
}
|
||||
|
||||
@ -112,8 +112,9 @@
|
||||
{
|
||||
AsciiHexDecodeFilter => 0.5,
|
||||
Ascii85Filter => 0.8,
|
||||
FlateFilter or RunLengthFilter => 3,
|
||||
RunLengthFilter => 1.5,
|
||||
LzwFilter => 2,
|
||||
FlateFilter => 10,
|
||||
_ => 1000
|
||||
};
|
||||
}
|
||||
@ -122,12 +123,12 @@
|
||||
/// Returns an equivalent token where any indirect references of child objects are
|
||||
/// recursively traversed and resolved.
|
||||
/// </summary>
|
||||
internal static T? Resolve<T>(this T? token, IPdfTokenScanner scanner, List<IndirectReference>? visited = null) where T : IToken
|
||||
internal static T? Resolve<T>(this T? token, IPdfTokenScanner scanner, HashSet<IndirectReference>? visited = null) where T : IToken
|
||||
{
|
||||
return (T?)ResolveInternal(token, scanner, visited ?? []);
|
||||
}
|
||||
|
||||
private static IToken? ResolveInternal(this IToken? token, IPdfTokenScanner scanner, List<IndirectReference> visited)
|
||||
private static IToken? ResolveInternal(this IToken? token, IPdfTokenScanner scanner, HashSet<IndirectReference> visited)
|
||||
{
|
||||
if (token is StreamToken stream)
|
||||
{
|
||||
|
||||
@ -625,7 +625,8 @@
|
||||
{
|
||||
if (offset < 0)
|
||||
{
|
||||
var result = GetObjectFromStream(lengthReference.Data, offset);
|
||||
ushort searchDepth = 0;
|
||||
var result = GetObjectFromStream(lengthReference.Data, offset, ref searchDepth);
|
||||
|
||||
if (!(result.Data is NumericToken streamLengthToken))
|
||||
{
|
||||
@ -714,9 +715,23 @@
|
||||
|
||||
coreTokenScanner.DeregisterCustomTokenizer(tokenizer);
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
/// Look up the object for an indirect reference.
/// </summary>
/// <param name="reference">The reference to look up.</param>
/// <returns>The result of the depth-tracking overload, which may be <see langword="null"/>.</returns>
public ObjectToken? Get(IndirectReference reference)
{
    // Every top-level lookup starts counting from zero; the private overload increments the
    // counter on each nested lookup and throws once it exceeds its limit.
    ushort depth = 0;

    var found = Get(reference, ref depth);

    return found;
}
|
||||
|
||||
private ObjectToken? Get(IndirectReference reference, ref ushort searchDepth)
|
||||
{
|
||||
if (searchDepth > 100)
|
||||
{
|
||||
throw new PdfDocumentFormatException("Reached maximum search depth while getting indirect reference.");
|
||||
}
|
||||
|
||||
searchDepth++;
|
||||
|
||||
|
||||
if (isDisposed)
|
||||
{
|
||||
throw new ObjectDisposedException(nameof(PdfTokenScanner));
|
||||
@ -740,7 +755,7 @@
|
||||
// Negative offsets refer to a stream with that number.
|
||||
if (offset < 0)
|
||||
{
|
||||
var result = GetObjectFromStream(reference, offset);
|
||||
var result = GetObjectFromStream(reference, offset, ref searchDepth);
|
||||
|
||||
return result;
|
||||
}
|
||||
@ -802,11 +817,11 @@
|
||||
}
|
||||
}
|
||||
|
||||
private ObjectToken GetObjectFromStream(IndirectReference reference, long offset)
|
||||
private ObjectToken GetObjectFromStream(IndirectReference reference, long offset, ref ushort searchDepth)
|
||||
{
|
||||
var streamObjectNumber = offset * -1;
|
||||
|
||||
var streamObject = Get(new IndirectReference(streamObjectNumber, 0));
|
||||
var streamObject = Get(new IndirectReference(streamObjectNumber, 0), ref searchDepth);
|
||||
|
||||
if (!(streamObject?.Data is StreamToken stream))
|
||||
{
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
|
||||
<LangVersion>12</LangVersion>
|
||||
<Version>0.1.11</Version>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<IsTestProject>False</IsTestProject>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<SignAssembly>true</SignAssembly>
|
||||
|
||||
@ -1,5 +1,10 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.CodeAnalysis;
|
||||
using System.Globalization;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using Console = System.Console;
|
||||
|
||||
@ -7,6 +12,113 @@ namespace UglyToad.PdfPig.ConsoleRunner
|
||||
{
|
||||
public static class Program
|
||||
{
|
||||
/// <summary>
/// Describes one optional command line flag and, once parsed, the value supplied for it.
/// </summary>
private class OptionalArg
{
    /// <summary>Short form of the flag, matched on the command line with a single leading dash.</summary>
    public required string ShortSymbol { get; init; }

    /// <summary>Long form of the flag, matched on the command line with two leading dashes.</summary>
    public required string Symbol { get; init; }

    /// <summary>Whether the flag consumes the following command line argument as its value.</summary>
    public required bool SupportsValue { get; init; }

    /// <summary>The value supplied for the flag; stays null unless one was parsed.</summary>
    public string? Value { get; set; }
}
|
||||
|
||||
/// <summary>
/// Result of a successful command line parse: the directory path plus the optional flags that were supplied.
/// </summary>
private class ParsedArgs
{
    /// <summary>The optional flags found on the command line, in the order they were encountered.</summary>
    public required IReadOnlyList<OptionalArg> SuppliedArgs { get; init; }

    /// <summary>The single positional argument, later checked by the caller to be an existing directory.</summary>
    public required string SuppliedDirectoryPath { get; init; }
}
|
||||
|
||||
/// <summary>
/// Builds the list of optional flags the runner understands.
/// New instances are created on every call because parsing stores values on them.
/// </summary>
/// <returns>The supported flags: no-recursion, output and limit.</returns>
private static IReadOnlyList<OptionalArg> GetSupportedArgs()
{
    var noRecursion = new OptionalArg
    {
        Symbol = "no-recursion",
        ShortSymbol = "nr",
        SupportsValue = false
    };

    var output = new OptionalArg
    {
        Symbol = "output",
        ShortSymbol = "o",
        SupportsValue = true
    };

    var limit = new OptionalArg
    {
        Symbol = "limit",
        ShortSymbol = "l",
        SupportsValue = true
    };

    return new[] { noRecursion, output, limit };
}
|
||||
|
||||
/// <summary>
/// Parses the raw command line into exactly one positional directory path plus any supported optional flags.
/// </summary>
/// <param name="args">The raw command line arguments.</param>
/// <param name="parsed">The parsed arguments when the method returns true, otherwise null.</param>
/// <returns>
/// False when no path or more than one path is supplied, a flag is not recognised, a flag is repeated,
/// or a flag that needs a value is the last argument; true otherwise.
/// </returns>
private static bool TryParseArgs(
    string[] args,
    [NotNullWhen(true)] out ParsedArgs? parsed)
{
    parsed = null;
    string? path = null;
    var suppliedOpts = new List<OptionalArg>();

    var opts = GetSupportedArgs();

    for (var i = 0; i < args.Length; i++)
    {
        var str = args[i];

        var isOptFlag = str.StartsWith('-');

        if (!isOptFlag)
        {
            // Only a single positional argument (the directory) is accepted.
            if (path != null)
            {
                return false;
            }

            path = str;
            continue;
        }

        var item = opts.SingleOrDefault(x =>
            string.Equals("-" + x.ShortSymbol, str, StringComparison.OrdinalIgnoreCase)
            || string.Equals("--" + x.Symbol, str, StringComparison.OrdinalIgnoreCase));

        if (item == null)
        {
            return false;
        }

        // A repeated flag would add the same instance twice, which makes the caller's
        // SingleOrDefault lookups throw; treat it as a parse failure instead.
        if (suppliedOpts.Contains(item))
        {
            return false;
        }

        if (item.SupportsValue)
        {
            // A value flag in the last position has nothing to consume.
            if (i == args.Length - 1)
            {
                return false;
            }

            i++;
            item.Value = args[i];
        }

        suppliedOpts.Add(item);
    }

    if (path == null)
    {
        return false;
    }

    parsed = new ParsedArgs
    {
        SuppliedArgs = suppliedOpts,
        SuppliedDirectoryPath = path
    };

    return true;
}
|
||||
|
||||
public static int Main(string[] args)
|
||||
{
|
||||
if (args.Length == 0)
|
||||
@ -15,30 +127,47 @@ namespace UglyToad.PdfPig.ConsoleRunner
|
||||
return 7;
|
||||
}
|
||||
|
||||
var path = args[0];
|
||||
|
||||
if (!Directory.Exists(path))
|
||||
if (!TryParseArgs(args, out var parsed))
|
||||
{
|
||||
Console.WriteLine($"The provided path is not a valid directory: {path}.");
|
||||
var strJoined = string.Join(" ", args);
|
||||
Console.WriteLine($"Unrecognized arguments passed: {strJoined}");
|
||||
return 7;
|
||||
}
|
||||
|
||||
var maxCount = default(int?);
|
||||
|
||||
if (args.Length > 1 && int.TryParse(args[1], out var countIn))
|
||||
if (!Directory.Exists(parsed.SuppliedDirectoryPath))
|
||||
{
|
||||
maxCount = countIn;
|
||||
Console.WriteLine($"The provided path is not a valid directory: {parsed.SuppliedDirectoryPath}.");
|
||||
return 7;
|
||||
}
|
||||
|
||||
int? maxCount = null;
|
||||
var limit = parsed.SuppliedArgs.SingleOrDefault(x => x.ShortSymbol == "l");
|
||||
if (limit?.Value != null && int.TryParse(limit.Value, CultureInfo.InvariantCulture, out var maxCountArg))
|
||||
{
|
||||
Console.WriteLine($"Limiting input files to first: {maxCountArg}");
|
||||
maxCount = maxCountArg;
|
||||
}
|
||||
|
||||
var noRecursionMode = parsed.SuppliedArgs.Any(x => x.ShortSymbol == "nr");
|
||||
var outputOpt = parsed.SuppliedArgs.SingleOrDefault(x => x.ShortSymbol == "o" && x.Value != null);
|
||||
|
||||
var hasError = false;
|
||||
var errorBuilder = new StringBuilder();
|
||||
var fileList = Directory.GetFiles(path, "*.pdf", SearchOption.AllDirectories);
|
||||
var fileList = Directory.GetFiles(
|
||||
parsed.SuppliedDirectoryPath,
|
||||
"*.pdf",
|
||||
noRecursionMode ? SearchOption.TopDirectoryOnly : SearchOption.AllDirectories)
|
||||
.OrderBy(x => x).ToList();
|
||||
var runningCount = 0;
|
||||
|
||||
Console.WriteLine($"Found {fileList.Length} files.");
|
||||
Console.WriteLine($"Found {fileList.Count} files.");
|
||||
Console.WriteLine();
|
||||
|
||||
Console.WriteLine($"{GetCleanFilename("File")}| Size\t| Words\t| Pages");
|
||||
PrintTableColumns("File", "Size", "Words", "Pages", "Open cost (μs)", "Total cost (μs)", "Page cost (μs)");
|
||||
|
||||
var dataList = new List<DataRecord>();
|
||||
|
||||
var sw = new Stopwatch();
|
||||
foreach (var file in fileList)
|
||||
{
|
||||
if (maxCount.HasValue && runningCount >= maxCount)
|
||||
@ -50,8 +179,20 @@ namespace UglyToad.PdfPig.ConsoleRunner
|
||||
{
|
||||
var numWords = 0;
|
||||
var numPages = 0;
|
||||
long openMicros;
|
||||
long totalPageMicros;
|
||||
|
||||
sw.Reset();
|
||||
sw.Start();
|
||||
|
||||
using (var pdfDocument = PdfDocument.Open(file))
|
||||
{
|
||||
sw.Stop();
|
||||
|
||||
openMicros = sw.Elapsed.Microseconds;
|
||||
|
||||
sw.Start();
|
||||
|
||||
foreach (var page in pdfDocument.GetPages())
|
||||
{
|
||||
numPages++;
|
||||
@ -63,13 +204,36 @@ namespace UglyToad.PdfPig.ConsoleRunner
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sw.Stop();
|
||||
totalPageMicros = sw.Elapsed.Microseconds;
|
||||
}
|
||||
|
||||
var filename = Path.GetFileName(file);
|
||||
|
||||
var size = new FileInfo(file);
|
||||
|
||||
Console.WriteLine($"{GetCleanFilename(filename)}| {size.Length}\t| {numWords}\t| {numPages}");
|
||||
var item = new DataRecord
|
||||
{
|
||||
FileName = filename,
|
||||
OpenCostMicros = openMicros,
|
||||
Pages = numPages,
|
||||
Size = size.Length,
|
||||
Words = numWords,
|
||||
TotalCostMicros = totalPageMicros + openMicros,
|
||||
PerPageMicros = Math.Round(totalPageMicros / (double)Math.Max(numPages, 1), 2)
|
||||
};
|
||||
|
||||
dataList.Add(item);
|
||||
|
||||
PrintTableColumns(
|
||||
item.FileName,
|
||||
item.Size,
|
||||
item.Words,
|
||||
item.Pages,
|
||||
item.OpenCostMicros,
|
||||
item.TotalCostMicros,
|
||||
item.PerPageMicros);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@ -88,12 +252,71 @@ namespace UglyToad.PdfPig.ConsoleRunner
|
||||
return 5;
|
||||
}
|
||||
|
||||
if (outputOpt != null && outputOpt.Value != null)
|
||||
{
|
||||
WriteOutput(outputOpt.Value, dataList);
|
||||
}
|
||||
|
||||
Console.WriteLine("Complete! :)");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private static string GetCleanFilename(string name, int maxLength = 30)
|
||||
/// <summary>
/// Writes the collected per-file records to <paramref name="outPath"/> as CSV, replacing any existing file.
/// </summary>
/// <param name="outPath">Path of the CSV file to create or overwrite.</param>
/// <param name="records">The records to write, one row per record after the header row.</param>
private static void WriteOutput(string outPath, IReadOnlyList<DataRecord> records)
{
    // File.Create truncates an existing file. File.OpenWrite does not, so a shorter report
    // written over a longer one would keep stale rows at the end.
    using var fs = File.Create(outPath);
    using var sw = new StreamWriter(fs);

    sw.WriteLine("File,Size,Words,Pages,Open Cost,Total Cost,Per Page");

    foreach (var record in records)
    {
        // Invariant culture keeps the numbers machine readable whatever the current locale is.
        var sizeStr = record.Size.ToString("D", CultureInfo.InvariantCulture);
        var wordsStr = record.Words.ToString("D", CultureInfo.InvariantCulture);
        var pagesStr = record.Pages.ToString("D", CultureInfo.InvariantCulture);
        var openCostStr = record.OpenCostMicros.ToString("D", CultureInfo.InvariantCulture);
        var totalCostStr = record.TotalCostMicros.ToString("D", CultureInfo.InvariantCulture);
        var ppcStr = record.PerPageMicros.ToString("F2", CultureInfo.InvariantCulture);

        var numericPartsStr = string.Join(
            ",",
            sizeStr,
            wordsStr,
            pagesStr,
            openCostStr,
            totalCostStr,
            ppcStr);

        // The file name is a quoted CSV field, so embedded quotes must be doubled.
        var escapedName = record.FileName.Replace("\"", "\"\"");

        sw.WriteLine($"\"{escapedName}\",{numericPartsStr}");
    }

    sw.Flush();
}
|
||||
|
||||
/// <summary>
/// Writes one table row to the console: each value is cleaned, padded with spaces to 16 characters
/// and prefixed with "| ", followed by a line break.
/// </summary>
/// <param name="values">The column values for the row, in display order.</param>
private static void PrintTableColumns(params object[] values)
{
    const int columnWidth = 16;

    foreach (var column in values)
    {
        var text = column.ToString() ?? string.Empty;

        // PadRight only appends spaces when the cleaned text is shorter than the column width.
        var cell = GetCleanStr(text).PadRight(columnWidth);

        Console.Write("| ");
        Console.Write(cell);
    }

    Console.WriteLine();
}
|
||||
|
||||
private static string GetCleanStr(string name, int maxLength = 16)
|
||||
{
|
||||
if (name.Length <= maxLength)
|
||||
{
|
||||
@ -105,4 +328,21 @@ namespace UglyToad.PdfPig.ConsoleRunner
|
||||
return name.Substring(0, maxLength);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// One row of runner output describing a single processed PDF file.
/// </summary>
internal class DataRecord
{
    /// <summary>File name of the processed PDF.</summary>
    public required string FileName { get; init; }

    /// <summary>Size of the file as reported by the file system.</summary>
    public required long Size { get; init; }

    /// <summary>Number of words counted for the file.</summary>
    public required int Words { get; init; }

    /// <summary>Number of pages counted for the file.</summary>
    public required int Pages { get; init; }

    /// <summary>Microsecond timing recorded for opening the document.</summary>
    public required long OpenCostMicros { get; init; }

    /// <summary>Microsecond timing recorded for the file overall.</summary>
    public required long TotalCostMicros { get; init; }

    /// <summary>Microsecond timing expressed per page.</summary>
    public required double PerPageMicros { get; init; }
}
|
||||
}
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
{
|
||||
"profiles": {
|
||||
"UglyToad.PdfPig.ConsoleRunner": {
|
||||
"commandName": "Project",
|
||||
"commandLineArgs": "\"C:\\temp\\pdfs\\archive\""
|
||||
"commandName": "Project",
|
||||
"commandLineArgs": "\"C:\\temp\\pdfs\\archive\""
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -4,6 +4,7 @@
|
||||
<LangVersion>latest</LangVersion>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net8</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
|
||||
<PackageId>PdfPig</PackageId>
|
||||
@ -11,9 +11,9 @@
|
||||
<PackageTags>PDF;Reader;Document;Adobe;PDFBox;PdfPig;pdf-extract;pdf-to-text;pdf;file;text;C#;dotnet;.NET</PackageTags>
|
||||
<RepositoryUrl>https://github.com/UglyToad/PdfPig</RepositoryUrl>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<Version>0.1.11</Version>
|
||||
<AssemblyVersion>0.1.11.0</AssemblyVersion>
|
||||
<FileVersion>0.1.11.0</FileVersion>
|
||||
<Version>0.1.12-alpha001</Version>
|
||||
<AssemblyVersion>0.1.12.0</AssemblyVersion>
|
||||
<FileVersion>0.1.12.0</FileVersion>
|
||||
<PackageIconUrl>https://raw.githubusercontent.com/UglyToad/PdfPig/master/documentation/pdfpig.png</PackageIconUrl>
|
||||
<PackageIcon>pdfpig.png</PackageIcon>
|
||||
<Product>PdfPig</Product>
|
||||
|
||||
5
tools/global.json
Normal file
5
tools/global.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"sdk": {
|
||||
"version": "8.0.*"
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user