Compare commits

...

22 Commits

Author SHA1 Message Date
Karl
3650e27432
add container node support for BookmarksProvider.cs (#1133)
* add container node support for BookmarksProvider.cs

* move position

* fix unit test error

* revert package name

* remove duplicated package info.
2025-08-14 21:17:58 +01:00
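A minimal usage sketch of the opt-in added in this commit (the file path is a placeholder; the method and type names are taken from the PdfDocument and ContainerBookmarkNode diffs further down):

using System;
using System.Linq;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Outline;

// "grouped-bookmarks.pdf" is a placeholder path; by default (allowContainerNode: false)
// outline entries without a /Dest(ination) or /A(ction) entry are still skipped.
using var document = PdfDocument.Open("grouped-bookmarks.pdf");

if (document.TryGetBookmarks(out var bookmarks, allowContainerNode: true))
{
    // Grouping-only entries now come back as ContainerBookmarkNode instances.
    var containerRoots = bookmarks.Roots.OfType<ContainerBookmarkNode>().Count();
    Console.WriteLine($"{bookmarks.Roots.Count} root bookmarks, {containerRoots} pure containers.");
}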
BobLd
a43b968ea9 Lower max search depth when preventing StackOverflow in ParseTrailer 2025-08-10 10:06:23 +01:00
BobLd
1031dcc221 Prevent StackOverflow in ParseTrailer and fix #1122 2025-08-09 08:46:04 +01:00
BobLd
0f641774e6 Update build_and_test_macos.yml 2025-08-09 08:33:34 +01:00
BobLd
a3edc926c8 Update build_and_test_macos.yml 2025-08-09 08:21:21 +01:00
BobLd
f1923fcbcd Increase FlateFilter multiplier when preventing malicious OOM and fix #1125 2025-08-08 19:04:31 +01:00
EliotJones
7ff58893af only run tests if nightly publish needed 2025-08-04 21:46:13 -05:00
EliotJones
bee6f13888 fix tag fetching and parse behavior 2025-08-04 21:40:28 -05:00
EliotJones
e6dd2d15c2 use gemini to mark chat gpt's work and improve the action 2025-08-04 21:00:12 -05:00
EliotJones
7dd5d68be3 prevent duplicate package publish on manual run, attempt 1 2025-08-04 20:49:18 -05:00
BobLd
bdf3b8e2b4 Update nightly_release.yml 2025-08-03 20:03:13 +01:00
BobLd
c8dff885bd Update run_common_crawl_tests.yml 2025-08-03 08:56:17 +01:00
BobLd
0b228c57b7 Update run_integration_tests.yml 2025-08-03 08:52:27 +01:00
BobLd
ef21227b3c Update run_integration_tests.yml 2025-08-03 08:46:40 +01:00
BobLd
b9f2230a0a Add global.json in tools 2025-08-03 08:43:58 +01:00
BobLd
b6950a5fb0
Update run_integration_tests.yml (#1117) 2025-08-03 08:34:50 +01:00
Chuck B.
1ed9e017f4
Performance improvements and .Net 9 support (#1116)
* Refactor letter handling by orientation for efficiency

Improved the processing of letters based on their text orientation by preallocating separate lists for each orientation (horizontal, rotate270, rotate180, rotate90, and other). This change reduces multiple calls to `GetWords` and minimizes enumerations and allocations, enhancing performance and readability. Each letter is now added to the appropriate list in a single iteration over the `letters` collection.

* Update target frameworks to include net9.0

Expanded compatibility in `UglyToad.PdfPig.csproj` by adding
`net9.0` to the list of target frameworks, alongside existing
versions.

* Add .NET 9.0 support and refactor key components

Updated project files for UglyToad.PdfPig to target .NET 9.0, enhancing compatibility with the latest framework features.

Refactored `GetBlocks` in `DocstrumBoundingBoxes.cs` for improved input handling and performance.

Significantly optimized `NearestNeighbourWordExtractor.cs` by replacing multiple lists with an array of buckets and implementing parallel processing for better efficiency.

Consistent updates across `Fonts`, `Tests`, `Tokenization`, and `Tokens` project files to include .NET 9.0 support.

* Improve null checks and optimize list handling

- Updated null check for `words` in `DocstrumBoundingBoxes.cs` for better readability and performance.
- Changed from `ToList()` to `ToArray()` to avoid unnecessary enumeration.
- Added `results.TrimExcess()` in `NearestNeighbourWordExtractor.cs` to optimize memory usage.

---------

Co-authored-by: Chuck Beasley <CBeasley@kilpatricktownsend.com>
2025-08-01 22:24:16 +01:00
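As a condensed illustration of the single-pass bucketing described in the first bullet above (the full change is in the NearestNeighbourWordExtractor.cs diff further down; the UglyToad.PdfPig.Content namespace for Letter/TextOrientation is assumed here):

using System.Collections.Generic;
using UglyToad.PdfPig.Content;

internal static class OrientationBucketingSketch
{
    // One pass over the letters replaces five filtered enumerations of the input.
    public static List<Letter>[] BucketByOrientation(IReadOnlyList<Letter> letters)
    {
        var buckets = new List<Letter>[5];
        for (int i = 0; i < buckets.Length; i++) buckets[i] = new List<Letter>();

        foreach (var letter in letters)
        {
            var index = letter.TextOrientation switch
            {
                TextOrientation.Horizontal => 0,
                TextOrientation.Rotate270 => 1,
                TextOrientation.Rotate180 => 2,
                TextOrientation.Rotate90 => 3,
                _ => 4 // TextOrientation.Other
            };
            buckets[index].Add(letter);
        }

        return buckets;
    }
}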
EliotJones
83d6fc6cc2 allow missing catalog type definition for catalog dictionary
as long as there is a pages entry we accept this in lenient parsing mode. this
is to fix document 006705.pdf in the corpus that had '/calalog' as the dictionary
entry.

also adds a test for some weird content stream content in 0006324.pdf where
numbers seem to get split in the content stream on a decimal place. this is
just to check that our parser doesn't hard crash
2025-07-27 02:55:29 +01:00
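A minimal sketch of how a consumer opts into the lenient behaviour (the file name refers to the corpus document mentioned above; ParsingOptions.UseLenientParsing is the same option used by the new Issue1122 test below):

using UglyToad.PdfPig;

// With lenient parsing enabled, a catalog dictionary whose /Type is not /Catalog
// (the commit message cites a corpus file with a '/calalog' entry) no longer throws,
// as long as the catalog still provides a usable /Pages entry.
using var document = PdfDocument.Open("006705.pdf", new ParsingOptions { UseLenientParsing = true });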
theolivenbaum
febfa4d4b3 Fix usage of List.Contains 2025-07-27 02:52:56 +01:00
Eliot Jones
0ebbe0540d
add nullability to core project (#1111) 2025-07-27 02:48:58 +01:00
EliotJones
52c0635273 support performance profiling information in console runner 2025-07-26 15:04:03 -05:00
EliotJones
b6bd0a3169 bump version to 0.1.12-alpha001 2025-07-26 13:43:28 -05:00
42 changed files with 583 additions and 130 deletions

View File

@ -19,10 +19,11 @@ jobs:
2.1.x
6.0.x
8.0.x
9.0.x
# Build the release build
- name: Build the solution
run: dotnet build -c Release src/UglyToad.PdfPig.sln
run: dotnet build -c Release src/UglyToad.PdfPig.sln -f net8.0
- name: Run the tests
run: dotnet test -c Release src/UglyToad.PdfPig.sln
run: dotnet test -c Release src/UglyToad.PdfPig.sln -f net8.0

View File

@ -5,39 +5,55 @@ on:
- cron: "0 0 * * *"
workflow_dispatch:
jobs:
tests:
uses: ./.github/workflows/run_integration_tests.yml
check_date:
runs-on: ubuntu-latest
name: Check latest commit
outputs:
should_run: ${{ steps.should_run.outputs.should_run }}
steps:
- uses: actions/checkout@master
- name: print latest_commit
run: echo ${{ github.sha }}
permissions:
contents: write # Grant write permission for tagging
jobs:
check_publish_needed:
runs-on: ubuntu-latest
name: Check if this commit has already been published
outputs:
should_run: ${{ steps.check.outputs.should_run }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Fetch tags
run: git fetch --tags
- id: check
run: |
latest_commit=$(git rev-parse nightly-latest || echo "")
echo "Latest published commit: $latest_commit"
if [ "$latest_commit" = "${{ github.sha }}" ]; then
echo "No new commit since last publish."
echo "should_run=false" >> $GITHUB_OUTPUT
else
echo "New commit detected."
echo "should_run=true" >> $GITHUB_OUTPUT
fi
tests:
needs: check_publish_needed
if: ${{ needs.check_publish_needed.outputs.should_run == 'true' }}
uses: ./.github/workflows/run_integration_tests.yml
- id: should_run
continue-on-error: true
name: check latest commit is less than a day ago
if: ${{ github.event_name == 'schedule' }}
run: test -z $(git rev-list --after="24 hours" ${{ github.sha }}) && echo "::set-output name=should_run::false"
build_and_publish_nightly:
needs: [check_date, tests]
if: ${{ needs.check_date.outputs.should_run != 'false' }}
needs: [check_publish_needed, tests]
if: ${{ needs.check_publish_needed.outputs.should_run == 'true' }}
runs-on: windows-2022
name: build_and_publish_nightly
steps:
- uses: actions/checkout@master
- uses: actions/checkout@v4
- name: Set up dotnet core
uses: actions/setup-dotnet@v2
uses: actions/setup-dotnet@v4
with:
dotnet-version: |
2.1.x
6.0.x
8.0.x
9.0.x
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1.0.2
@ -54,5 +70,10 @@ jobs:
- name: Publish Nuget to GitHub registry
run: dotnet nuget push **/*.nupkg --api-key ${{secrets.NUGET_API_KEY}} --source https://api.nuget.org/v3/index.json
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Tag latest nightly commit
run: |
git config user.name "github-actions"
git config user.email "github-actions@github.com"
git tag -f nightly-latest ${{ github.sha }}
git push origin nightly-latest --force

View File

@ -15,7 +15,9 @@ jobs:
- name: Set up dotnet core
uses: actions/setup-dotnet@v3
with:
dotnet-version: "8.0.x"
dotnet-version: |
8.0.x
9.0.x
- name: Restore corpus cache 0000, 0001
id: restore-corpus

View File

@ -15,7 +15,9 @@ jobs:
- name: Set up dotnet core
uses: actions/setup-dotnet@v3
with:
dotnet-version: "8.0.x"
dotnet-version: |
8.0.x
9.0.x
- name: Restore cached part 1
id: restore-cache-p1

View File

@ -1,5 +1,4 @@
using System;
using System.Buffers;
using System.Buffers;
namespace UglyToad.PdfPig.Core;

View File

@ -71,7 +71,7 @@
}
/// <inheritdoc />
public override bool Equals(object obj)
public override bool Equals(object? obj)
{
return obj is IndirectReference other && Equals(other);
}

View File

@ -16,7 +16,7 @@
/// <summary>
/// Convert the string to bytes using the ISO 8859-1 encoding.
/// </summary>
public static byte[] StringAsLatin1Bytes(string s)
public static byte[]? StringAsLatin1Bytes(string? s)
{
if (s == null)
{

View File

@ -264,7 +264,7 @@
/// Try to convert raw bytes to a PdfDocEncoding encoded string. If unsupported characters are encountered
/// meaning we cannot safely round-trip the value to bytes this will instead return false.
/// </summary>
public static bool TryConvertBytesToString(ReadOnlySpan<byte> bytes, out string result)
public static bool TryConvertBytesToString(ReadOnlySpan<byte> bytes, out string? result)
{
result = null;
if (bytes.Length == 0)

View File

@ -70,7 +70,7 @@
/// <summary>
/// Returns a value indicating whether this <see cref="PdfLine"/> is equal to a specified <see cref="PdfLine"/> .
/// </summary>
public override bool Equals(object obj)
public override bool Equals(object? obj)
{
return obj is PdfLine other && Equals(other);
}

View File

@ -83,7 +83,7 @@
/// <summary>
/// Returns a value indicating whether this <see cref="PdfPoint"/> is equal to a specified <see cref="PdfPoint"/> .
/// </summary>
public override bool Equals(object obj)
public override bool Equals(object? obj)
{
return obj is PdfPoint other && Equals(other);
}

View File

@ -177,7 +177,7 @@
}
/// <inheritdoc />
public override bool Equals(object obj)
public override bool Equals(object? obj)
{
return obj is PdfRectangle other && Equals(other);
}

View File

@ -241,8 +241,8 @@
public bool IsClosed()
{
var filteredCount = 0;
IPathCommand last = null;
IPathCommand first = null;
IPathCommand? last = null;
IPathCommand? first = null;
for (int i = Commands.Count - 1; i >= 0; i--)
{
var cmd = Commands[i];
@ -376,14 +376,14 @@
/// Gets a <see cref="PdfRectangle"/> which entirely contains the geometry of the defined path.
/// </summary>
/// <returns>For paths which don't define any geometry this returns <see langword="null"/>.</returns>
public static PdfRectangle? GetBoundingRectangle(IReadOnlyList<PdfSubpath> path)
public static PdfRectangle? GetBoundingRectangle(IReadOnlyList<PdfSubpath>? path)
{
if (path == null || path.Count == 0)
{
return null;
}
var bboxes = path.Select(x => x.GetBoundingRectangle()).Where(x => x.HasValue).Select(x => x.Value).ToList();
var bboxes = path.Select(x => x.GetBoundingRectangle()).Where(x => x.HasValue).Select(x => x!.Value).ToList();
if (bboxes.Count == 0)
{
return null;
@ -433,7 +433,7 @@
}
/// <inheritdoc />
public override bool Equals(object obj)
public override bool Equals(object? obj)
{
return (obj is Close);
}
@ -479,7 +479,7 @@
}
/// <inheritdoc />
public override bool Equals(object obj)
public override bool Equals(object? obj)
{
if (obj is Move move)
{
@ -545,7 +545,7 @@
}
/// <inheritdoc />
public override bool Equals(object obj)
public override bool Equals(object? obj)
{
if (obj is Line line)
{
@ -651,7 +651,7 @@
}
/// <inheritdoc />
public override bool Equals(object obj)
public override bool Equals(object? obj)
{
if (obj is QuadraticBezierCurve curve)
{
@ -809,7 +809,7 @@
}
/// <inheritdoc />
public override bool Equals(object obj)
public override bool Equals(object? obj)
{
if (obj is CubicBezierCurve curve)
{
@ -944,7 +944,7 @@
/// <summary>
/// Compares two <see cref="PdfSubpath"/>s for equality. Paths will only be considered equal if the commands which construct the paths are in the same order.
/// </summary>
public override bool Equals(object obj)
public override bool Equals(object? obj)
{
if (!(obj is PdfSubpath path) || Commands.Count != path.Commands.Count)
{

View File

@ -463,7 +463,7 @@
}
/// <inheritdoc />
public override bool Equals(object obj)
public override bool Equals(object? obj)
{
return obj is TransformationMatrix other && Equals(other);
}

View File

@ -1,12 +1,14 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.11</Version>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<SignAssembly>true</SignAssembly>
<AssemblyOriginatorKeyFile>..\pdfpig.snk</AssemblyOriginatorKeyFile>
<Nullable>enable</Nullable>
<WarningsAsErrors>nullable</WarningsAsErrors>
</PropertyGroup>
<ItemGroup Condition="'$(TargetFramework)'=='net462'">
<PackageReference Include="System.ValueTuple" Version="4.5.0" />

View File

@ -89,7 +89,7 @@ namespace UglyToad.PdfPig.Core
/// <inheritdoc />
public override bool TryGetSecond(out B b)
{
b = default(B);
b = default!;
return false;
}
@ -135,7 +135,7 @@ namespace UglyToad.PdfPig.Core
/// <inheritdoc />
public override bool TryGetFirst(out A a)
{
a = default(A);
a = default!;
return false;
}

View File

@ -128,13 +128,13 @@
throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pagesTextBlocks));
}
ConcurrentDictionary<int, List<TextBlock>> pageDecorations = new ConcurrentDictionary<int, List<TextBlock>>();
ConcurrentDictionary<int, OrderedSet<TextBlock>> pageDecorations = new ConcurrentDictionary<int, OrderedSet<TextBlock>>();
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
Parallel.For(0, pagesTextBlocks.Count, parallelOptions, p =>
{
if (!pageDecorations.TryAdd(p, new List<TextBlock>()))
if (!pageDecorations.TryAdd(p, new OrderedSet<TextBlock>()))
{
throw new ArgumentException("Cannot add element with index " + p + " in ConcurrentDictionary.");
}
@ -165,7 +165,7 @@
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
if (score >= similarityThreshold)
{
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
pageDecorations[p].TryAdd(current);
}
}
@ -180,7 +180,7 @@
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
if (score >= similarityThreshold)
{
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
pageDecorations[p].TryAdd(current);
}
}
@ -195,7 +195,7 @@
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
if (score >= similarityThreshold)
{
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
pageDecorations[p].TryAdd(current);
}
}
@ -210,12 +210,12 @@
var score = Score(current, previousPage, nextPage, minimumEditDistanceNormalised, similarityThreshold, n);
if (score >= similarityThreshold)
{
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);
pageDecorations[p].TryAdd(current);
}
}
});
return pageDecorations.OrderBy(x => x.Key).Select(x => x.Value).ToList();
return pageDecorations.OrderBy(x => x.Key).Select(x => x.Value.GetList()).ToList();
}
/// <summary>

View File

@ -0,0 +1,56 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
internal class OrderedSet<T>
{
private readonly HashSet<T> _set;
private readonly List<T> _list;
public OrderedSet() : this(EqualityComparer<T>.Default)
{
}
public OrderedSet(IEqualityComparer<T> comparer)
{
_set = new HashSet<T>(comparer);
_list = new List<T>();
}
public int Count => _set.Count;
public bool TryAdd(T item)
{
if (_set.Contains(item)) return false;
_list.Add(item);
_set.Add(item);
return true;
}
public void Clear()
{
_list.Clear();
_set.Clear();
}
public bool Contains(T item)
{
return item is not null && _set.Contains(item);
}
public void CopyTo(T[] array, int arrayIndex)
{
_list.CopyTo(array, arrayIndex);
}
public List<T> GetList()
{
return _list;
}
}
}
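For context, a minimal sketch of how the new set replaces the previous Contains-then-Add pattern in DecorationTextBlockClassifier (both forms appear verbatim in the diff above):

// Before: List<TextBlock>, linear scan on every insert.
if (!pageDecorations[p].Contains(current)) pageDecorations[p].Add(current);

// After: OrderedSet<TextBlock>.TryAdd uses a hash-based check and preserves insertion order.
pageDecorations[p].TryAdd(current);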

View File

@ -48,12 +48,19 @@
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words)
{
if (words?.Any() != true)
if (words is null)
{
return Array.Empty<TextBlock>();
}
return GetBlocks(words.ToList(),
// Avoid multiple enumeration and unnecessary ToArray() if already a list
var wordList = words as IReadOnlyList<Word> ?? words.ToArray();
if (wordList.Count == 0)
{
return Array.Empty<TextBlock>();
}
return GetBlocks(wordList,
options.WithinLineBounds, options.WithinLineMultiplier, options.WithinLineBinSize,
options.BetweenLineBounds, options.BetweenLineMultiplier, options.BetweenLineBinSize,
options.AngularDifferenceBounds,

View File

@ -1,8 +1,8 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.11</Version>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<SignAssembly>true</SignAssembly>

View File

@ -51,34 +51,49 @@
if (options.GroupByOrientation)
{
// axis aligned
List<Word> words = GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism);
var buckets = new List<Letter>[5];
for (int i = 0; i < buckets.Length; i++) buckets[i] = new List<Letter>();
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate270).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
foreach (var l in letters)
{
switch (l.TextOrientation)
{
case TextOrientation.Horizontal: buckets[0].Add(l); break;
case TextOrientation.Rotate270: buckets[1].Add(l); break;
case TextOrientation.Rotate180: buckets[2].Add(l); break;
case TextOrientation.Rotate90: buckets[3].Add(l); break;
default: buckets[4].Add(l); break;
}
}
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate180).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
// Collect results into a shared list; writes are synchronised with the lock below.
var results = new List<Word>(letters.Count); // Pre-allocate for performance
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Rotate90).ToList(),
options.MaximumDistance, options.DistanceMeasureAA, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
// Limit parallelism to avoid oversubscription.
var parallelOptions = new System.Threading.Tasks.ParallelOptions
{
MaxDegreeOfParallelism = options.MaxDegreeOfParallelism > 0 ? options.MaxDegreeOfParallelism : Environment.ProcessorCount
};
// not axis aligned
words.AddRange(GetWords(
letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList(),
options.MaximumDistance, options.DistanceMeasure, options.FilterPivot,
options.Filter, options.MaxDegreeOfParallelism));
return words;
// Use partitioner for better load balancing and avoid ConcurrentBag overhead
System.Threading.Tasks.Parallel.ForEach(
System.Collections.Concurrent.Partitioner.Create(0, buckets.Length),
parallelOptions,
range =>
{
for (int i = range.Item1; i < range.Item2; i++)
{
if (buckets[i].Count == 0) continue;
var measure = (i == 4) ? options.DistanceMeasure : options.DistanceMeasureAA;
var words = GetWords(buckets[i], options.MaximumDistance, measure, options.FilterPivot, options.Filter, options.MaxDegreeOfParallelism);
lock (results)
{
results.AddRange(words);
}
}
});
results.TrimExcess();
return results;
}
else
{

View File

@ -1,8 +1,8 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.11</Version>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<SignAssembly>true</SignAssembly>

View File

@ -32,4 +32,18 @@ public class AccentedCharactersInBookmarksTests
},
nodes);
}
[Fact]
public void CanReadContainerBookmarksCorrectly()
{
var path = IntegrationHelpers.GetDocumentPath("dotnet-ai.pdf");
using var document = PdfDocument.Open(path);
var isFound = document.TryGetBookmarks(out var bookmarks, false);
Assert.True(isFound);
Assert.True(bookmarks.Roots.Count == 3);
isFound = document.TryGetBookmarks(out bookmarks, true);
Assert.True(isFound);
Assert.True(bookmarks.Roots.Count > 3);
}
}

View File

@ -7,6 +7,15 @@
public class GithubIssuesTests
{
[Fact]
public void Issue1122()
{
var path = IntegrationHelpers.GetSpecificTestDocumentPath("StackOverflow_Issue_1122.pdf");
var ex = Assert.Throws<PdfDocumentFormatException>(() => PdfDocument.Open(path, new ParsingOptions() { UseLenientParsing = true }));
Assert.StartsWith("Reached maximum search depth while getting indirect reference.", ex.Message);
}
[Fact]
public void Issue1096()
{

View File

@ -260,6 +260,7 @@
"UglyToad.PdfPig.Outline.DocumentBookmarkNode",
"UglyToad.PdfPig.Outline.EmbeddedBookmarkNode",
"UglyToad.PdfPig.Outline.ExternalBookmarkNode",
"UglyToad.PdfPig.Outline.ContainerBookmarkNode",
"UglyToad.PdfPig.Outline.UriBookmarkNode",
"UglyToad.PdfPig.Outline.Destinations.ExplicitDestination",
"UglyToad.PdfPig.Outline.Destinations.ExplicitDestinationCoordinates",

View File

@ -261,6 +261,47 @@ endobj";
Assert.Equal(3, tokens.OfType<DictionaryToken>().Count());
}
[Fact]
public void Document006324Test()
{
const string content =
"""
q
1 0 0 1 248.6304 572.546 cm
0 0 m
0.021 -0.007 l
3 -0.003 -0.01 0 0 0 c
f
Q
q
1 0 0 1 2489394 57249855 cm
0 0 m
-0.046 -0.001 -0.609 0.029 -0.286 -0.014 c
-02.61 -0.067 -0.286 -0. .61 -0 0 c
f
Q
q
1 0 0 1 24862464 572. .836 cm
0 0 m
0.936 -0.029 l
0.038 -0.021 0.55 -0.014 0 0 c
f
Q
""";
var tokens = new List<IToken>();
var scanner = new CoreTokenScanner(
StringBytesTestConverter.Convert(content, false).Bytes,
true,
isStream: true);
while (scanner.MoveNext())
{
tokens.Add(scanner.CurrentToken);
}
}
private static void AssertCorrectToken<T, TData>(IToken token, TData expected) where T : IDataToken<TData>
{
var cast = Assert.IsType<T>(token);

View File

@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>net471;net8.0</TargetFrameworks>
<TargetFrameworks>net471;net8.0;net9.0</TargetFrameworks>
<IsTestProject>true</IsTestProject>
<IsPackable>false</IsPackable>
<DebugType>full</DebugType>

View File

@ -1,8 +1,8 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.11</Version>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<SignAssembly>true</SignAssembly>

View File

@ -1,8 +1,8 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.11</Version>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<SignAssembly>true</SignAssembly>

View File

@ -25,7 +25,7 @@
/// <summary>
/// Extract bookmarks, if any.
/// </summary>
public Bookmarks? GetBookmarks(Catalog catalog)
public Bookmarks? GetBookmarks(Catalog catalog,bool allowContainerNode = false)
{
if (!catalog.CatalogDictionary.TryGet(NameToken.Outlines, pdfScanner, out DictionaryToken? outlinesDictionary))
{
@ -47,7 +47,7 @@
while (next != null)
{
ReadBookmarksRecursively(next, 0, false, seen, catalog.NamedDestinations, roots);
ReadBookmarksRecursively(next, 0, false, seen, catalog.NamedDestinations, roots, allowContainerNode);
if (!next.TryGet(NameToken.Next, out IndirectReferenceToken nextReference)
|| !seen.Add(nextReference.Data))
@ -65,8 +65,7 @@
/// Extract bookmarks recursively.
/// </summary>
private void ReadBookmarksRecursively(DictionaryToken nodeDictionary, int level, bool readSiblings, HashSet<IndirectReference> seen,
NamedDestinations namedDestinations,
List<BookmarkNode> list)
NamedDestinations namedDestinations, List<BookmarkNode> list, bool allowContainerNode = false)
{
// 12.3 Document-Level Navigation
@ -80,7 +79,7 @@
var children = new List<BookmarkNode>();
if (nodeDictionary.TryGet(NameToken.First, pdfScanner, out DictionaryToken? firstChild))
{
ReadBookmarksRecursively(firstChild, level + 1, true, seen, namedDestinations, children);
ReadBookmarksRecursively(firstChild, level + 1, true, seen, namedDestinations, children, allowContainerNode);
}
BookmarkNode bookmark;
@ -108,6 +107,11 @@
return;
}
}
else if(allowContainerNode)
{
bookmark = new ContainerBookmarkNode(title, level, children);
log.Warn($"No /Dest(ination) or /A(ction) entry found for bookmark node: {nodeDictionary}.");
}
else
{
log.Error($"No /Dest(ination) or /A(ction) entry found for bookmark node: {nodeDictionary}.");
@ -138,7 +142,7 @@
break;
}
ReadBookmarksRecursively(current, level, false, seen, namedDestinations, list);
ReadBookmarksRecursively(current, level, false, seen, namedDestinations, list, allowContainerNode);
}
}
}

View File

@ -0,0 +1,16 @@
namespace UglyToad.PdfPig.Outline;
/// <summary>
/// represents a pure container bookmark node: it has a title and child nodes but no destination or action.
/// <para>This is used to handle the common "grouping" bookmarks in PDFs.</para>
/// </summary>
public class ContainerBookmarkNode : BookmarkNode
{
/// <summary>
/// create a container bookmark node.
/// </summary>
public ContainerBookmarkNode(string title, int level, IReadOnlyList<BookmarkNode> children)
: base(title, level, children)
{
}
}

View File

@ -19,7 +19,8 @@
throw new ArgumentNullException(nameof(dictionary));
}
if (dictionary.TryGet(NameToken.Type, out var type) && !ReferenceEquals(type, NameToken.Catalog))
if (dictionary.TryGet(NameToken.Type, out var type) && !ReferenceEquals(type, NameToken.Catalog)
&& !isLenientParsing)
{
throw new PdfDocumentFormatException($"The type of the catalog dictionary was not Catalog: {dictionary}.");
}

View File

@ -255,14 +255,14 @@
/// Gets the bookmarks if this document contains some.
/// </summary>
/// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
public bool TryGetBookmarks([NotNullWhen(true)] out Bookmarks? bookmarks)
public bool TryGetBookmarks([NotNullWhen(true)] out Bookmarks? bookmarks, bool allowContainerNode = false)
{
if (isDisposed)
{
throw new ObjectDisposedException("Cannot access the bookmarks after the document is disposed.");
}
bookmarks = bookmarksProvider.GetBookmarks(Structure.Catalog);
bookmarks = bookmarksProvider.GetBookmarks(Structure.Catalog, allowContainerNode);
return bookmarks != null;
}

View File

@ -112,8 +112,9 @@
{
AsciiHexDecodeFilter => 0.5,
Ascii85Filter => 0.8,
FlateFilter or RunLengthFilter => 3,
RunLengthFilter => 1.5,
LzwFilter => 2,
FlateFilter => 10,
_ => 1000
};
}
@ -122,12 +123,12 @@
/// Returns an equivalent token where any indirect references of child objects are
/// recursively traversed and resolved.
/// </summary>
internal static T? Resolve<T>(this T? token, IPdfTokenScanner scanner, List<IndirectReference>? visited = null) where T : IToken
internal static T? Resolve<T>(this T? token, IPdfTokenScanner scanner, HashSet<IndirectReference>? visited = null) where T : IToken
{
return (T?)ResolveInternal(token, scanner, visited ?? []);
}
private static IToken? ResolveInternal(this IToken? token, IPdfTokenScanner scanner, List<IndirectReference> visited)
private static IToken? ResolveInternal(this IToken? token, IPdfTokenScanner scanner, HashSet<IndirectReference> visited)
{
if (token is StreamToken stream)
{

View File

@ -625,7 +625,8 @@
{
if (offset < 0)
{
var result = GetObjectFromStream(lengthReference.Data, offset);
ushort searchDepth = 0;
var result = GetObjectFromStream(lengthReference.Data, offset, ref searchDepth);
if (!(result.Data is NumericToken streamLengthToken))
{
@ -714,9 +715,23 @@
coreTokenScanner.DeregisterCustomTokenizer(tokenizer);
}
public ObjectToken? Get(IndirectReference reference)
{
ushort searchDepth = 0;
return Get(reference, ref searchDepth);
}
private ObjectToken? Get(IndirectReference reference, ref ushort searchDepth)
{
if (searchDepth > 100)
{
throw new PdfDocumentFormatException("Reached maximum search depth while getting indirect reference.");
}
searchDepth++;
if (isDisposed)
{
throw new ObjectDisposedException(nameof(PdfTokenScanner));
@ -740,7 +755,7 @@
// Negative offsets refer to a stream with that number.
if (offset < 0)
{
var result = GetObjectFromStream(reference, offset);
var result = GetObjectFromStream(reference, offset, ref searchDepth);
return result;
}
@ -802,11 +817,11 @@
}
}
private ObjectToken GetObjectFromStream(IndirectReference reference, long offset)
private ObjectToken GetObjectFromStream(IndirectReference reference, long offset, ref ushort searchDepth)
{
var streamObjectNumber = offset * -1;
var streamObject = Get(new IndirectReference(streamObjectNumber, 0));
var streamObject = Get(new IndirectReference(streamObjectNumber, 0), ref searchDepth);
if (!(streamObject?.Data is StreamToken stream))
{

View File

@ -1,8 +1,8 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0;net9.0</TargetFrameworks>
<LangVersion>12</LangVersion>
<Version>0.1.11</Version>
<Version>0.1.12-alpha001</Version>
<IsTestProject>False</IsTestProject>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<SignAssembly>true</SignAssembly>

View File

@ -1,5 +1,10 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using Console = System.Console;
@ -7,6 +12,113 @@ namespace UglyToad.PdfPig.ConsoleRunner
{
public static class Program
{
private class OptionalArg
{
public required string ShortSymbol { get; init; }
public required string Symbol { get; init; }
public required bool SupportsValue { get; init; }
public string? Value { get; set; }
}
private class ParsedArgs
{
public required IReadOnlyList<OptionalArg> SuppliedArgs { get; init; }
public required string SuppliedDirectoryPath { get; init; }
}
private static IReadOnlyList<OptionalArg> GetSupportedArgs() =>
[
new OptionalArg
{
SupportsValue = false,
ShortSymbol = "nr",
Symbol = "no-recursion"
},
new OptionalArg
{
SupportsValue = true,
ShortSymbol = "o",
Symbol = "output"
},
new OptionalArg
{
SupportsValue = true,
ShortSymbol = "l",
Symbol = "limit"
}
];
private static bool TryParseArgs(
string[] args,
[NotNullWhen(true)] out ParsedArgs? parsed)
{
parsed = null;
string? path = null;
var suppliedOpts = new List<OptionalArg>();
var opts = GetSupportedArgs();
for (var i = 0; i < args.Length; i++)
{
var str = args[i];
var isOptFlag = str.StartsWith('-');
if (!isOptFlag)
{
if (path == null)
{
path = str;
}
else
{
return false;
}
}
else
{
var item = opts.SingleOrDefault(x =>
string.Equals("-" + x.ShortSymbol, str, StringComparison.OrdinalIgnoreCase)
|| string.Equals("--" + x.Symbol, str, StringComparison.OrdinalIgnoreCase));
if (item == null)
{
return false;
}
if (item.SupportsValue)
{
if (i == args.Length - 1)
{
return false;
}
i++;
item.Value = args[i];
}
suppliedOpts.Add(item);
}
}
if (path == null)
{
return false;
}
parsed = new ParsedArgs
{
SuppliedArgs = suppliedOpts,
SuppliedDirectoryPath = path
};
return true;
}
public static int Main(string[] args)
{
if (args.Length == 0)
@ -15,30 +127,47 @@ namespace UglyToad.PdfPig.ConsoleRunner
return 7;
}
var path = args[0];
if (!Directory.Exists(path))
if (!TryParseArgs(args, out var parsed))
{
Console.WriteLine($"The provided path is not a valid directory: {path}.");
var strJoined = string.Join(" ", args);
Console.WriteLine($"Unrecognized arguments passed: {strJoined}");
return 7;
}
var maxCount = default(int?);
if (args.Length > 1 && int.TryParse(args[1], out var countIn))
if (!Directory.Exists(parsed.SuppliedDirectoryPath))
{
maxCount = countIn;
Console.WriteLine($"The provided path is not a valid directory: {parsed.SuppliedDirectoryPath}.");
return 7;
}
int? maxCount = null;
var limit = parsed.SuppliedArgs.SingleOrDefault(x => x.ShortSymbol == "l");
if (limit?.Value != null && int.TryParse(limit.Value, CultureInfo.InvariantCulture, out var maxCountArg))
{
Console.WriteLine($"Limiting input files to first: {maxCountArg}");
maxCount = maxCountArg;
}
var noRecursionMode = parsed.SuppliedArgs.Any(x => x.ShortSymbol == "nr");
var outputOpt = parsed.SuppliedArgs.SingleOrDefault(x => x.ShortSymbol == "o" && x.Value != null);
var hasError = false;
var errorBuilder = new StringBuilder();
var fileList = Directory.GetFiles(path, "*.pdf", SearchOption.AllDirectories);
var fileList = Directory.GetFiles(
parsed.SuppliedDirectoryPath,
"*.pdf",
noRecursionMode ? SearchOption.TopDirectoryOnly : SearchOption.AllDirectories)
.OrderBy(x => x).ToList();
var runningCount = 0;
Console.WriteLine($"Found {fileList.Length} files.");
Console.WriteLine($"Found {fileList.Count} files.");
Console.WriteLine();
Console.WriteLine($"{GetCleanFilename("File")}| Size\t| Words\t| Pages");
PrintTableColumns("File", "Size", "Words", "Pages", "Open cost (μs)", "Total cost (μs)", "Page cost (μs)");
var dataList = new List<DataRecord>();
var sw = new Stopwatch();
foreach (var file in fileList)
{
if (maxCount.HasValue && runningCount >= maxCount)
@ -50,8 +179,20 @@ namespace UglyToad.PdfPig.ConsoleRunner
{
var numWords = 0;
var numPages = 0;
long openMicros;
long totalPageMicros;
sw.Reset();
sw.Start();
using (var pdfDocument = PdfDocument.Open(file))
{
sw.Stop();
openMicros = sw.Elapsed.Microseconds;
sw.Start();
foreach (var page in pdfDocument.GetPages())
{
numPages++;
@ -63,13 +204,36 @@ namespace UglyToad.PdfPig.ConsoleRunner
}
}
}
sw.Stop();
totalPageMicros = sw.Elapsed.Microseconds;
}
var filename = Path.GetFileName(file);
var size = new FileInfo(file);
Console.WriteLine($"{GetCleanFilename(filename)}| {size.Length}\t| {numWords}\t| {numPages}");
var item = new DataRecord
{
FileName = filename,
OpenCostMicros = openMicros,
Pages = numPages,
Size = size.Length,
Words = numWords,
TotalCostMicros = totalPageMicros + openMicros,
PerPageMicros = Math.Round(totalPageMicros / (double)Math.Max(numPages, 1), 2)
};
dataList.Add(item);
PrintTableColumns(
item.FileName,
item.Size,
item.Words,
item.Pages,
item.OpenCostMicros,
item.TotalCostMicros,
item.PerPageMicros);
}
catch (Exception ex)
{
@ -88,12 +252,71 @@ namespace UglyToad.PdfPig.ConsoleRunner
return 5;
}
if (outputOpt != null && outputOpt.Value != null)
{
WriteOutput(outputOpt.Value, dataList);
}
Console.WriteLine("Complete! :)");
return 0;
}
private static string GetCleanFilename(string name, int maxLength = 30)
private static void WriteOutput(string outPath, IReadOnlyList<DataRecord> records)
{
using var fs = File.OpenWrite(outPath);
using var sw = new StreamWriter(fs);
sw.WriteLine("File,Size,Words,Pages,Open Cost,Total Cost,Per Page");
foreach (var record in records)
{
var sizeStr = record.Size.ToString("D", CultureInfo.InvariantCulture);
var wordsStr = record.Words.ToString("D", CultureInfo.InvariantCulture);
var pagesStr = record.Pages.ToString("D", CultureInfo.InvariantCulture);
var openCostStr = record.OpenCostMicros.ToString("D", CultureInfo.InvariantCulture);
var totalCostStr = record.TotalCostMicros.ToString("D", CultureInfo.InvariantCulture);
var ppcStr = record.PerPageMicros.ToString("F2", CultureInfo.InvariantCulture);
var numericPartsStr = string.Join(",",
[
sizeStr,
wordsStr,
pagesStr,
openCostStr,
totalCostStr,
ppcStr
]);
sw.WriteLine($"\"{record.FileName}\",{numericPartsStr}");
}
sw.Flush();
}
private static void PrintTableColumns(params object[] values)
{
for (var i = 0; i < values.Length; i++)
{
var value = values[i];
var valueStr = value.ToString();
var cleaned = GetCleanStr(valueStr ?? string.Empty);
var padChars = 16 - cleaned.Length;
var padding = padChars > 0 ? new string(' ', padChars) : string.Empty;
var padded = cleaned + padding;
Console.Write("| ");
Console.Write(padded);
}
Console.WriteLine();
}
private static string GetCleanStr(string name, int maxLength = 16)
{
if (name.Length <= maxLength)
{
@ -105,4 +328,21 @@ namespace UglyToad.PdfPig.ConsoleRunner
return name.Substring(0, maxLength);
}
}
internal class DataRecord
{
public required string FileName { get; init; }
public required long Size { get; init; }
public required int Words { get; init; }
public required int Pages { get; init; }
public required long OpenCostMicros { get; init; }
public required long TotalCostMicros { get; init; }
public required double PerPageMicros { get; init; }
}
}

View File

@ -1,8 +1,8 @@
{
"profiles": {
"UglyToad.PdfPig.ConsoleRunner": {
"commandName": "Project",
"commandLineArgs": "\"C:\\temp\\pdfs\\archive\""
"commandName": "Project",
"commandLineArgs": "\"C:\\temp\\pdfs\\archive\""
}
}
}

View File

@ -4,6 +4,7 @@
<LangVersion>latest</LangVersion>
<OutputType>Exe</OutputType>
<TargetFramework>net8</TargetFramework>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>

View File

@ -1,4 +1,4 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netstandard2.0;net462;net471;net6.0;net8.0</TargetFrameworks>
<PackageId>PdfPig</PackageId>
@ -11,9 +11,9 @@
<PackageTags>PDF;Reader;Document;Adobe;PDFBox;PdfPig;pdf-extract;pdf-to-text;pdf;file;text;C#;dotnet;.NET</PackageTags>
<RepositoryUrl>https://github.com/UglyToad/PdfPig</RepositoryUrl>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<Version>0.1.11</Version>
<AssemblyVersion>0.1.11.0</AssemblyVersion>
<FileVersion>0.1.11.0</FileVersion>
<Version>0.1.12-alpha001</Version>
<AssemblyVersion>0.1.12.0</AssemblyVersion>
<FileVersion>0.1.12.0</FileVersion>
<PackageIconUrl>https://raw.githubusercontent.com/UglyToad/PdfPig/master/documentation/pdfpig.png</PackageIconUrl>
<PackageIcon>pdfpig.png</PackageIcon>
<Product>PdfPig</Product>

tools/global.json (new file, 5 lines added)
View File

@ -0,0 +1,5 @@
{
"sdk": {
"version": "8.0.*"
}
}