diff --git a/.github/workflows/run_common_crawl_tests.yml b/.github/workflows/run_common_crawl_tests.yml index 5e7ccdc1..244dce42 100644 --- a/.github/workflows/run_common_crawl_tests.yml +++ b/.github/workflows/run_common_crawl_tests.yml @@ -10,6 +10,7 @@ jobs: build: runs-on: ubuntu-latest strategy: + fail-fast: false matrix: pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007"] @@ -45,7 +46,7 @@ jobs: - name: Remove unwanted test files run: | - while read f; do + while read f || [ -n "$f" ]; do full="corpus/extracted/$f" if [ -f "$full" ]; then echo "Removing $full" diff --git a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs index 14cd5f5a..0fb47f01 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs @@ -102,8 +102,10 @@ { var path = IntegrationHelpers.GetSpecificTestDocumentPath("Hang.pdf"); - var ex = Assert.Throws(() => PdfDocument.Open(path, new ParsingOptions() { UseLenientParsing = true })); - Assert.StartsWith("Could not locate object with reference:", ex.Message); + using var doc = PdfDocument.Open(path, new ParsingOptions { UseLenientParsing = true }); + + var ex = Assert.Throws(() => doc.GetPage(1)); + Assert.StartsWith("Could not find", ex.Message); } [Fact] diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index 7d4226e6..c5fa7a6a 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -770,7 +770,8 @@ if (!MoveNext()) { - return BruteForceFileToFindReference(reference); + TryBruteForceFileToFindReference(reference, out var bfObjectToken); + return bfObjectToken; } var found = (ObjectToken)CurrentToken!; @@ -780,7 +781,9 @@ return found; } - return BruteForceFileToFindReference(reference); + TryBruteForceFileToFindReference(reference, out var bfToken); + + return bfToken; } public void ReplaceToken(IndirectReference reference, IToken token) @@ -790,8 +793,9 @@ overwrittenTokens[reference] = new ObjectToken(0, reference, token); } - private ObjectToken BruteForceFileToFindReference(IndirectReference reference) + private bool TryBruteForceFileToFindReference(IndirectReference reference, [NotNullWhen(true)] out ObjectToken? result) { + result = null; try { // Brute force read the entire file @@ -806,10 +810,12 @@ if (!objectLocationProvider.TryGetCached(reference, out var objectToken)) { - throw new PdfDocumentFormatException($"Could not locate object with reference: {reference} despite a full document search."); + return false; } - return objectToken; + result = objectToken; + + return true; } finally { diff --git a/tools/common-crawl-ignore.txt b/tools/common-crawl-ignore.txt index dc58bfbd..5d7f95b3 100644 --- a/tools/common-crawl-ignore.txt +++ b/tools/common-crawl-ignore.txt @@ -37,4 +37,13 @@ 0006766.pdf 0006844.pdf 0007159.pdf -0007559.pdf \ No newline at end of file +0007559.pdf +0008404.pdf +0008443.pdf +0008674.pdf +0008978.pdf +0009290.pdf +0009309.pdf +0009464.pdf +0009706.pdf +0009944.pdf \ No newline at end of file