mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-18 18:27:55 +08:00

* Read last line of ignore file
  - Do not cancel other matrix jobs if one test fails.
  - Read all lines of the ignore list even if it doesn't end with a newline.
  - Add ignore list for 0008 and 0009.
* Support missing object numbers when brute-forcing the file
  10404 (ironically) contains not-found references with number 43 0 for its info dictionary. Changes the brute-force code so that objects can be entirely missing.
* Fix test, since the document is now opened successfully but its mediabox is broken.
59 lines
1.7 KiB
YAML
59 lines
1.7 KiB
YAML
# Runs the PdfPig console runner against slices of the Common Crawl PDF corpus.
# Each matrix job handles one pair of corpus zip archives: it restores them
# from the Actions cache (or downloads them on a miss), deletes the files named
# in tools/common-crawl-ignore.txt, and then runs the console runner over what
# is left.
name: Run Common Crawl Tests

on:
  push:
    branches: [master]
  workflow_dispatch:
  workflow_call:

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      # Let the remaining matrix jobs finish even if one of them fails.
      fail-fast: false
      matrix:
        # Each entry is "<first>-<second>": the names (without ".zip") of the
        # two archives that the download step fetches for this job.
        pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007"]

    steps:
      - uses: actions/checkout@v4

      - name: Set up dotnet core
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: |
            8.0.x
            9.0.x

      - name: Restore corpus cache
        id: restore-corpus
        uses: actions/cache@v4
        with:
          # The whole corpus/ directory is cached, keyed per OS and per pair.
          path: corpus/
          key: ${{ runner.os }}-pdf-corpus-${{ matrix.pair }}

      - name: Download corpus if cache missed
        if: steps.restore-corpus.outputs.cache-hit != 'true'
        run: |
          base_url="https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999"
          mkdir -p corpus/zipfiles
          cd corpus/zipfiles
          # Split the pair on '-' to get the two archive names.
          for file in $(echo "${{ matrix.pair }}" | tr '-' ' '); do
            echo "Downloading $file.zip"
            wget -nv "$base_url/$file.zip" -O "$file.zip"
          done
          cd ..
          # The glob is quoted so that unzip, not the shell, expands it.
          unzip 'zipfiles/*.zip' -d extracted
          # Cleanup of the downloaded archives is currently disabled, so they
          # stay under corpus/ (the cached path):
          # rm -f zipfiles/*.zip

      - name: Remove unwanted test files
        run: |
          # IFS= and -r keep each entry verbatim (no whitespace trimming, no
          # backslash processing); the `|| [ -n "$f" ]` guard still processes
          # a final line that has no trailing newline.
          while IFS= read -r f || [ -n "$f" ]; do
            full="corpus/extracted/$f"
            if [ -f "$full" ]; then
              echo "Removing $full"
              rm "$full"
            fi
          done < tools/common-crawl-ignore.txt

      - name: Run tests against corpus
        run: dotnet run --project tools/UglyToad.PdfPig.ConsoleRunner/UglyToad.PdfPig.ConsoleRunner.csproj "corpus/extracted"