diff --git a/.github/workflows/run_common_crawl_tests.yml b/.github/workflows/run_common_crawl_tests.yml
index 772ce05b..5e7ccdc1 100644
--- a/.github/workflows/run_common_crawl_tests.yml
+++ b/.github/workflows/run_common_crawl_tests.yml
@@ -9,6 +9,10 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007"]
+
     steps:
       - uses: actions/checkout@v2
 
@@ -19,42 +23,37 @@ jobs:
             8.0.x
             9.0.x
 
-      - name: Restore corpus cache 0000, 0001
+      - name: Restore corpus cache
         id: restore-corpus
         uses: actions/cache@v4
         with:
           path: corpus/
-          key: ${{ runner.os }}-pdf-corpus-0000-0001
+          key: ${{ runner.os }}-pdf-corpus-${{ matrix.pair }}
 
-      - name: Download corpus if cache missed 0000, 0001
+      - name: Download corpus if cache missed
         if: steps.restore-corpus.outputs.cache-hit != 'true'
         run: |
           mkdir -p corpus/zipfiles
           cd corpus/zipfiles
-          echo "Downloading 0000.zip"
-          wget -nv https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/0000.zip -O 0000.zip
-          wget -nv https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/0001.zip -O 0001.zip
+          for file in $(echo "${{ matrix.pair }}" | tr '-' ' '); do
+            echo "Downloading $file.zip"
+            wget -nv "https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/$file.zip" -O "$file.zip"
+          done
           cd ..
           unzip 'zipfiles/*.zip' -d extracted
           # run: rm -f zipfiles/*.zip
 
       - name: Remove unwanted test files
         run: |
-          skip_files=(
-            "corpus/extracted/0000399.pdf"
-            "corpus/extracted/0000819.pdf"
-            "corpus/extracted/0000920.pdf"
-            "corpus/extracted/0000300.pdf"
-            "corpus/extracted/0001589.pdf"
-            "corpus/extracted/0001957.pdf"
-          )
-
-          for file in "${skip_files[@]}"; do
-            if [ -f "$file" ]; then
-              echo "Removing $file"
-              rm "$file"
+          # One file name per line. `|| [ -n "$f" ]` still processes the final
+          # entry when the list does not end with a newline.
+          while IFS= read -r f || [ -n "$f" ]; do
+            full="corpus/extracted/$f"
+            if [ -f "$full" ]; then
+              echo "Removing $full"
+              rm "$full"
             fi
-          done
+          done < tools/common-crawl-ignore.txt
 
       - name: Run tests against corpus
         run: dotnet run --project tools/UglyToad.PdfPig.ConsoleRunner/UglyToad.PdfPig.ConsoleRunner.csproj "corpus/extracted"
diff --git a/tools/common-crawl-ignore.txt b/tools/common-crawl-ignore.txt
new file mode 100644
index 00000000..dc58bfbd
--- /dev/null
+++ b/tools/common-crawl-ignore.txt
@@ -0,0 +1,40 @@
+0000399.pdf
+0000819.pdf
+0000920.pdf
+0000300.pdf
+0001589.pdf
+0001957.pdf
+0002064.pdf
+0002187.pdf
+0002229.pdf
+0002244.pdf
+0002372.pdf
+0002554.pdf
+0002638.pdf
+0002883.pdf
+0002897.pdf
+0002966.pdf
+0003147.pdf
+0003269.pdf
+0003304.pdf
+0003892.pdf
+0003927.pdf
+0003983.pdf
+0004099.pdf
+0004537.pdf
+0004569.pdf
+0004791.pdf
+0004853.pdf
+0004889.pdf
+0004997.pdf
+0005482.pdf
+0005637.pdf
+0006036.pdf
+0006169.pdf
+0006207.pdf
+0006262.pdf
+0006339.pdf
+0006766.pdf
+0006844.pdf
+0007159.pdf
+0007559.pdf