# PdfPig/.github/workflows/run_common_crawl_tests.yml

# Runs the PdfPig console runner against a slice of the Common Crawl PDF corpus.
name: Run Common Crawl Tests

# Triggers: pushes to master, manual dispatch, and reuse from other workflows.
on:
  push:
    branches:
      - master
  workflow_dispatch:
  workflow_call:

jobs:
  # Single job: fetch the corpus (from cache or network) and run the tool on it.
  build:
    runs-on: ubuntu-latest
    steps:
      # v4 runs on a supported Node runtime (v2 targeted the retired Node 12)
      # and matches the major version already used for actions/cache below.
      - uses: actions/checkout@v4

      # Installs both the 8.0.x and 9.0.x SDK lines side by side.
      # v4 of the action replaces v3, which targeted the deprecated Node 16
      # runtime; the multi-line `dotnet-version` input is unchanged.
      - name: Set up dotnet core
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: |
            8.0.x
            9.0.x

      # Restores the corpus/ directory from the Actions cache when available.
      # The key only varies by runner OS, so it changes solely when edited here.
      # The `cache-hit` output gates the download step that follows.
      - id: restore-corpus
        name: Restore corpus cache 0000, 0001
        uses: actions/cache@v4
        with:
          key: '${{ runner.os }}-pdf-corpus-0000-0001'
          path: 'corpus/'

      # Only runs when the cache step above did not restore corpus/.
      # Downloads archives 0000 and 0001, extracts them into corpus/extracted,
      # then drops the archives so they are not stored in the cache as well.
      - name: Download corpus if cache missed 0000, 0001
        if: steps.restore-corpus.outputs.cache-hit != 'true'
        run: |
          base_url="https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999"

          mkdir -p corpus/zipfiles
          cd corpus/zipfiles
          for archive in 0000.zip 0001.zip; do
            echo "Downloading $archive"
            wget -nv "$base_url/$archive" -O "$archive"
          done
          cd ..

          # The glob is quoted so unzip (not the shell) expands it and
          # processes every archive in one invocation.
          unzip 'zipfiles/*.zip' -d extracted

          # The cache path is corpus/, so leftover archives would be cached
          # alongside the extracted files; only corpus/extracted is used later.
          rm -rf zipfiles

      # Deletes a fixed list of corpus documents before the run so the tool
      # never sees them. Files that are already absent are silently skipped.
      - name: Remove unwanted test files
        run: |
          corpus_dir="corpus/extracted"

          # Document ids (file names without the .pdf extension) to exclude.
          excluded_ids=(
            0000399
            0000819
            0000920
            0000300
            0001589
            0001957
          )

          for doc_id in "${excluded_ids[@]}"; do
            target="$corpus_dir/$doc_id.pdf"
            # Nothing to do when the file is not present.
            [ -f "$target" ] || continue
            echo "Removing $target"
            rm "$target"
          done

      # Runs the ConsoleRunner project with the extracted corpus directory as
      # its single argument. The folded scalar joins the lines with spaces,
      # producing one command line.
      - name: Run tests against corpus
        run: >-
          dotnet run
          --project tools/UglyToad.PdfPig.ConsoleRunner/UglyToad.PdfPig.ConsoleRunner.csproj
          "corpus/extracted"