Additional digital corpora testing (#1261)
Some checks are pending
Build, test and publish draft / build (push) Waiting to run
Build and test [MacOS] / build (push) Waiting to run
Run Common Crawl Tests / build (0000-0001) (push) Waiting to run
Run Common Crawl Tests / build (0002-0003) (push) Waiting to run
Run Common Crawl Tests / build (0004-0005) (push) Waiting to run
Run Common Crawl Tests / build (0006-0007) (push) Waiting to run
Run Common Crawl Tests / build (0008-0009) (push) Waiting to run
Run Common Crawl Tests / build (0010-0011) (push) Waiting to run
Run Common Crawl Tests / build (0012-0013) (push) Waiting to run
Run Integration Tests / build (push) Waiting to run
Tag Release / tag_if_version_changed (push) Waiting to run

* add additional testing pair 0010-0011 for integration

Some of these files required the skip-missing-fonts flag to be set to true.

- propagate the use-lenient-parsing flag to dictionaries inside arrays; this
handles a corrupt file (0012710) that is not in this test set

* add pair 0012-0013
This commit is contained in:
Eliot Jones
2026-02-22 17:01:03 -04:00
committed by GitHub
parent 34db05f735
commit a4047247a8
6 changed files with 36 additions and 7 deletions

View File

@@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009"]
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009", "0010-0011", "0012-0013"]
steps:
- uses: actions/checkout@v2

View File

@@ -6,7 +6,7 @@
public class ArrayTokenizerTests
{
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256));
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256), false);
[Theory]
[InlineData("]")]

View File

@@ -9,13 +9,15 @@
{
private readonly bool usePdfDocEncoding;
private readonly StackDepthGuard stackDepthGuard;
private readonly bool useLenientParsing;
public bool ReadsNextByte { get; } = false;
public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard)
public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard, bool useLenientParsing)
{
this.usePdfDocEncoding = usePdfDocEncoding;
this.stackDepthGuard = stackDepthGuard;
this.useLenientParsing = useLenientParsing;
}
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
@@ -27,7 +29,7 @@
return false;
}
var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array);
var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array, useLenientParsing: useLenientParsing);
var contents = new List<IToken>();

View File

@@ -70,7 +70,7 @@
this.usePdfDocEncoding = usePdfDocEncoding;
this.stackDepthGuard = stackDepthGuard;
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard);
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing);
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing: useLenientParsing);
this.scope = scope;
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;

View File

@@ -185,7 +185,11 @@ namespace UglyToad.PdfPig.ConsoleRunner
sw.Reset();
sw.Start();
using (var pdfDocument = PdfDocument.Open(file))
using (var pdfDocument = PdfDocument.Open(file, new ParsingOptions
{
UseLenientParsing = true,
SkipMissingFonts = true,
}))
{
sw.Stop();

View File

@@ -46,4 +46,27 @@
0009309.pdf
0009464.pdf
0009706.pdf
0009944.pdf
0009944.pdf
0010114.pdf
0010117.pdf
0010216.pdf
0010472.pdf
0010697.pdf
0010902.pdf
0010950.pdf
0011041.pdf
0011171.pdf
0011398.pdf
0011450.pdf
0011758.pdf
0011989.pdf
0012117.pdf
0012684.pdf
0012730.pdf
0013051.pdf
0013178.pdf
0013338.pdf
0013425.pdf
0013587.pdf
0013721.pdf
0013822.pdf