Additional digital corpora testing (#1261)
Some checks are pending
Build, test and publish draft / build (push) Waiting to run
Build and test [MacOS] / build (push) Waiting to run
Run Common Crawl Tests / build (0000-0001) (push) Waiting to run
Run Common Crawl Tests / build (0002-0003) (push) Waiting to run
Run Common Crawl Tests / build (0004-0005) (push) Waiting to run
Run Common Crawl Tests / build (0006-0007) (push) Waiting to run
Run Common Crawl Tests / build (0008-0009) (push) Waiting to run
Run Common Crawl Tests / build (0010-0011) (push) Waiting to run
Run Common Crawl Tests / build (0012-0013) (push) Waiting to run
Run Integration Tests / build (push) Waiting to run
Tag Release / tag_if_version_changed (push) Waiting to run

* add additional testing pair 0010-0011 for integration

Some of these files required the skip-missing-fonts flag to be set to true.

- propagate the use-lenient-parsing flag to dictionaries inside arrays; this
handles a corrupt file (0012710) that is not in this test set

* add pair 0012-0013
This commit is contained in:
Eliot Jones
2026-02-22 17:01:03 -04:00
committed by GitHub
parent 34db05f735
commit a4047247a8
6 changed files with 36 additions and 7 deletions

View File

@@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009"]
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009", "0010-0011", "0012-0013"]
steps:
- uses: actions/checkout@v2

View File

@@ -6,7 +6,7 @@
public class ArrayTokenizerTests
{
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256));
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256), false);
[Theory]
[InlineData("]")]

View File

@@ -9,13 +9,15 @@
{
private readonly bool usePdfDocEncoding;
private readonly StackDepthGuard stackDepthGuard;
private readonly bool useLenientParsing;
public bool ReadsNextByte { get; } = false;
public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard)
public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard, bool useLenientParsing)
{
this.usePdfDocEncoding = usePdfDocEncoding;
this.stackDepthGuard = stackDepthGuard;
this.useLenientParsing = useLenientParsing;
}
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
@@ -27,7 +29,7 @@
return false;
}
var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array);
var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array, useLenientParsing: useLenientParsing);
var contents = new List<IToken>();

View File

@@ -70,7 +70,7 @@
this.usePdfDocEncoding = usePdfDocEncoding;
this.stackDepthGuard = stackDepthGuard;
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard);
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing);
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing: useLenientParsing);
this.scope = scope;
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;

View File

@@ -185,7 +185,11 @@ namespace UglyToad.PdfPig.ConsoleRunner
sw.Reset();
sw.Start();
using (var pdfDocument = PdfDocument.Open(file))
using (var pdfDocument = PdfDocument.Open(file, new ParsingOptions
{
UseLenientParsing = true,
SkipMissingFonts = true,
}))
{
sw.Stop();

View File

@@ -46,4 +46,27 @@
0009309.pdf
0009464.pdf
0009706.pdf
0009944.pdf
0009944.pdf
0010114.pdf
0010117.pdf
0010216.pdf
0010472.pdf
0010697.pdf
0010902.pdf
0010950.pdf
0011041.pdf
0011171.pdf
0011398.pdf
0011450.pdf
0011758.pdf
0011989.pdf
0012117.pdf
0012684.pdf
0012730.pdf
0013051.pdf
0013178.pdf
0013338.pdf
0013425.pdf
0013587.pdf
0013721.pdf
0013822.pdf