mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-02-24 20:26:06 +08:00
Additional digital corpora testing (#1261)
Some checks are pending
Build, test and publish draft / build (push) Waiting to run
Build and test [MacOS] / build (push) Waiting to run
Run Common Crawl Tests / build (0000-0001) (push) Waiting to run
Run Common Crawl Tests / build (0002-0003) (push) Waiting to run
Run Common Crawl Tests / build (0004-0005) (push) Waiting to run
Run Common Crawl Tests / build (0006-0007) (push) Waiting to run
Run Common Crawl Tests / build (0008-0009) (push) Waiting to run
Run Common Crawl Tests / build (0010-0011) (push) Waiting to run
Run Common Crawl Tests / build (0012-0013) (push) Waiting to run
Run Integration Tests / build (push) Waiting to run
Tag Release / tag_if_version_changed (push) Waiting to run
Some checks are pending
Build, test and publish draft / build (push) Waiting to run
Build and test [MacOS] / build (push) Waiting to run
Run Common Crawl Tests / build (0000-0001) (push) Waiting to run
Run Common Crawl Tests / build (0002-0003) (push) Waiting to run
Run Common Crawl Tests / build (0004-0005) (push) Waiting to run
Run Common Crawl Tests / build (0006-0007) (push) Waiting to run
Run Common Crawl Tests / build (0008-0009) (push) Waiting to run
Run Common Crawl Tests / build (0010-0011) (push) Waiting to run
Run Common Crawl Tests / build (0012-0013) (push) Waiting to run
Run Integration Tests / build (push) Waiting to run
Tag Release / tag_if_version_changed (push) Waiting to run
* Add additional testing pairs 0010-0011 for integration. Some of these files required the skip-missing-fonts flag to be set to true.
  - Propagate the lenient-parsing flag to dictionaries inside arrays; this handles a corrupt file (0012710) that is not in this test set.
* Add pair 0012-0013.
This commit is contained in:
2
.github/workflows/run_common_crawl_tests.yml
vendored
2
.github/workflows/run_common_crawl_tests.yml
vendored
@@ -12,7 +12,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009"]
|
||||
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009", "0010-0011", "0012-0013"]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
public class ArrayTokenizerTests
|
||||
{
|
||||
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256));
|
||||
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256), false);
|
||||
|
||||
[Theory]
|
||||
[InlineData("]")]
|
||||
|
||||
@@ -9,13 +9,15 @@
|
||||
{
|
||||
private readonly bool usePdfDocEncoding;
|
||||
private readonly StackDepthGuard stackDepthGuard;
|
||||
private readonly bool useLenientParsing;
|
||||
|
||||
public bool ReadsNextByte { get; } = false;
|
||||
|
||||
public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard)
|
||||
public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard, bool useLenientParsing)
|
||||
{
|
||||
this.usePdfDocEncoding = usePdfDocEncoding;
|
||||
this.stackDepthGuard = stackDepthGuard;
|
||||
this.useLenientParsing = useLenientParsing;
|
||||
}
|
||||
|
||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
||||
@@ -27,7 +29,7 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array);
|
||||
var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array, useLenientParsing: useLenientParsing);
|
||||
|
||||
var contents = new List<IToken>();
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@
|
||||
this.usePdfDocEncoding = usePdfDocEncoding;
|
||||
this.stackDepthGuard = stackDepthGuard;
|
||||
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
|
||||
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard);
|
||||
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing);
|
||||
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing: useLenientParsing);
|
||||
this.scope = scope;
|
||||
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;
|
||||
|
||||
@@ -185,7 +185,11 @@ namespace UglyToad.PdfPig.ConsoleRunner
|
||||
sw.Reset();
|
||||
sw.Start();
|
||||
|
||||
using (var pdfDocument = PdfDocument.Open(file))
|
||||
using (var pdfDocument = PdfDocument.Open(file, new ParsingOptions
|
||||
{
|
||||
UseLenientParsing = true,
|
||||
SkipMissingFonts = true,
|
||||
}))
|
||||
{
|
||||
sw.Stop();
|
||||
|
||||
|
||||
@@ -46,4 +46,27 @@
|
||||
0009309.pdf
|
||||
0009464.pdf
|
||||
0009706.pdf
|
||||
0009944.pdf
|
||||
0009944.pdf
|
||||
0010114.pdf
|
||||
0010117.pdf
|
||||
0010216.pdf
|
||||
0010472.pdf
|
||||
0010697.pdf
|
||||
0010902.pdf
|
||||
0010950.pdf
|
||||
0011041.pdf
|
||||
0011171.pdf
|
||||
0011398.pdf
|
||||
0011450.pdf
|
||||
0011758.pdf
|
||||
0011989.pdf
|
||||
0012117.pdf
|
||||
0012684.pdf
|
||||
0012730.pdf
|
||||
0013051.pdf
|
||||
0013178.pdf
|
||||
0013338.pdf
|
||||
0013425.pdf
|
||||
0013587.pdf
|
||||
0013721.pdf
|
||||
0013822.pdf
|
||||
Reference in New Issue
Block a user