allow missing catalog type definition for catalog dictionary
Some checks failed
Build and test / build (push) Has been cancelled
Build and test [MacOS] / build (push) Has been cancelled
Run Common Crawl Tests / build (push) Has been cancelled
Run Integration Tests / build (push) Has been cancelled
Nightly Release / tests (push) Has been cancelled
Nightly Release / Check latest commit (push) Has been cancelled
Nightly Release / build_and_publish_nightly (push) Has been cancelled

As long as there is a pages entry, we accept this in lenient parsing mode. This
is to fix document 006705.pdf in the corpus, which had '/calalog' as the dictionary
entry.

Also adds a test for some weird content stream content in 0006324.pdf, where
numbers seem to get split in the content stream on a decimal place. This is
just to check that our parser doesn't hard crash.
This commit is contained in:
EliotJones 2025-07-26 16:55:20 -05:00 committed by BobLd
parent febfa4d4b3
commit 83d6fc6cc2
2 changed files with 43 additions and 1 deletions

View File

@ -261,6 +261,47 @@ endobj";
Assert.Equal(3, tokens.OfType<DictionaryToken>().Count());
}
/// <summary>
/// Smoke test for malformed content-stream data in which numeric operands
/// appear to have been split around the decimal point (for example
/// <c>-0. .61</c> and <c>572. .836</c>). The scanner must get through the
/// whole input without throwing.
/// </summary>
[Fact]
public void Document006324Test()
{
    // The stray fragments such as "-0. .61" and "572. .836" are deliberate:
    // they are the malformed input this test exists to exercise.
    const string content =
        """
        q
        1 0 0 1 248.6304 572.546 cm
        0 0 m
        0.021 -0.007 l
        3 -0.003 -0.01 0 0 0 c
        f
        Q
        q
        1 0 0 1 2489394 57249855 cm
        0 0 m
        -0.046 -0.001 -0.609 0.029 -0.286 -0.014 c
        -02.61 -0.067 -0.286 -0. .61 -0 0 c
        f
        Q
        q
        1 0 0 1 24862464 572. .836 cm
        0 0 m
        0.936 -0.029 l
        0.038 -0.021 0.55 -0.014 0 0 c
        f
        Q
        """;

    var tokens = new List<IToken>();

    var scanner = new CoreTokenScanner(
        StringBytesTestConverter.Convert(content, false).Bytes,
        true,
        isStream: true);

    // Drain the scanner; an exception here fails the test.
    while (scanner.MoveNext())
    {
        tokens.Add(scanner.CurrentToken);
    }

    // Guard against the scanner silently yielding nothing: the exact token
    // sequence for the malformed numbers is not pinned, but well-formed
    // input such as the leading "q" must still produce tokens.
    Assert.NotEmpty(tokens);
}
private static void AssertCorrectToken<T, TData>(IToken token, TData expected) where T : IDataToken<TData>
{
var cast = Assert.IsType<T>(token);

View File

@ -19,7 +19,8 @@
throw new ArgumentNullException(nameof(dictionary));
}
if (dictionary.TryGet(NameToken.Type, out var type) && !ReferenceEquals(type, NameToken.Catalog))
if (dictionary.TryGet(NameToken.Type, out var type) && !ReferenceEquals(type, NameToken.Catalog)
&& !isLenientParsing)
{
throw new PdfDocumentFormatException($"The type of the catalog dictionary was not Catalog: {dictionary}.");
}