Support not finding the Pages dictionary in lenient mode (#897)

* Support not finding the Pages dictionary in lenient mode and support Kids object not referencing a page object in lenient mode

---------

Co-authored-by: Arnaud TAMAILLON <arnaud.tamaillon@younited-credit.fr>
This commit is contained in:
Arnaud TAMAILLON 2024-09-01 16:09:48 +02:00 committed by GitHub
parent 1bfd6dedb4
commit cf45dcf6ad
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 124 additions and 3 deletions

View File

@ -1,6 +1,7 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using Annotations;
using PdfPig.Core;
public class CatGeneticsTests
{
@ -39,5 +40,39 @@
}
}
}
[Fact]
public void CanSupportPageInformationNotFoundInLenientMode()
{
    // Fixture whose catalog /Pages entry cannot be resolved to a dictionary.
    var pdfPath = IntegrationHelpers.GetSpecificTestDocumentPath("pages-indirect-to-null.pdf");

    // Lenient parsing on (default options): the document opens and a single page is exposed.
    using (var lenientDocument = PdfDocument.Open(pdfPath))
    {
        Assert.Equal(1, lenientDocument.NumberOfPages);

        var firstPage = lenientDocument.GetPage(1);
        Assert.NotNull(firstPage);
    }

    // Lenient parsing off: opening the same file must fail with a format exception.
    var strictFailure = Assert.Throws<PdfDocumentFormatException>(
        () => PdfDocument.Open(pdfPath, ParsingOptions.LenientParsingOff));
    Assert.Equal("Pages entry is null", strictFailure.Message);
}
[Fact]
public void CanSupportPageKidsObjectNotBeingAPage()
{
    // Fixture whose /Kids array references an object that is not a page dictionary.
    var pdfPath = IntegrationHelpers.GetSpecificTestDocumentPath("pages-kids-not-page.pdf");

    // Default options: the document opens and a single page is exposed.
    using (var lenientDocument = PdfDocument.Open(pdfPath))
    {
        Assert.Equal(1, lenientDocument.NumberOfPages);

        var firstPage = lenientDocument.GetPage(1);
        Assert.NotNull(firstPage);
    }

    // Lenient parsing off: opening the same file must fail, naming the offending reference.
    var strictFailure = Assert.Throws<PdfDocumentFormatException>(
        () => PdfDocument.Open(pdfPath, ParsingOptions.LenientParsingOff));
    Assert.Equal(
        "Could not find dictionary associated with reference in pages kids array: 3 0.",
        strictFailure.Message);
}
}
}

View File

@ -0,0 +1,35 @@
%PDF-1.0
%µ¶
1 0 obj
<</Type/Catalog/Pages 4 0 R>>
endobj
2 0 obj
<</Kids[3 0 R]/Count 1/Type/Pages/MediaBox[0 0 595 792]>>
endobj
3 0 obj
<</Length 58>>
stream
q
BT
/ 96 Tf
1 0 0 1 36 684 Tm
(Hello World!) Tj
ET
Q
endstream
endobj
xref
0 4
0000000000 65536 f
0000000019 00000 n
0000000069 00000 n
0000000147 00000 n
trailer
<</Size 6/Root 1 0 R>>
startxref
262
%%EOF

View File

@ -0,0 +1,35 @@
%PDF-1.0
%µ¶
1 0 obj
<</Type/Catalog/Pages 2 0 R>>
endobj
2 0 obj
<</Kids[3 0 R]/Count 1/Type/Pages/MediaBox[0 0 595 792]>>
endobj
3 0 obj
<</Length 58>>
stream
q
BT
/ 96 Tf
1 0 0 1 36 684 Tm
(Hello World!) Tj
ET
Q
endstream
endobj
xref
0 4
0000000000 65536 f
0000000019 00000 n
0000000069 00000 n
0000000147 00000 n
trailer
<</Size 6/Root 1 0 R>>
startxref
262
%%EOF

View File

@ -143,15 +143,19 @@
foreach (var kid in kids.Data)
{
DictionaryToken? kidDictionaryToken = null;
if (!(kid is IndirectReferenceToken kidRef))
{
throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}.");
}
if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken? kidDictionaryToken))
if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out kidDictionaryToken))
{
throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}.");
if (!isLenientParsing)
{
throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}.");
}
}
kidDictionaryToken ??= new DictionaryToken(new Dictionary<NameToken, IToken>());
bool isChildPage = CheckIfIsPage(kidDictionaryToken, current.reference, false, pdfTokenScanner, isLenientParsing);

View File

@ -46,6 +46,18 @@
pagesDictionary = DirectObjectFinder.Get<DictionaryToken>(value, scanner);
}
if (pagesDictionary == null)
{
if (isLenientParsing)
{
pagesDictionary = new DictionaryToken(new Dictionary<NameToken, IToken>());
}
else
{
throw new PdfDocumentFormatException($"Pages entry is null");
}
}
var pages = PagesFactory.Create(pagesReference, pagesDictionary, scanner, pageFactory, log, isLenientParsing);
var namedDestinations = NamedDestinationsProvider.Read(dictionary, scanner, pages, null);