Use file header offset when doing brute force find and fix #1223
Some checks failed
Build, test and publish draft / build (push) Has been cancelled
Build and test [MacOS] / build (push) Has been cancelled
Run Common Crawl Tests / build (0000-0001) (push) Has been cancelled
Run Common Crawl Tests / build (0002-0003) (push) Has been cancelled
Run Common Crawl Tests / build (0004-0005) (push) Has been cancelled
Run Common Crawl Tests / build (0006-0007) (push) Has been cancelled
Run Integration Tests / build (push) Has been cancelled
Nightly Release / Check if this commit has already been published (push) Has been cancelled
Nightly Release / tests (push) Has been cancelled
Nightly Release / build_and_publish_nightly (push) Has been cancelled

This commit is contained in:
BobLd
2025-12-07 13:28:32 +00:00
parent c70b343caa
commit ee0cb1dc4a
5 changed files with 30 additions and 6 deletions

View File

@@ -11,6 +11,19 @@
public class GithubIssuesTests
{
// Regression test for GitHub issue #1223: the sample document must open with
// lenient parsing enabled and its first page must expose the expected text.
[Fact]
public void Issue1223()
{
    var pdfPath = IntegrationHelpers.GetSpecificTestDocumentPath("23056.PMC2132516.pdf");
    var lenientOptions = new ParsingOptions() { UseLenientParsing = true };

    using (var pdf = PdfDocument.Open(pdfPath, lenientOptions))
    {
        // The document itself has to be readable before any page is requested.
        Assert.NotNull(pdf);

        // Page numbering passed to GetPage starts at 1 here, matching the original call.
        var page = pdf.GetPage(1);
        Assert.NotNull(page);

        // Text extraction from the first page must contain the publisher line.
        var pageText = page.Text;
        Assert.Contains("The Rockefeller University Press", pageText);
    }
}
[Fact]
public void Issue1213()
{

View File

@@ -3,6 +3,7 @@
using System.Text;
using PdfPig.Core;
using PdfPig.Encryption;
using PdfPig.Parser.FileStructure;
using PdfPig.Tokenization.Scanner;
using PdfPig.Tokens;
@@ -720,8 +721,12 @@ endobj";
{
var input = StringBytesTestConverter.Convert(s, false);
return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
new TestFilterProvider(), NoOpEncryptionHandler.Instance, useLenientParsing ? new ParsingOptions() : ParsingOptions.LenientParsingOff);
return new PdfTokenScanner(input.Bytes,
locationProvider ?? new TestObjectLocationProvider(),
new TestFilterProvider(),
NoOpEncryptionHandler.Instance,
new FileHeaderOffset(0),
useLenientParsing ? new ParsingOptions() : ParsingOptions.LenientParsingOff);
}
private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)

View File

@@ -124,8 +124,10 @@
var version = FileHeaderParser.Parse(scanner, inputBytes, parsingOptions.UseLenientParsing, parsingOptions.Logger);
var fileHeaderOffset = new FileHeaderOffset((int)version.OffsetInFile);
var initialParse = FirstPassParser.Parse(
new FileHeaderOffset((int)version.OffsetInFile),
fileHeaderOffset,
inputBytes,
scanner,
parsingOptions.Logger);
@@ -143,7 +145,7 @@
initialParse.BruteForceOffsets,
inputBytes);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance, parsingOptions);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance, fileHeaderOffset, parsingOptions);
var (rootReference, rootDictionary) = ParseTrailer(
trailer,

View File

@@ -11,6 +11,7 @@
using Encryption;
using Filters;
using Tokens;
using UglyToad.PdfPig.Parser.FileStructure;
internal class PdfTokenScanner : IPdfTokenScanner
{
@@ -23,6 +24,7 @@
private readonly ILookupFilterProvider filterProvider;
private readonly CoreTokenScanner coreTokenScanner;
private readonly ParsingOptions parsingOptions;
private readonly FileHeaderOffset fileHeaderOffset;
private IEncryptionHandler encryptionHandler;
private bool isDisposed;
@@ -54,12 +56,14 @@
IObjectLocationProvider objectLocationProvider,
ILookupFilterProvider filterProvider,
IEncryptionHandler encryptionHandler,
FileHeaderOffset fileHeaderOffset,
ParsingOptions parsingOptions)
{
this.inputBytes = inputBytes;
this.objectLocationProvider = objectLocationProvider;
this.filterProvider = filterProvider;
this.encryptionHandler = encryptionHandler;
this.fileHeaderOffset = fileHeaderOffset;
this.parsingOptions = parsingOptions;
coreTokenScanner = new CoreTokenScanner(inputBytes, true, useLenientParsing: parsingOptions.UseLenientParsing);
}
@@ -795,8 +799,8 @@
{
// Brute force read the entire file
isBruteForcing = true;
Seek(0);
Seek(fileHeaderOffset.Value);
while (MoveNext())
{