begin to rework cross-reference parsing

Most of the cross-reference code is among the earliest code in the project and hasn't been revisited since it was written. Issue #88 has been reopened due to a bug with brute-force searching, so this commit tidies up the code in this area ahead of trying to fix the bug.
This commit is contained in:
Eliot Jones
2020-03-03 15:21:11 +00:00
parent 4b5c8d510e
commit 58972de7cb
9 changed files with 230 additions and 323 deletions

View File

@@ -36,10 +36,18 @@ startxref
216 216
%%EOF"; %%EOF";
private static readonly long[] TestDataOffsets =
{
TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase)
};
[Fact] [Fact]
public void ReaderNull_Throws() public void ReaderNull_Throws()
{ {
Action action = () => new BruteForceSearcher(null); Action action = () => BruteForceSearcher.GetObjectLocations(null);
Assert.Throws<ArgumentNullException>(action); Assert.Throws<ArgumentNullException>(action);
} }
@@ -50,19 +58,11 @@ startxref
{ {
var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData)); var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
var searcher = new BruteForceSearcher(input); var locations = BruteForceSearcher.GetObjectLocations(input);
var locations = searcher.GetObjectLocations();
Assert.Equal(4, locations.Count); Assert.Equal(4, locations.Count);
Assert.Equal(locations.Values, new long[] Assert.Equal(TestDataOffsets, locations.Values);
{
TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase)
});
} }
[Fact] [Fact]
@@ -70,13 +70,11 @@ startxref
{ {
var reader = StringBytesTestConverter.Convert(TestData, false); var reader = StringBytesTestConverter.Convert(TestData, false);
var searcher = new BruteForceSearcher(reader.Bytes); var locations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
var locations = searcher.GetObjectLocations();
Assert.Equal(4, locations.Count); Assert.Equal(4, locations.Count);
var newLocations = searcher.GetObjectLocations(); var newLocations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
Assert.Equal(4, locations.Count); Assert.Equal(4, locations.Count);
@@ -92,9 +90,8 @@ startxref
using (var fs = File.OpenRead(IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf"))) using (var fs = File.OpenRead(IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf")))
{ {
var bytes = new StreamInputBytes(fs); var bytes = new StreamInputBytes(fs);
var searcher = new BruteForceSearcher(bytes);
var locations = searcher.GetObjectLocations(); var locations = BruteForceSearcher.GetObjectLocations(bytes);
Assert.Equal(13, locations.Count); Assert.Equal(13, locations.Count);
@@ -118,9 +115,7 @@ startxref
{ {
var bytes = new ByteArrayInputBytes(File.ReadAllBytes(IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf"))); var bytes = new ByteArrayInputBytes(File.ReadAllBytes(IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf")));
var searcher = new BruteForceSearcher(bytes); var locations = BruteForceSearcher.GetObjectLocations(bytes);
var locations = searcher.GetObjectLocations();
Assert.Equal(13, locations.Count); Assert.Equal(13, locations.Count);
@@ -142,6 +137,18 @@ startxref
Assert.StartsWith("12 0 obj", s); Assert.StartsWith("12 0 obj", s);
} }
[Fact]
public void BruteForceSearcherCorrectlyFindsAllObjectsWhenOffset()
{
var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
input.Seek(593);
var locations = BruteForceSearcher.GetObjectLocations(input);
Assert.Equal(TestDataOffsets, locations.Values);
}
private static string GetStringAt(IInputBytes bytes, long location) private static string GetStringAt(IInputBytes bytes, long location)
{ {
bytes.Seek(location); bytes.Seek(location);

View File

@@ -0,0 +1,161 @@
namespace UglyToad.PdfPig.Parser.FileStructure
{
using System;
using System.Collections.Generic;
using Core;
using CrossReference;
using Logging;
using Parts;
/// <summary>
/// Verifies that the object offsets recorded in a cross-reference table really point at the
/// objects they claim to, and repairs them from a brute-force search when they do not.
/// </summary>
internal static class CrossReferenceObjectOffsetValidator
{
    // Any offset smaller than this is rejected without looking at the bytes.
    private const long MinimumSearchOffset = 6;

    /// <summary>
    /// Check that the offsets in the cross reference are correct.
    /// </summary>
    /// <param name="bytes">The bytes of the document. The current offset is restored before returning by the helpers used here.</param>
    /// <param name="crossReferenceTable">The table whose <c>ObjectOffsets</c> are to be checked.</param>
    /// <param name="log">Receives an error message when an incorrect offset is detected.</param>
    /// <param name="actualOffsets">
    /// When the method returns <see langword="true"/>, the table's own offsets.
    /// When it returns <see langword="false"/>, the table's offsets with every entry located by the
    /// brute-force search overriding (or adding to) the entry of the same key. If the brute-force
    /// search finds nothing the table's own offsets are returned unchanged.
    /// </param>
    /// <returns>
    /// <see langword="true"/> if every non-negative offset pointed at the expected object (or the table
    /// had no offsets); otherwise <see langword="false"/>.
    /// </returns>
    public static bool ValidateCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable crossReferenceTable, ILog log,
        out IReadOnlyDictionary<IndirectReference, long> actualOffsets)
    {
        actualOffsets = crossReferenceTable.ObjectOffsets;

        if (ValidateXrefOffsets(bytes, crossReferenceTable.ObjectOffsets, log))
        {
            return true;
        }

        var bruteForceOffsets = BruteForceSearcher.GetObjectLocations(bytes);

        if (bruteForceOffsets.Count == 0)
        {
            // Nothing could be recovered, so the table's own offsets remain the best information available.
            return false;
        }

        // ValidateXrefOffsets only fails for a non-null map, so ObjectOffsets can be enumerated here.
        var repairedOffsets = new Dictionary<IndirectReference, long>();

        foreach (var entry in crossReferenceTable.ObjectOffsets)
        {
            repairedOffsets[entry.Key] = entry.Value;
        }

        // Offsets found by scanning the document take precedence over what the (incorrect) table said.
        foreach (var item in bruteForceOffsets)
        {
            repairedOffsets[item.Key] = item.Value;
        }

        // TODO: negative offsets denote objects held in object streams. Reconciling the brute-force
        // results against the contents of those streams (dropping entries for objects contained in a
        // stream that was found, and entries belonging to a stream that was not) is not implemented yet.

        actualOffsets = repairedOffsets;

        return false;
    }

    /// <summary>
    /// Checks each non-negative offset against the document bytes, stopping at the first failure.
    /// </summary>
    /// <returns>
    /// <see langword="true"/> when <paramref name="objectOffsets"/> is <see langword="null"/> or every
    /// non-negative offset passes <see cref="CheckObjectKeys"/>; otherwise <see langword="false"/>.
    /// </returns>
    private static bool ValidateXrefOffsets(IInputBytes bytes, IReadOnlyDictionary<IndirectReference, long> objectOffsets, ILog log)
    {
        if (objectOffsets == null)
        {
            return true;
        }

        foreach (var objectEntry in objectOffsets)
        {
            var objectKey = objectEntry.Key;
            var objectOffset = objectEntry.Value;

            // Negative offsets are not byte positions (they are used for objects in object streams), skip them.
            if (objectOffset < 0)
            {
                continue;
            }

            if (!CheckObjectKeys(bytes, objectKey, objectOffset))
            {
                log.Error($"At least one cross-reference offset was incorrect. {objectKey} could not be found at {objectOffset}. " +
                          "Using brute-force search to repair object offsets.");

                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Tests whether the object header for <paramref name="objectKey"/> is present at <paramref name="offset"/>.
    /// The position of <paramref name="bytes"/> is always restored before returning.
    /// </summary>
    private static bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset)
    {
        // there can't be any object at the very beginning of a pdf
        if (offset < MinimumSearchOffset)
        {
            return false;
        }

        var objectNr = objectKey.ObjectNumber;
        long objectGen = objectKey.Generation;
        var originOffset = bytes.CurrentOffset;

        var objectString = ObjectHelper.CreateObjectString(objectNr, objectGen);

        try
        {
            bytes.Seek(offset);

            // Tolerate a single whitespace byte in front of the object header.
            if (ReadHelper.IsWhitespace(bytes.CurrentByte))
            {
                bytes.MoveNext();
            }

            return ReadHelper.IsString(bytes, objectString);
        }
        catch (Exception)
        {
            // Deliberately swallowed: a failure to read here means there is no valid object header at this offset.
            return false;
        }
        finally
        {
            // Single place where the caller's position is restored, on every exit path.
            bytes.Seek(originOffset);
        }
    }
}
}

View File

@@ -4,7 +4,6 @@
using System.Collections.Generic; using System.Collections.Generic;
using Core; using Core;
using CrossReference; using CrossReference;
using Exceptions;
using Logging; using Logging;
using Parts.CrossReference; using Parts.CrossReference;
using Tokenization.Scanner; using Tokenization.Scanner;
@@ -15,16 +14,13 @@
private readonly ILog log; private readonly ILog log;
private readonly XrefOffsetValidator offsetValidator; private readonly XrefOffsetValidator offsetValidator;
private readonly CrossReferenceStreamParser crossReferenceStreamParser; private readonly CrossReferenceStreamParser crossReferenceStreamParser;
private readonly XrefCosOffsetChecker xrefCosChecker;
public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator, public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator,
XrefCosOffsetChecker xrefCosChecker,
CrossReferenceStreamParser crossReferenceStreamParser) CrossReferenceStreamParser crossReferenceStreamParser)
{ {
this.log = log; this.log = log;
this.offsetValidator = offsetValidator; this.offsetValidator = offsetValidator;
this.crossReferenceStreamParser = crossReferenceStreamParser; this.crossReferenceStreamParser = crossReferenceStreamParser;
this.xrefCosChecker = xrefCosChecker;
} }
public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation, public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation,
@@ -214,7 +210,10 @@
var resolved = table.Build(crossReferenceLocation, log); var resolved = table.Build(crossReferenceLocation, log);
// check the offsets of all referenced objects // check the offsets of all referenced objects
xrefCosChecker.CheckCrossReferenceOffsets(bytes, resolved, isLenientParsing); if (!CrossReferenceObjectOffsetValidator.ValidateCrossReferenceOffsets(bytes, resolved, log, out var actualOffsets))
{
resolved = new CrossReferenceTable(resolved.Type, actualOffsets, resolved.Trailer, resolved.CrossReferenceOffsets);
}
return resolved; return resolved;
} }

View File

@@ -1,236 +0,0 @@
namespace UglyToad.PdfPig.Parser.FileStructure
{
using System;
using System.Collections.Generic;
using System.Linq;
using Core;
using CrossReference;
using Logging;
using Parts;
/// <summary>
/// Checks that the object offsets recorded in a cross-reference table point at the objects they
/// name, and runs a brute-force search of the document when one of them does not.
/// </summary>
/// <remarks>
/// NOTE(review): the merged offsets built in <see cref="CheckCrossReferenceOffsets"/> are written to a
/// local copy only and the method returns void, so the repair does not appear to reach the caller.
/// </remarks>
internal class XrefCosOffsetChecker
{
// Offsets below this value are rejected by CheckObjectKeys; also the start position of the end-of-file scan.
private static readonly long MINIMUM_SEARCH_OFFSET = 6;
private readonly ILog log;
private readonly BruteForceSearcher bruteForceSearcher;
// Brute-force search result, filled in on first use by getBFCosObjectOffsets.
private IReadOnlyDictionary<IndirectReference, long> objectKeyOffsets;
/// <summary>
/// Create a checker which logs to <paramref name="log"/> and uses <paramref name="bruteForceSearcher"/> for repairs.
/// </summary>
public XrefCosOffsetChecker(ILog log, BruteForceSearcher bruteForceSearcher)
{
this.log = log;
this.bruteForceSearcher = bruteForceSearcher;
}
/// <summary>
/// Returns true when <paramref name="xrefOffset"/> is null or every non-negative offset passes
/// <see cref="CheckObjectKeys"/>; returns false (after a debug log) at the first offset that fails.
/// </summary>
private bool ValidateXrefOffsets(IInputBytes bytes, Dictionary<IndirectReference, long> xrefOffset)
{
if (xrefOffset == null)
{
return true;
}
foreach (var objectEntry in xrefOffset)
{
IndirectReference objectKey = objectEntry.Key;
long objectOffset = objectEntry.Value;
// a negative offset number represents a object number itself
// see type 2 entry in xref stream
if (objectOffset >= 0 && !CheckObjectKeys(bytes, objectKey, objectOffset))
{
log.Debug($"Stop checking xref offsets as at least one ({objectKey}) couldn't be dereferenced");
return false;
}
}
return true;
}
/// <summary>
/// Tests whether the object string built for <paramref name="objectKey"/> is found at <paramref name="offset"/>.
/// The position of <paramref name="bytes"/> is restored to where it was on entry before returning.
/// </summary>
private bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset)
{
// there can't be any object at the very beginning of a pdf
if (offset < MINIMUM_SEARCH_OFFSET)
{
return false;
}
long objectNr = objectKey.ObjectNumber;
long objectGen = objectKey.Generation;
long originOffset = bytes.CurrentOffset;
string objectString = ObjectHelper.CreateObjectString(objectNr, objectGen);
try
{
bytes.Seek(offset);
// Step over a single whitespace byte if one sits at the recorded offset.
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
{
bytes.MoveNext();
}
if (ReadHelper.IsString(bytes, objectString))
{
// everything is ok, return origin object key
// (this seek is repeated by the finally block below)
bytes.Seek(originOffset);
return true;
}
}
catch (Exception)
{
// Swallow the exception, obviously there isn't any valid object number
}
finally
{
// Runs on every exit path, so the caller's position is always restored.
bytes.Seek(originOffset);
}
// no valid object number found
return false;
}
/// <summary>
/// Returns the brute-force object locations, asking the searcher only while the cached field is still null.
/// </summary>
private IReadOnlyDictionary<IndirectReference, long> getBFCosObjectOffsets()
{
if (objectKeyOffsets == null)
{
var offsets = bruteForceSearcher.GetObjectLocations();
objectKeyOffsets = offsets;
}
return objectKeyOffsets;
}
/// <summary>
/// Check that the offsets in the cross reference are correct.
/// </summary>
/// <param name="bytes">The document bytes used to verify each offset.</param>
/// <param name="xrefTrailerResolver">The cross-reference table whose <c>ObjectOffsets</c> are checked.</param>
/// <param name="isLenientParsing">When false the method returns immediately without checking anything.</param>
public void CheckCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable xrefTrailerResolver, bool isLenientParsing)
{
// repair mode isn't available in non-lenient mode
if (!isLenientParsing)
{
return;
}
// ToDictionary produces a new dictionary; later writes to xrefOffset do not touch the table's own offsets.
Dictionary<IndirectReference, long> xrefOffset = xrefTrailerResolver.ObjectOffsets.ToDictionary(x => x.Key, x => x.Value);
if (ValidateXrefOffsets(bytes, xrefOffset))
{
return;
}
IReadOnlyDictionary<IndirectReference, long> bfCOSObjectKeyOffsets = getBFCosObjectOffsets();
if (bfCOSObjectKeyOffsets.Count > 0)
{
List<IndirectReference> objStreams = new List<IndirectReference>();
// find all object streams
foreach (var entry in xrefOffset)
{
long offset = entry.Value;
if (offset < 0)
{
// The negated offset is used as the object number of the reference, with generation 0.
IndirectReference objStream = new IndirectReference(-offset, 0);
if (!objStreams.Contains(objStream))
{
objStreams.Add(new IndirectReference(-offset, 0));
}
}
}
// remove all found object streams
// NOTE(review): both branches below contain only commented-out code, so this loop currently does nothing.
if (objStreams.Count > 0)
{
foreach (IndirectReference key in objStreams)
{
if (bfCOSObjectKeyOffsets.ContainsKey(key))
{
// remove all parsed objects which are part of an object stream
//ISet<long> objects = xrefTrailerResolver
// .getContainedObjectNumbers((int)(key.Number));
//foreach (long objNr in objects)
//{
// CosObjectKey streamObjectKey = new CosObjectKey(objNr, 0);
// if (bfCOSObjectKeyOffsets.TryGetValue(streamObjectKey, out long streamObjectOffset) && streamObjectOffset > 0)
// {
// bfCOSObjectKeyOffsets.Remove(streamObjectKey);
// }
//}
}
else
{
// remove all objects which are part of an object stream which wasn't found
//ISet<long> objects = xrefTrailerResolver
// .getContainedObjectNumbers((int)(key.Number));
//foreach (long objNr in objects)
//{
// xrefOffset.Remove(new CosObjectKey(objNr, 0));
//}
}
}
}
// Overlay the brute-force offsets onto the local copy.
// NOTE(review): xrefOffset is never returned or stored, so these writes look like they are discarded.
foreach (var item in bfCOSObjectKeyOffsets)
{
xrefOffset[item.Key] = item.Value;
}
}
}
// Cached result of BruteForceSearchForEndOfFileMarker; null until that method has run.
private long? lastEndOfFileMarker;
/// <summary>
/// Scans <paramref name="source"/> from <c>MINIMUM_SEARCH_OFFSET</c> for "%%EOF" markers and stores the chosen
/// offset in <c>lastEndOfFileMarker</c>, or <see cref="long.MaxValue"/> when none is stored by the scan.
/// The position of <paramref name="source"/> is restored afterwards.
/// </summary>
/// <remarks>
/// NOTE(review): no other member of this class calls this private method.
/// </remarks>
private void BruteForceSearchForEndOfFileMarker(IInputBytes source)
{
// Only search once; later calls reuse the stored value.
if (lastEndOfFileMarker != null)
{
return;
}
long startOffset = source.CurrentOffset;
source.Seek(MINIMUM_SEARCH_OFFSET);
while (!source.IsAtEnd())
{
// search for EOF marker
if (ReadHelper.IsString(source, "%%EOF"))
{
long tempMarker = source.CurrentOffset;
// A marker at or beyond the reported length is taken as the last one and ends the scan.
if (tempMarker >= source.Length)
{
lastEndOfFileMarker = tempMarker;
break;
}
try
{
// 5 is the length of "%%EOF"; presumably this moves just past the marker — TODO confirm against ReadHelper.IsString.
source.Seek(tempMarker + 5);
// check if the following data is some valid pdf content
// which most likely indicates that the pdf is linearized,
// updated or just cut off somewhere in the middle
ReadHelper.SkipSpaces(source);
ObjectHelper.ReadObjectNumber(source);
ObjectHelper.ReadGenerationNumber(source);
}
catch (Exception)
{
// save the EOF marker as the following data is most likely some garbage
lastEndOfFileMarker = tempMarker;
}
}
source.MoveNext();
}
source.Seek(startOffset);
// no EOF marker found
if (lastEndOfFileMarker == null)
{
lastEndOfFileMarker = long.MaxValue;
}
}
}
}

View File

@@ -2,20 +2,16 @@
{ {
using System; using System;
using Content; using Content;
using Parts;
/// <summary> /// <summary>
/// For objects which provide document scoped caching. /// For objects which provide document scoped caching.
/// </summary> /// </summary>
internal class ParsingCachingProviders internal class ParsingCachingProviders
{ {
public BruteForceSearcher BruteForceSearcher { get; }
public IResourceStore ResourceContainer { get; } public IResourceStore ResourceContainer { get; }
public ParsingCachingProviders(BruteForceSearcher bruteForceSearcher, IResourceStore resourceContainer) public ParsingCachingProviders(IResourceStore resourceContainer)
{ {
BruteForceSearcher = bruteForceSearcher ?? throw new ArgumentNullException(nameof(bruteForceSearcher));
ResourceContainer = resourceContainer ?? throw new ArgumentNullException(nameof(resourceContainer)); ResourceContainer = resourceContainer ?? throw new ArgumentNullException(nameof(resourceContainer));
} }
} }

View File

@@ -8,44 +8,37 @@
using Util.JetBrains.Annotations; using Util.JetBrains.Annotations;
/// <summary> /// <summary>
/// Store the results of a brute force search for all objects in the document so we only do it once. /// Brute force search for all objects in the document.
/// </summary> /// </summary>
internal class BruteForceSearcher internal static class BruteForceSearcher
{ {
private const int MinimumSearchOffset = 6; private const int MinimumSearchOffset = 6;
private readonly IInputBytes bytes; /// <summary>
/// Find the offset of every object contained in the document by searching the entire document contents.
private Dictionary<IndirectReference, long> objectLocations; /// </summary>
/// <param name="bytes">The bytes of the document.</param>
public BruteForceSearcher([NotNull] IInputBytes bytes) /// <returns>The object keys and offsets for the objects in this document.</returns>
{
this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
}
[NotNull] [NotNull]
public IReadOnlyDictionary<IndirectReference, long> GetObjectLocations() public static IReadOnlyDictionary<IndirectReference, long> GetObjectLocations(IInputBytes bytes)
{ {
if (objectLocations != null) if (bytes == null)
{ {
return objectLocations; throw new ArgumentNullException(nameof(bytes));
} }
var loopProtection = 0; var loopProtection = 0;
var lastEndOfFile = GetLastEndOfFileMarker(); var lastEndOfFile = GetLastEndOfFileMarker(bytes);
var results = new Dictionary<IndirectReference, long>(); var results = new Dictionary<IndirectReference, long>();
var originPosition = bytes.CurrentOffset; var originPosition = bytes.CurrentOffset;
long currentOffset = MinimumSearchOffset; var currentOffset = (long)MinimumSearchOffset;
long lastObjectId = long.MinValue;
int lastGenerationId = int.MinValue; var currentlyInObject = false;
long lastObjOffset = long.MinValue;
bool inObject = false;
bool endobjFound = false;
do do
{ {
if (loopProtection > 1_000_000) if (loopProtection > 1_000_000)
@@ -55,7 +48,7 @@
loopProtection++; loopProtection++;
if (inObject) if (currentlyInObject)
{ {
if (bytes.CurrentByte == 'e') if (bytes.CurrentByte == 'e')
{ {
@@ -65,8 +58,7 @@
{ {
if (ReadHelper.IsString(bytes, "endobj")) if (ReadHelper.IsString(bytes, "endobj"))
{ {
inObject = false; currentlyInObject = false;
endobjFound = true;
loopProtection = 0; loopProtection = 0;
for (int i = 0; i < "endobj".Length; i++) for (int i = 0; i < "endobj".Length; i++)
@@ -139,8 +131,7 @@
results[new IndirectReference(obj, generation)] = bytes.CurrentOffset; results[new IndirectReference(obj, generation)] = bytes.CurrentOffset;
inObject = true; currentlyInObject = true;
endobjFound = false;
currentOffset++; currentOffset++;
@@ -148,22 +139,13 @@
loopProtection = 0; loopProtection = 0;
} while (currentOffset < lastEndOfFile && !bytes.IsAtEnd()); } while (currentOffset < lastEndOfFile && !bytes.IsAtEnd());
if ((lastEndOfFile < long.MaxValue || endobjFound) && lastObjOffset > 0)
{
// if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
// the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
results[new IndirectReference(lastObjectId, lastGenerationId)] = lastObjOffset;
}
// reestablish origin position // reestablish origin position
bytes.Seek(originPosition); bytes.Seek(originPosition);
objectLocations = results; return results;
return objectLocations;
} }
private long GetLastEndOfFileMarker() private static long GetLastEndOfFileMarker(IInputBytes bytes)
{ {
var originalOffset = bytes.CurrentOffset; var originalOffset = bytes.CurrentOffset;

View File

@@ -84,17 +84,15 @@
CrossReferenceTable crossReferenceTable = null; CrossReferenceTable crossReferenceTable = null;
var bruteForceSearcher = new BruteForceSearcher(inputBytes);
var xrefValidator = new XrefOffsetValidator(log); var xrefValidator = new XrefOffsetValidator(log);
var objectChecker = new XrefCosOffsetChecker(log, bruteForceSearcher);
// We're ok with this since our intent is to lazily load the cross reference table. // We're ok with this since our intent is to lazily load the cross reference table.
// ReSharper disable once AccessToModifiedClosure // ReSharper disable once AccessToModifiedClosure
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, bruteForceSearcher); var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance); var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance);
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider); var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, objectChecker, crossReferenceStreamParser); var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser);
var version = FileHeaderParser.Parse(scanner, isLenientParsing, log); var version = FileHeaderParser.Parse(scanner, isLenientParsing, log);
@@ -144,7 +142,7 @@
new PageContentParser(new ReflectionGraphicsStateOperationFactory()), new PageContentParser(new ReflectionGraphicsStateOperationFactory()),
log); log);
var caching = new ParsingCachingProviders(bruteForceSearcher, resourceContainer); var caching = new ParsingCachingProviders(resourceContainer);
var acroFormFactory = new AcroFormFactory(pdfScanner, filterProvider, crossReferenceTable); var acroFormFactory = new AcroFormFactory(pdfScanner, filterProvider, crossReferenceTable);
var bookmarksProvider = new BookmarksProvider(log, pdfScanner); var bookmarksProvider = new BookmarksProvider(log, pdfScanner);

View File

@@ -15,7 +15,10 @@
/// Since we want to scan objects while reading the cross reference table we lazily load it when it's ready. /// Since we want to scan objects while reading the cross reference table we lazily load it when it's ready.
/// </summary> /// </summary>
private readonly Func<CrossReferenceTable> crossReferenceTable; private readonly Func<CrossReferenceTable> crossReferenceTable;
private readonly BruteForceSearcher searcher;
private readonly IInputBytes bytes;
private IReadOnlyDictionary<IndirectReference, long> bruteForcedOffsets;
/// <summary> /// <summary>
/// Indicates whether we now have a cross reference table. /// Indicates whether we now have a cross reference table.
@@ -24,10 +27,10 @@
private readonly Dictionary<IndirectReference, long> offsets = new Dictionary<IndirectReference, long>(); private readonly Dictionary<IndirectReference, long> offsets = new Dictionary<IndirectReference, long>();
public ObjectLocationProvider(Func<CrossReferenceTable> crossReferenceTable, BruteForceSearcher searcher) public ObjectLocationProvider(Func<CrossReferenceTable> crossReferenceTable, IInputBytes bytes)
{ {
this.crossReferenceTable = crossReferenceTable; this.crossReferenceTable = crossReferenceTable;
this.searcher = searcher; this.bytes = bytes;
} }
public bool TryGetOffset(IndirectReference reference, out long offset) public bool TryGetOffset(IndirectReference reference, out long offset)
@@ -52,14 +55,12 @@
return true; return true;
} }
var locations = searcher.GetObjectLocations(); if (bruteForcedOffsets == null)
if (locations.TryGetValue(reference, out offset))
{ {
return true; bruteForcedOffsets = BruteForceSearcher.GetObjectLocations(bytes);
} }
return false; return bruteForcedOffsets.TryGetValue(reference, out offset);
} }
public void UpdateOffset(IndirectReference reference, long offset) public void UpdateOffset(IndirectReference reference, long offset)

View File

@@ -73,14 +73,13 @@
var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log); var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log);
var bruteForceSearcher = new BruteForceSearcher(inputBytes); var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log),
var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log), new XrefCosOffsetChecker(Log, bruteForceSearcher),
new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider)); new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider));
CrossReferenceTable crossReference = null; CrossReferenceTable crossReference = null;
// ReSharper disable once AccessToModifiedClosure // ReSharper disable once AccessToModifiedClosure
var locationProvider = new ObjectLocationProvider(() => crossReference, bruteForceSearcher); var locationProvider = new ObjectLocationProvider(() => crossReference, inputBytes);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, FilterProvider, NoOpEncryptionHandler.Instance); var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, FilterProvider, NoOpEncryptionHandler.Instance);