From 58972de7cb5fd6daeb676fbff96d735481392656 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Tue, 3 Mar 2020 15:21:11 +0000 Subject: [PATCH] begin to rework cross-reference parsing most of the cross-reference code is the earliest code in the project and hasn't been revisited since then. the issue #88 has been reopened due to a bug with brute-force searching so this tidies up the code in this area ahead of trying to fix the bug. --- .../Parser/Parts/BruteForceSearcherTests.cs | 51 ++-- .../CrossReferenceObjectOffsetValidator.cs | 161 ++++++++++++ .../FileStructure/CrossReferenceParser.cs | 9 +- .../FileStructure/XrefCosOffsetChecker.cs | 236 ------------------ .../Parser/ParsingCachingProviders.cs | 6 +- .../Parser/Parts/BruteForceSearcher.cs | 60 ++--- .../Parser/PdfDocumentFactory.cs | 8 +- .../Scanner/ObjectLocationProvider.cs | 17 +- src/UglyToad.PdfPig/Writer/PdfMerger.cs | 5 +- 9 files changed, 230 insertions(+), 323 deletions(-) create mode 100644 src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceObjectOffsetValidator.cs delete mode 100644 src/UglyToad.PdfPig/Parser/FileStructure/XrefCosOffsetChecker.cs diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs index db076b95..57a4f707 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs @@ -36,10 +36,18 @@ startxref 216 %%EOF"; + private static readonly long[] TestDataOffsets = + { + TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase), + TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase), + TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase), + TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase) + }; + [Fact] public void ReaderNull_Throws() { - Action action = () => new BruteForceSearcher(null); + Action action = () => BruteForceSearcher.GetObjectLocations(null); Assert.Throws(action); } @@ -49,34 +57,24 @@ startxref public void SearcherFindsCorrectObjects() { var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData)); - - var searcher = new BruteForceSearcher(input); - - var locations = searcher.GetObjectLocations(); + + var locations = BruteForceSearcher.GetObjectLocations(input); Assert.Equal(4, locations.Count); - Assert.Equal(locations.Values, new long[] - { - TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase), - TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase), - TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase), - TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase) - }); + Assert.Equal(TestDataOffsets, locations.Values); } [Fact] public void ReaderOnlyCallsOnce() { var reader = StringBytesTestConverter.Convert(TestData, false); - - var searcher = new BruteForceSearcher(reader.Bytes); - - var locations = searcher.GetObjectLocations(); + + var locations = BruteForceSearcher.GetObjectLocations(reader.Bytes); Assert.Equal(4, locations.Count); - var newLocations = searcher.GetObjectLocations(); + var newLocations = BruteForceSearcher.GetObjectLocations(reader.Bytes); Assert.Equal(4, locations.Count); @@ -92,9 +90,8 @@ startxref using (var fs = File.OpenRead(IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf"))) { var bytes = new StreamInputBytes(fs); - var searcher = new BruteForceSearcher(bytes); - var locations = searcher.GetObjectLocations(); + var locations = BruteForceSearcher.GetObjectLocations(bytes); Assert.Equal(13, locations.Count); @@ -118,9 +115,7 @@ startxref { var bytes = new ByteArrayInputBytes(File.ReadAllBytes(IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf"))); - var searcher = new BruteForceSearcher(bytes); - - var locations = searcher.GetObjectLocations(); + var locations = BruteForceSearcher.GetObjectLocations(bytes); Assert.Equal(13, locations.Count); @@ -142,6 +137,18 @@ startxref Assert.StartsWith("12 0 obj", s); } + [Fact] + public void BruteForceSearcherCorrectlyFindsAllObjectsWhenOffset() + { + var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData)); + + input.Seek(593); + + var locations = BruteForceSearcher.GetObjectLocations(input); + + Assert.Equal(TestDataOffsets, locations.Values); + } + private static string GetStringAt(IInputBytes bytes, long location) { bytes.Seek(location); diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceObjectOffsetValidator.cs b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceObjectOffsetValidator.cs new file mode 100644 index 00000000..a91b27b4 --- /dev/null +++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceObjectOffsetValidator.cs @@ -0,0 +1,161 @@ +namespace UglyToad.PdfPig.Parser.FileStructure +{ + using System; + using System.Collections.Generic; + using Core; + using CrossReference; + using Logging; + using Parts; + + internal static class CrossReferenceObjectOffsetValidator + { + private const long MinimumSearchOffset = 6; + + /// + /// Check that the offsets in the cross reference are correct. + /// + public static bool ValidateCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable crossReferenceTable, ILog log, + out IReadOnlyDictionary actualOffsets) + { + actualOffsets = crossReferenceTable.ObjectOffsets; + + if (ValidateXrefOffsets(bytes, crossReferenceTable.ObjectOffsets, log)) + { + return true; + } + + var bruteForceOffsets = BruteForceSearcher.GetObjectLocations(bytes); + if (bruteForceOffsets.Count > 0) + { + var objStreams = new List(); + + // find all object streams + foreach (var entry in crossReferenceTable.ObjectOffsets) + { + var offset = entry.Value; + if (offset < 0) + { + var objStream = new IndirectReference(-offset, 0); + if (!objStreams.Contains(objStream)) + { + objStreams.Add(new IndirectReference(-offset, 0)); + } + } + + // remove all found object streams + if (objStreams.Count > 0) + { + foreach (var key in objStreams) + { + if (bruteForceOffsets.ContainsKey(key)) + { + // remove all parsed objects which are part of an object stream + //ISet objects = xrefTrailerResolver + // .getContainedObjectNumbers((int)(key.Number)); + //foreach (long objNr in objects) + //{ + // CosObjectKey streamObjectKey = new CosObjectKey(objNr, 0); + + // if (bfCOSObjectKeyOffsets.TryGetValue(streamObjectKey, out long streamObjectOffset) && streamObjectOffset > 0) + // { + // bfCOSObjectKeyOffsets.Remove(streamObjectKey); + // } + //} + } + else + { + // remove all objects which are part of an object stream which wasn't found + //ISet objects = xrefTrailerResolver + // .getContainedObjectNumbers((int)(key.Number)); + //foreach (long objNr in objects) + //{ + // xrefOffset.Remove(new CosObjectKey(objNr, 0)); + //} + } + } + } + + foreach (var item in bruteForceOffsets) + { + //xrefOffset[item.Key] = item.Value; + } + + } + } + + return false; + } + + private static bool ValidateXrefOffsets(IInputBytes bytes, IReadOnlyDictionary objectOffsets, ILog log) + { + if (objectOffsets == null) + { + return true; + } + + foreach (var objectEntry in objectOffsets) + { + var objectKey = objectEntry.Key; + var objectOffset = objectEntry.Value; + + if (objectOffset < 0) + { + continue; + } + + if (!CheckObjectKeys(bytes, objectKey, objectOffset)) + { + log.Error($"At least one cross-reference offset was incorrect. {objectKey} could not be found at {objectOffset}. " + + "Using brute-force search to repair object offsets."); + + return false; + } + } + + return true; + } + + private static bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset) + { + // there can't be any object at the very beginning of a pdf + if (offset < MinimumSearchOffset) + { + return false; + } + + var objectNr = objectKey.ObjectNumber; + long objectGen = objectKey.Generation; + var originOffset = bytes.CurrentOffset; + + var objectString = ObjectHelper.CreateObjectString(objectNr, objectGen); + + try + { + bytes.Seek(offset); + + if (ReadHelper.IsWhitespace(bytes.CurrentByte)) + { + bytes.MoveNext(); + } + + if (ReadHelper.IsString(bytes, objectString)) + { + // everything is ok, return origin object key + bytes.Seek(originOffset); + return true; + } + } + catch (Exception) + { + // Swallow the exception, obviously there isn't any valid object number + } + finally + { + bytes.Seek(originOffset); + } + + // no valid object number found + return false; + } + } +} diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs index 00c36781..086da01f 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs @@ -4,7 +4,6 @@ using System.Collections.Generic; using Core; using CrossReference; - using Exceptions; using Logging; using Parts.CrossReference; using Tokenization.Scanner; @@ -15,16 +14,13 @@ private readonly ILog log; private readonly XrefOffsetValidator offsetValidator; private readonly CrossReferenceStreamParser crossReferenceStreamParser; - private readonly XrefCosOffsetChecker xrefCosChecker; public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator, - XrefCosOffsetChecker xrefCosChecker, CrossReferenceStreamParser crossReferenceStreamParser) { this.log = log; this.offsetValidator = offsetValidator; this.crossReferenceStreamParser = crossReferenceStreamParser; - this.xrefCosChecker = xrefCosChecker; } public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation, @@ -214,7 +210,10 @@ var resolved = table.Build(crossReferenceLocation, log); // check the offsets of all referenced objects - xrefCosChecker.CheckCrossReferenceOffsets(bytes, resolved, isLenientParsing); + if (!CrossReferenceObjectOffsetValidator.ValidateCrossReferenceOffsets(bytes, resolved, log, out var actualOffsets)) + { + resolved = new CrossReferenceTable(resolved.Type, actualOffsets, resolved.Trailer, resolved.CrossReferenceOffsets); + } return resolved; } diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefCosOffsetChecker.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefCosOffsetChecker.cs deleted file mode 100644 index 50d0897a..00000000 --- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefCosOffsetChecker.cs +++ /dev/null @@ -1,236 +0,0 @@ -namespace UglyToad.PdfPig.Parser.FileStructure -{ - using System; - using System.Collections.Generic; - using System.Linq; - using Core; - using CrossReference; - using Logging; - using Parts; - - internal class XrefCosOffsetChecker - { - private static readonly long MINIMUM_SEARCH_OFFSET = 6; - - private readonly ILog log; - private readonly BruteForceSearcher bruteForceSearcher; - - private IReadOnlyDictionary objectKeyOffsets; - - public XrefCosOffsetChecker(ILog log, BruteForceSearcher bruteForceSearcher) - { - this.log = log; - this.bruteForceSearcher = bruteForceSearcher; - } - - private bool ValidateXrefOffsets(IInputBytes bytes, Dictionary xrefOffset) - { - if (xrefOffset == null) - { - return true; - } - - foreach (var objectEntry in xrefOffset) - { - IndirectReference objectKey = objectEntry.Key; - long objectOffset = objectEntry.Value; - - // a negative offset number represents a object number itself - // see type 2 entry in xref stream - if (objectOffset >= 0 && !CheckObjectKeys(bytes, objectKey, objectOffset)) - { - log.Debug($"Stop checking xref offsets as at least one ({objectKey}) couldn't be dereferenced"); - - return false; - } - } - return true; - } - - private bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset) - { - // there can't be any object at the very beginning of a pdf - if (offset < MINIMUM_SEARCH_OFFSET) - { - return false; - } - - long objectNr = objectKey.ObjectNumber; - long objectGen = objectKey.Generation; - long originOffset = bytes.CurrentOffset; - - string objectString = ObjectHelper.CreateObjectString(objectNr, objectGen); - - try - { - bytes.Seek(offset); - - if (ReadHelper.IsWhitespace(bytes.CurrentByte)) - { - bytes.MoveNext(); - } - - if (ReadHelper.IsString(bytes, objectString)) - { - // everything is ok, return origin object key - bytes.Seek(originOffset); - return true; - } - } - catch (Exception) - { - // Swallow the exception, obviously there isn't any valid object number - } - finally - { - bytes.Seek(originOffset); - } - - // no valid object number found - return false; - } - - - private IReadOnlyDictionary getBFCosObjectOffsets() - { - if (objectKeyOffsets == null) - { - var offsets = bruteForceSearcher.GetObjectLocations(); - - objectKeyOffsets = offsets; - } - - return objectKeyOffsets; - } - - /// - /// Check that the offsets in the cross reference are correct. - /// - public void CheckCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable xrefTrailerResolver, bool isLenientParsing) - { - // repair mode isn't available in non-lenient mode - if (!isLenientParsing) - { - return; - } - - Dictionary xrefOffset = xrefTrailerResolver.ObjectOffsets.ToDictionary(x => x.Key, x => x.Value); - if (ValidateXrefOffsets(bytes, xrefOffset)) - { - return; - } - - IReadOnlyDictionary bfCOSObjectKeyOffsets = getBFCosObjectOffsets(); - if (bfCOSObjectKeyOffsets.Count > 0) - { - List objStreams = new List(); - // find all object streams - foreach (var entry in xrefOffset) - { - long offset = entry.Value; - if (offset < 0) - { - IndirectReference objStream = new IndirectReference(-offset, 0); - if (!objStreams.Contains(objStream)) - { - objStreams.Add(new IndirectReference(-offset, 0)); - } - } - } - // remove all found object streams - if (objStreams.Count > 0) - { - foreach (IndirectReference key in objStreams) - { - if (bfCOSObjectKeyOffsets.ContainsKey(key)) - { - // remove all parsed objects which are part of an object stream - //ISet objects = xrefTrailerResolver - // .getContainedObjectNumbers((int)(key.Number)); - //foreach (long objNr in objects) - //{ - // CosObjectKey streamObjectKey = new CosObjectKey(objNr, 0); - - // if (bfCOSObjectKeyOffsets.TryGetValue(streamObjectKey, out long streamObjectOffset) && streamObjectOffset > 0) - // { - // bfCOSObjectKeyOffsets.Remove(streamObjectKey); - // } - //} - } - else - { - // remove all objects which are part of an object stream which wasn't found - //ISet objects = xrefTrailerResolver - // .getContainedObjectNumbers((int)(key.Number)); - //foreach (long objNr in objects) - //{ - // xrefOffset.Remove(new CosObjectKey(objNr, 0)); - //} - } - } - } - - foreach (var item in bfCOSObjectKeyOffsets) - { - xrefOffset[item.Key] = item.Value; - } - - } - } - - private long? lastEndOfFileMarker; - - private void BruteForceSearchForEndOfFileMarker(IInputBytes source) - { - if (lastEndOfFileMarker != null) - { - return; - } - - long startOffset = source.CurrentOffset; - - source.Seek(MINIMUM_SEARCH_OFFSET); - - while (!source.IsAtEnd()) - { - // search for EOF marker - if (ReadHelper.IsString(source, "%%EOF")) - { - long tempMarker = source.CurrentOffset; - - if (tempMarker >= source.Length) - { - lastEndOfFileMarker = tempMarker; - break; - } - - try - { - source.Seek(tempMarker + 5); - // check if the following data is some valid pdf content - // which most likely indicates that the pdf is linearized, - // updated or just cut off somewhere in the middle - ReadHelper.SkipSpaces(source); - ObjectHelper.ReadObjectNumber(source); - ObjectHelper.ReadGenerationNumber(source); - } - catch (Exception) - { - // save the EOF marker as the following data is most likely some garbage - lastEndOfFileMarker = tempMarker; - } - } - - source.MoveNext(); - } - - source.Seek(startOffset); - - // no EOF marker found - if (lastEndOfFileMarker == null) - { - lastEndOfFileMarker = long.MaxValue; - } - } - } -} diff --git a/src/UglyToad.PdfPig/Parser/ParsingCachingProviders.cs b/src/UglyToad.PdfPig/Parser/ParsingCachingProviders.cs index 6963d3c0..420cb2a5 100644 --- a/src/UglyToad.PdfPig/Parser/ParsingCachingProviders.cs +++ b/src/UglyToad.PdfPig/Parser/ParsingCachingProviders.cs @@ -2,20 +2,16 @@ { using System; using Content; - using Parts; /// /// For objects which provide document scoped caching. /// internal class ParsingCachingProviders { - public BruteForceSearcher BruteForceSearcher { get; } - public IResourceStore ResourceContainer { get; } - public ParsingCachingProviders(BruteForceSearcher bruteForceSearcher, IResourceStore resourceContainer) + public ParsingCachingProviders(IResourceStore resourceContainer) { - BruteForceSearcher = bruteForceSearcher ?? throw new ArgumentNullException(nameof(bruteForceSearcher)); ResourceContainer = resourceContainer ?? throw new ArgumentNullException(nameof(resourceContainer)); } } diff --git a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs index 692f3c14..6927962e 100644 --- a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs +++ b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs @@ -8,44 +8,37 @@ using Util.JetBrains.Annotations; /// - /// Store the results of a brute force search for all objects in the document so we only do it once. + /// Brute force search for all objects in the document. /// - internal class BruteForceSearcher + internal static class BruteForceSearcher { private const int MinimumSearchOffset = 6; - private readonly IInputBytes bytes; - - private Dictionary objectLocations; - - public BruteForceSearcher([NotNull] IInputBytes bytes) - { - this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes)); - } - + /// + /// Find the offset of every object contained in the document by searching the entire document contents. + /// + /// The bytes of the document. + /// The object keys and offsets for the objects in this document. [NotNull] - public IReadOnlyDictionary GetObjectLocations() + public static IReadOnlyDictionary GetObjectLocations(IInputBytes bytes) { - if (objectLocations != null) + if (bytes == null) { - return objectLocations; + throw new ArgumentNullException(nameof(bytes)); } var loopProtection = 0; - var lastEndOfFile = GetLastEndOfFileMarker(); + var lastEndOfFile = GetLastEndOfFileMarker(bytes); var results = new Dictionary(); var originPosition = bytes.CurrentOffset; - long currentOffset = MinimumSearchOffset; - long lastObjectId = long.MinValue; - int lastGenerationId = int.MinValue; - long lastObjOffset = long.MinValue; + var currentOffset = (long)MinimumSearchOffset; + + var currentlyInObject = false; - bool inObject = false; - bool endobjFound = false; do { if (loopProtection > 1_000_000) @@ -55,7 +48,7 @@ loopProtection++; - if (inObject) + if (currentlyInObject) { if (bytes.CurrentByte == 'e') { @@ -65,8 +58,7 @@ { if (ReadHelper.IsString(bytes, "endobj")) { - inObject = false; - endobjFound = true; + currentlyInObject = false; loopProtection = 0; for (int i = 0; i < "endobj".Length; i++) @@ -139,31 +131,21 @@ results[new IndirectReference(obj, generation)] = bytes.CurrentOffset; - inObject = true; - endobjFound = false; + currentlyInObject = true; currentOffset++; bytes.Seek(currentOffset); loopProtection = 0; } while (currentOffset < lastEndOfFile && !bytes.IsAtEnd()); - - if ((lastEndOfFile < long.MaxValue || endobjFound) && lastObjOffset > 0) - { - // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker - // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id - results[new IndirectReference(lastObjectId, lastGenerationId)] = lastObjOffset; - } - + // reestablish origin position bytes.Seek(originPosition); - - objectLocations = results; - - return objectLocations; + + return results; } - private long GetLastEndOfFileMarker() + private static long GetLastEndOfFileMarker(IInputBytes bytes) { var originalOffset = bytes.CurrentOffset; diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs index da6e9701..d4c21e48 100644 --- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs @@ -84,17 +84,15 @@ CrossReferenceTable crossReferenceTable = null; - var bruteForceSearcher = new BruteForceSearcher(inputBytes); var xrefValidator = new XrefOffsetValidator(log); - var objectChecker = new XrefCosOffsetChecker(log, bruteForceSearcher); // We're ok with this since our intent is to lazily load the cross reference table. // ReSharper disable once AccessToModifiedClosure - var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, bruteForceSearcher); + var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes); var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance); var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider); - var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, objectChecker, crossReferenceStreamParser); + var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser); var version = FileHeaderParser.Parse(scanner, isLenientParsing, log); @@ -144,7 +142,7 @@ new PageContentParser(new ReflectionGraphicsStateOperationFactory()), log); - var caching = new ParsingCachingProviders(bruteForceSearcher, resourceContainer); + var caching = new ParsingCachingProviders(resourceContainer); var acroFormFactory = new AcroFormFactory(pdfScanner, filterProvider, crossReferenceTable); var bookmarksProvider = new BookmarksProvider(log, pdfScanner); diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs index 76a49cf8..e4c85550 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs @@ -15,7 +15,10 @@ /// Since we want to scan objects while reading the cross reference table we lazily load it when it's ready. /// private readonly Func crossReferenceTable; - private readonly BruteForceSearcher searcher; + + private readonly IInputBytes bytes; + + private IReadOnlyDictionary bruteForcedOffsets; /// /// Indicates whether we now have a cross reference table. @@ -24,10 +27,10 @@ private readonly Dictionary offsets = new Dictionary(); - public ObjectLocationProvider(Func crossReferenceTable, BruteForceSearcher searcher) + public ObjectLocationProvider(Func crossReferenceTable, IInputBytes bytes) { this.crossReferenceTable = crossReferenceTable; - this.searcher = searcher; + this.bytes = bytes; } public bool TryGetOffset(IndirectReference reference, out long offset) @@ -52,14 +55,12 @@ return true; } - var locations = searcher.GetObjectLocations(); - - if (locations.TryGetValue(reference, out offset)) + if (bruteForcedOffsets == null) { - return true; + bruteForcedOffsets = BruteForceSearcher.GetObjectLocations(bytes); } - return false; + return bruteForcedOffsets.TryGetValue(reference, out offset); } public void UpdateOffset(IndirectReference reference, long offset) diff --git a/src/UglyToad.PdfPig/Writer/PdfMerger.cs b/src/UglyToad.PdfPig/Writer/PdfMerger.cs index 8d6e04cc..3588d389 100644 --- a/src/UglyToad.PdfPig/Writer/PdfMerger.cs +++ b/src/UglyToad.PdfPig/Writer/PdfMerger.cs @@ -73,14 +73,13 @@ var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log); - var bruteForceSearcher = new BruteForceSearcher(inputBytes); - var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log), new XrefCosOffsetChecker(Log, bruteForceSearcher), + var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log), new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider)); CrossReferenceTable crossReference = null; // ReSharper disable once AccessToModifiedClosure - var locationProvider = new ObjectLocationProvider(() => crossReference, bruteForceSearcher); + var locationProvider = new ObjectLocationProvider(() => crossReference, inputBytes); var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, FilterProvider, NoOpEncryptionHandler.Instance);