From 58972de7cb5fd6daeb676fbff96d735481392656 Mon Sep 17 00:00:00 2001
From: Eliot Jones <elioty@hotmail.co.uk>
Date: Tue, 3 Mar 2020 15:21:11 +0000
Subject: [PATCH] begin to rework cross-reference parsing

Most of the cross-reference code is the earliest code in the project and hasn't been revisited since it was written. Issue #88 has been reopened due to a bug with brute-force searching, so this tidies up the code in this area ahead of trying to fix the bug.
---
 .../Parser/Parts/BruteForceSearcherTests.cs   |  51 ++--
 .../CrossReferenceObjectOffsetValidator.cs    | 161 ++++++++++++
 .../FileStructure/CrossReferenceParser.cs     |   9 +-
 .../FileStructure/XrefCosOffsetChecker.cs     | 236 ------------------
 .../Parser/ParsingCachingProviders.cs         |   6 +-
 .../Parser/Parts/BruteForceSearcher.cs        |  60 ++---
 .../Parser/PdfDocumentFactory.cs              |   8 +-
 .../Scanner/ObjectLocationProvider.cs         |  17 +-
 src/UglyToad.PdfPig/Writer/PdfMerger.cs       |   5 +-
 9 files changed, 230 insertions(+), 323 deletions(-)
 create mode 100644 src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceObjectOffsetValidator.cs
 delete mode 100644 src/UglyToad.PdfPig/Parser/FileStructure/XrefCosOffsetChecker.cs

diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs
index db076b95..57a4f707 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs
@@ -36,10 +36,18 @@ startxref
 216
 %%EOF";
 
+        private static readonly long[] TestDataOffsets = 
+        {
+            TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase),
+            TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
+            TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase),
+            TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase)
+        };
+
         [Fact]
         public void ReaderNull_Throws()
         {
-            Action action = () => new BruteForceSearcher(null);
+            Action action = () => BruteForceSearcher.GetObjectLocations(null);
 
             Assert.Throws<ArgumentNullException>(action);
         }
@@ -49,34 +57,24 @@ startxref
         public void SearcherFindsCorrectObjects()
         {
             var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
-
-            var searcher = new BruteForceSearcher(input);
-
-            var locations = searcher.GetObjectLocations();
+            
+            var locations = BruteForceSearcher.GetObjectLocations(input);
 
             Assert.Equal(4, locations.Count);
 
-            Assert.Equal(locations.Values, new long[]
-            {
-                TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase),
-                TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
-                TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase),
-                TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase)
-            });
+            Assert.Equal(TestDataOffsets, locations.Values);
         }
 
         [Fact]
         public void ReaderOnlyCallsOnce()
         {
             var reader = StringBytesTestConverter.Convert(TestData, false);
-
-            var searcher = new BruteForceSearcher(reader.Bytes);
-
-            var locations = searcher.GetObjectLocations();
+            
+            var locations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
 
             Assert.Equal(4, locations.Count);
             
-            var newLocations = searcher.GetObjectLocations();
+            var newLocations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
 
             Assert.Equal(4, locations.Count);
 
@@ -92,9 +90,8 @@ startxref
             using (var fs = File.OpenRead(IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf")))
             {
                 var bytes = new StreamInputBytes(fs);
-                var searcher = new BruteForceSearcher(bytes);
 
-                var locations = searcher.GetObjectLocations();
+                var locations = BruteForceSearcher.GetObjectLocations(bytes);
 
                 Assert.Equal(13, locations.Count);
 
@@ -118,9 +115,7 @@ startxref
         {
             var bytes = new ByteArrayInputBytes(File.ReadAllBytes(IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf")));
 
-            var searcher = new BruteForceSearcher(bytes);
-
-            var locations = searcher.GetObjectLocations();
+            var locations = BruteForceSearcher.GetObjectLocations(bytes);
 
             Assert.Equal(13, locations.Count);
 
@@ -142,6 +137,18 @@ startxref
             Assert.StartsWith("12 0 obj", s);
         }
 
+        [Fact]
+        public void BruteForceSearcherCorrectlyFindsAllObjectsWhenOffset()
+        {
+            var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
+            // Move away from the start first: the results must not depend on the current offset.
+            input.Seek(593);
+
+            var locations = BruteForceSearcher.GetObjectLocations(input);
+
+            Assert.Equal(TestDataOffsets, locations.Values);
+        }
+
         private static string GetStringAt(IInputBytes bytes, long location)
         {
             bytes.Seek(location);
diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceObjectOffsetValidator.cs b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceObjectOffsetValidator.cs
new file mode 100644
index 00000000..a91b27b4
--- /dev/null
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceObjectOffsetValidator.cs
@@ -0,0 +1,161 @@
+﻿namespace UglyToad.PdfPig.Parser.FileStructure
+{
+    using System;
+    using System.Collections.Generic;
+    using Core;
+    using CrossReference;
+    using Logging;
+    using Parts;
+
+    internal static class CrossReferenceObjectOffsetValidator
+    {
+        private const long MinimumSearchOffset = 6;
+        
+        /// <summary>
+        /// Check that the offsets in the cross reference are correct and, if any is wrong, repair
+        /// them using the results of a brute-force search of the whole document.
+        /// </summary>
+        /// <remarks>
+        /// The return value reports whether the table was valid, not whether a repair succeeded: the
+        /// result is <see langword="false"/> for an invalid table even when no objects could be located.
+        /// </remarks>
+        /// <param name="bytes">The bytes of the document.</param>
+        /// <param name="crossReferenceTable">The cross reference table whose object offsets are checked.</param>
+        /// <param name="log">Receives an error message for the first incorrect offset found.</param>
+        /// <param name="actualOffsets">
+        /// The offsets of <paramref name="crossReferenceTable"/> when they are all valid or when the
+        /// brute-force search finds no objects; otherwise a copy of those offsets in which every object
+        /// located by the brute-force search has its offset replaced by the location that was found.
+        /// </param>
+        /// <returns>
+        /// <see langword="true"/> if every offset in the table was valid, <see langword="false"/> if at
+        /// least one was not, in which case <paramref name="actualOffsets"/> should be used instead.
+        /// </returns>
+        public static bool ValidateCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable crossReferenceTable, ILog log,
+            out IReadOnlyDictionary<IndirectReference, long> actualOffsets)
+        {
+            // Assume the table is correct until one of its offsets is proven wrong.
+            actualOffsets = crossReferenceTable.ObjectOffsets;
+
+            if (ValidateXrefOffsets(bytes, crossReferenceTable.ObjectOffsets, log))
+            {
+                return true;
+            }
+
+            // At least one offset does not point at its object, so locate the objects by scanning
+            // the document contents instead of trusting the table.
+            var bruteForceOffsets = BruteForceSearcher.GetObjectLocations(bytes);
+            if (bruteForceOffsets.Count == 0)
+            {
+                // The search found nothing to repair with so the offsets of the table are kept.
+                return false;
+            }
+
+            actualOffsets = MergeOffsets(crossReferenceTable.ObjectOffsets, bruteForceOffsets);
+
+            return false;
+        }
+
+        /// <summary>
+        /// Combine the offsets from the cross reference table with the locations found by a brute-force search.
+        /// </summary>
+        /// <param name="tableOffsets">The offsets read from the cross reference table.</param>
+        /// <param name="bruteForceOffsets">The object locations found by searching the document.</param>
+        /// <returns>A new dictionary in which the brute-force locations take precedence.</returns>
+        private static IReadOnlyDictionary<IndirectReference, long> MergeOffsets(
+            IReadOnlyDictionary<IndirectReference, long> tableOffsets,
+            IReadOnlyDictionary<IndirectReference, long> bruteForceOffsets)
+        {
+            var merged = new Dictionary<IndirectReference, long>();
+
+            // Start from the offsets of the table. Negative offsets are copied unchanged: they hold the
+            // negated number of the object stream containing the object, not a location in the file.
+            foreach (var entry in tableOffsets)
+            {
+                merged[entry.Key] = entry.Value;
+            }
+
+            // Overwrite with the locations the objects were actually found at.
+            // TODO: objects contained in an object stream are not yet reconciled with the search results,
+            // a key found by the search replaces the object stream entry recorded for the same key.
+            foreach (var item in bruteForceOffsets)
+            {
+                merged[item.Key] = item.Value;
+            }
+
+            return merged;
+        }
+        
+        /// <summary>
+        /// Check that every non-negative offset points at the object it is recorded for, stopping at the first failure.
+        /// </summary>
+        private static bool ValidateXrefOffsets(IInputBytes bytes, IReadOnlyDictionary<IndirectReference, long> objectOffsets, ILog log)
+        {
+            if (objectOffsets == null)
+            {
+                return true;
+            }
+
+            foreach (var objectEntry in objectOffsets)
+            {
+                var objectKey = objectEntry.Key;
+                var objectOffset = objectEntry.Value;
+
+                // Negative offsets are skipped and a correct offset needs no further action.
+                if (objectOffset < 0 || CheckObjectKeys(bytes, objectKey, objectOffset))
+                {
+                    continue;
+                }
+
+                log.Error($"At least one cross-reference offset was incorrect. {objectKey} could not be found at {objectOffset}. " +
+                          "Using brute-force search to repair object offsets.");
+                return false;
+            }
+
+            return true;
+        }
+
+        /// <summary>
+        /// Test whether the object header for the given key is present at the offset, leaving the
+        /// current offset of the input bytes where it was when the method was called.
+        /// </summary>
+        private static bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset)
+        {
+            // there can't be any object at the very beginning of a pdf
+            if (offset < MinimumSearchOffset)
+            {
+                return false;
+            }
+
+            var objectNr = objectKey.ObjectNumber;
+            long objectGen = objectKey.Generation;
+            var objectString = ObjectHelper.CreateObjectString(objectNr, objectGen);
+
+            var originOffset = bytes.CurrentOffset;
+            var found = false;
+
+            try
+            {
+                bytes.Seek(offset);
+
+                // Tolerate a single whitespace byte in front of the object header.
+                if (ReadHelper.IsWhitespace(bytes.CurrentByte))
+                {
+                    bytes.MoveNext();
+                }
+
+                found = ReadHelper.IsString(bytes, objectString);
+            }
+            catch (Exception)
+            {
+                // Swallow the exception, obviously there isn't any valid object number
+            }
+            finally
+            {
+                bytes.Seek(originOffset);
+            }
+
+            return found;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs
index 00c36781..086da01f 100644
--- a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs
@@ -4,7 +4,6 @@
     using System.Collections.Generic;
     using Core;
     using CrossReference;
-    using Exceptions;
     using Logging;
     using Parts.CrossReference;
     using Tokenization.Scanner;
@@ -15,16 +14,13 @@
         private readonly ILog log;
         private readonly XrefOffsetValidator offsetValidator;
         private readonly CrossReferenceStreamParser crossReferenceStreamParser;
-        private readonly XrefCosOffsetChecker xrefCosChecker;
 
         public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator,
-            XrefCosOffsetChecker xrefCosChecker,
             CrossReferenceStreamParser crossReferenceStreamParser)
         {
             this.log = log;
             this.offsetValidator = offsetValidator;
             this.crossReferenceStreamParser = crossReferenceStreamParser;
-            this.xrefCosChecker = xrefCosChecker;
         }
         
         public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation,
@@ -214,7 +210,10 @@
             var resolved = table.Build(crossReferenceLocation, log);
             
             // check the offsets of all referenced objects
-            xrefCosChecker.CheckCrossReferenceOffsets(bytes, resolved, isLenientParsing);
+            if (!CrossReferenceObjectOffsetValidator.ValidateCrossReferenceOffsets(bytes, resolved, log, out var actualOffsets))
+            {
+                resolved = new CrossReferenceTable(resolved.Type, actualOffsets, resolved.Trailer, resolved.CrossReferenceOffsets);
+            }
             
             return resolved;
         }
diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/XrefCosOffsetChecker.cs b/src/UglyToad.PdfPig/Parser/FileStructure/XrefCosOffsetChecker.cs
deleted file mode 100644
index 50d0897a..00000000
--- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefCosOffsetChecker.cs
+++ /dev/null
@@ -1,236 +0,0 @@
-﻿namespace UglyToad.PdfPig.Parser.FileStructure
-{
-    using System;
-    using System.Collections.Generic;
-    using System.Linq;
-    using Core;
-    using CrossReference;
-    using Logging;
-    using Parts;
-
-    internal class XrefCosOffsetChecker
-    {
-        private static readonly long MINIMUM_SEARCH_OFFSET = 6;
-
-        private readonly ILog log;
-        private readonly BruteForceSearcher bruteForceSearcher;
-
-        private IReadOnlyDictionary<IndirectReference, long> objectKeyOffsets;
-
-        public XrefCosOffsetChecker(ILog log, BruteForceSearcher bruteForceSearcher)
-        {
-            this.log = log;
-            this.bruteForceSearcher = bruteForceSearcher;
-        }
-
-        private bool ValidateXrefOffsets(IInputBytes bytes, Dictionary<IndirectReference, long> xrefOffset)
-        {
-            if (xrefOffset == null)
-            {
-                return true;
-            }
-
-            foreach (var objectEntry in xrefOffset)
-            {
-                IndirectReference objectKey = objectEntry.Key;
-                long objectOffset = objectEntry.Value;
-
-                // a negative offset number represents a object number itself
-                // see type 2 entry in xref stream
-                if (objectOffset >= 0 && !CheckObjectKeys(bytes, objectKey, objectOffset))
-                {
-                    log.Debug($"Stop checking xref offsets as at least one ({objectKey}) couldn't be dereferenced");
-
-                    return false;
-                }
-            }
-            return true;
-        }
-
-        private bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset)
-        {
-            // there can't be any object at the very beginning of a pdf
-            if (offset < MINIMUM_SEARCH_OFFSET)
-            {
-                return false;
-            }
-
-            long objectNr = objectKey.ObjectNumber;
-            long objectGen = objectKey.Generation;
-            long originOffset = bytes.CurrentOffset;
-
-            string objectString = ObjectHelper.CreateObjectString(objectNr, objectGen);
-
-            try
-            {
-                bytes.Seek(offset);
-
-                if (ReadHelper.IsWhitespace(bytes.CurrentByte))
-                {
-                    bytes.MoveNext();
-                }
-                
-                if (ReadHelper.IsString(bytes, objectString))
-                {
-                    // everything is ok, return origin object key
-                    bytes.Seek(originOffset);
-                    return true;
-                }
-            }
-            catch (Exception)
-            {
-                // Swallow the exception, obviously there isn't any valid object number
-            }
-            finally
-            {
-                bytes.Seek(originOffset);
-            }
-
-            // no valid object number found
-            return false;
-        }
-
-
-        private IReadOnlyDictionary<IndirectReference, long> getBFCosObjectOffsets()
-        {
-            if (objectKeyOffsets == null)
-            {
-                var offsets = bruteForceSearcher.GetObjectLocations();
-
-                objectKeyOffsets = offsets;
-            }
-
-            return objectKeyOffsets;
-        }
-        
-        /// <summary>
-        /// Check that the offsets in the cross reference are correct.
-        /// </summary>
-        public void CheckCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable xrefTrailerResolver, bool isLenientParsing)
-        {
-            // repair mode isn't available in non-lenient mode
-            if (!isLenientParsing)
-            {
-                return;
-            }
-
-            Dictionary<IndirectReference, long> xrefOffset = xrefTrailerResolver.ObjectOffsets.ToDictionary(x => x.Key, x => x.Value);
-            if (ValidateXrefOffsets(bytes, xrefOffset))
-            {
-                return;
-            }
-
-            IReadOnlyDictionary<IndirectReference, long> bfCOSObjectKeyOffsets = getBFCosObjectOffsets();
-            if (bfCOSObjectKeyOffsets.Count > 0)
-            {
-                List<IndirectReference> objStreams = new List<IndirectReference>();
-                // find all object streams
-                foreach (var entry in xrefOffset)
-                {
-                    long offset = entry.Value;
-                    if (offset < 0)
-                    {
-                        IndirectReference objStream = new IndirectReference(-offset, 0);
-                        if (!objStreams.Contains(objStream))
-                        {
-                            objStreams.Add(new IndirectReference(-offset, 0));
-                        }
-                    }
-                }
-                // remove all found object streams
-                if (objStreams.Count > 0)
-                {
-                    foreach (IndirectReference key in objStreams)
-                    {
-                        if (bfCOSObjectKeyOffsets.ContainsKey(key))
-                        {
-                            // remove all parsed objects which are part of an object stream
-                            //ISet<long> objects = xrefTrailerResolver
-                            //    .getContainedObjectNumbers((int)(key.Number));
-                            //foreach (long objNr in objects)
-                            //{
-                            //    CosObjectKey streamObjectKey = new CosObjectKey(objNr, 0);
-
-                            //    if (bfCOSObjectKeyOffsets.TryGetValue(streamObjectKey, out long streamObjectOffset) && streamObjectOffset > 0)
-                            //    {
-                            //        bfCOSObjectKeyOffsets.Remove(streamObjectKey);
-                            //    }
-                            //}
-                        }
-                        else
-                        {
-                            // remove all objects which are part of an object stream which wasn't found
-                            //ISet<long> objects = xrefTrailerResolver
-                            //    .getContainedObjectNumbers((int)(key.Number));
-                            //foreach (long objNr in objects)
-                            //{
-                            //    xrefOffset.Remove(new CosObjectKey(objNr, 0));
-                            //}
-                        }
-                    }
-                }
-
-                foreach (var item in bfCOSObjectKeyOffsets)
-                {
-                    xrefOffset[item.Key] = item.Value;
-                }
-
-            }
-        }
-
-        private long? lastEndOfFileMarker;
-
-        private void BruteForceSearchForEndOfFileMarker(IInputBytes source)
-        {
-            if (lastEndOfFileMarker != null)
-            {
-                return;
-            }
-
-            long startOffset = source.CurrentOffset;
-
-            source.Seek(MINIMUM_SEARCH_OFFSET);
-
-            while (!source.IsAtEnd())
-            {
-                // search for EOF marker
-                if (ReadHelper.IsString(source, "%%EOF"))
-                {
-                    long tempMarker = source.CurrentOffset;
-
-                    if (tempMarker >= source.Length)
-                    {
-                        lastEndOfFileMarker = tempMarker;
-                        break;
-                    }
-
-                    try
-                    {
-                        source.Seek(tempMarker + 5);
-                        // check if the following data is some valid pdf content
-                        // which most likely indicates that the pdf is linearized,
-                        // updated or just cut off somewhere in the middle
-                        ReadHelper.SkipSpaces(source);
-                        ObjectHelper.ReadObjectNumber(source);
-                        ObjectHelper.ReadGenerationNumber(source);
-                    }
-                    catch (Exception)
-                    {
-                        // save the EOF marker as the following data is most likely some garbage
-                        lastEndOfFileMarker = tempMarker;
-                    }
-                }
-
-                source.MoveNext();
-            }
-
-            source.Seek(startOffset);
-
-            // no EOF marker found
-            if (lastEndOfFileMarker == null)
-            {
-                lastEndOfFileMarker = long.MaxValue;
-            }
-        }
-    }
-}
diff --git a/src/UglyToad.PdfPig/Parser/ParsingCachingProviders.cs b/src/UglyToad.PdfPig/Parser/ParsingCachingProviders.cs
index 6963d3c0..420cb2a5 100644
--- a/src/UglyToad.PdfPig/Parser/ParsingCachingProviders.cs
+++ b/src/UglyToad.PdfPig/Parser/ParsingCachingProviders.cs
@@ -2,20 +2,16 @@
 {
     using System;
     using Content;
-    using Parts;
 
     /// <summary>
     /// For objects which provide document scoped caching.
     /// </summary>
     internal class ParsingCachingProviders
     {
-        public BruteForceSearcher BruteForceSearcher { get; }
-
         public IResourceStore ResourceContainer { get; }
 
-        public ParsingCachingProviders(BruteForceSearcher bruteForceSearcher, IResourceStore resourceContainer)
+        public ParsingCachingProviders(IResourceStore resourceContainer)
         {
-            BruteForceSearcher = bruteForceSearcher ?? throw new ArgumentNullException(nameof(bruteForceSearcher));
             ResourceContainer = resourceContainer ?? throw new ArgumentNullException(nameof(resourceContainer));
         }
     }
diff --git a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
index 692f3c14..6927962e 100644
--- a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
+++ b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
@@ -8,44 +8,37 @@
     using Util.JetBrains.Annotations;
 
     /// <summary>
-    /// Store the results of a brute force search for all objects in the document so we only do it once.
+    /// Brute force search for all objects in the document.
     /// </summary>
-    internal class BruteForceSearcher
+    internal static class BruteForceSearcher
     {
         private const int MinimumSearchOffset = 6;
 
-        private readonly IInputBytes bytes;
-
-        private Dictionary<IndirectReference, long> objectLocations;
-
-        public BruteForceSearcher([NotNull] IInputBytes bytes)
-        {
-            this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
-        }
-
+        /// <summary>
+        /// Find the offset of every object contained in the document by searching the entire document contents.
+        /// </summary>
+        /// <param name="bytes">The bytes of the document.</param>
+        /// <returns>The object keys and offsets for the objects in this document.</returns>
         [NotNull]
-        public IReadOnlyDictionary<IndirectReference, long> GetObjectLocations()
+        public static IReadOnlyDictionary<IndirectReference, long> GetObjectLocations(IInputBytes bytes)
         {
-            if (objectLocations != null)
+            if (bytes == null)
             {
-                return objectLocations;
+                throw new ArgumentNullException(nameof(bytes));
             }
 
             var loopProtection = 0;
 
-            var lastEndOfFile = GetLastEndOfFileMarker();
+            var lastEndOfFile = GetLastEndOfFileMarker(bytes);
 
             var results = new Dictionary<IndirectReference, long>();
 
             var originPosition = bytes.CurrentOffset;
 
-            long currentOffset = MinimumSearchOffset;
-            long lastObjectId = long.MinValue;
-            int lastGenerationId = int.MinValue;
-            long lastObjOffset = long.MinValue;
+            var currentOffset = (long)MinimumSearchOffset;
+
+            var currentlyInObject = false;
 
-            bool inObject = false;
-            bool endobjFound = false;
             do
             {
                 if (loopProtection > 1_000_000)
@@ -55,7 +48,7 @@
 
                 loopProtection++;
 
-                if (inObject)
+                if (currentlyInObject)
                 {
                     if (bytes.CurrentByte == 'e')
                     {
@@ -65,8 +58,7 @@
                         {
                             if (ReadHelper.IsString(bytes, "endobj"))
                             {
-                                inObject = false;
-                                endobjFound = true;
+                                currentlyInObject = false;
                                 loopProtection = 0;
 
                                 for (int i = 0; i < "endobj".Length; i++)
@@ -139,31 +131,21 @@
 
                 results[new IndirectReference(obj, generation)] = bytes.CurrentOffset;
 
-                inObject = true;
-                endobjFound = false;
+                currentlyInObject = true;
 
                 currentOffset++;
 
                 bytes.Seek(currentOffset);
                 loopProtection = 0;
             } while (currentOffset < lastEndOfFile && !bytes.IsAtEnd());
-
-            if ((lastEndOfFile < long.MaxValue || endobjFound) && lastObjOffset > 0)
-            {
-                // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
-                // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
-                results[new IndirectReference(lastObjectId, lastGenerationId)] = lastObjOffset;
-            }
-
+            
             // reestablish origin position
             bytes.Seek(originPosition);
-
-            objectLocations = results;
-
-            return objectLocations;
+            
+            return results;
         }
 
-        private long GetLastEndOfFileMarker()
+        private static long GetLastEndOfFileMarker(IInputBytes bytes)
         {
             var originalOffset = bytes.CurrentOffset;
 
diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
index da6e9701..d4c21e48 100644
--- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
@@ -84,17 +84,15 @@
 
             CrossReferenceTable crossReferenceTable = null;
 
-            var bruteForceSearcher = new BruteForceSearcher(inputBytes);
             var xrefValidator = new XrefOffsetValidator(log);
-            var objectChecker = new XrefCosOffsetChecker(log, bruteForceSearcher);
 
             // We're ok with this since our intent is to lazily load the cross reference table.
             // ReSharper disable once AccessToModifiedClosure
-            var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, bruteForceSearcher);
+            var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes);
             var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance);
 
             var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
-            var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, objectChecker, crossReferenceStreamParser);
+            var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser);
             
             var version = FileHeaderParser.Parse(scanner, isLenientParsing, log);
             
@@ -144,7 +142,7 @@
                 new PageContentParser(new ReflectionGraphicsStateOperationFactory()), 
                 log);
 
-            var caching = new ParsingCachingProviders(bruteForceSearcher, resourceContainer);
+            var caching = new ParsingCachingProviders(resourceContainer);
 
             var acroFormFactory = new AcroFormFactory(pdfScanner, filterProvider, crossReferenceTable);
             var bookmarksProvider = new BookmarksProvider(log, pdfScanner);
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs
index 76a49cf8..e4c85550 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/ObjectLocationProvider.cs
@@ -15,7 +15,10 @@
         /// Since we want to scan objects while reading the cross reference table we lazily load it when it's ready.
         /// </summary>
         private readonly Func<CrossReferenceTable> crossReferenceTable;
-        private readonly BruteForceSearcher searcher;
+
+        private readonly IInputBytes bytes;
+
+        private IReadOnlyDictionary<IndirectReference, long> bruteForcedOffsets;
 
         /// <summary>
         /// Indicates whether we now have a cross reference table.
@@ -24,10 +27,10 @@
 
         private readonly Dictionary<IndirectReference, long> offsets = new Dictionary<IndirectReference, long>();
 
-        public ObjectLocationProvider(Func<CrossReferenceTable> crossReferenceTable, BruteForceSearcher searcher)
+        public ObjectLocationProvider(Func<CrossReferenceTable> crossReferenceTable, IInputBytes bytes)
         {
             this.crossReferenceTable = crossReferenceTable;
-            this.searcher = searcher;
+            this.bytes = bytes;
         }
 
         public bool TryGetOffset(IndirectReference reference, out long offset)
@@ -52,14 +55,12 @@
                 return true;
             }
 
-            var locations = searcher.GetObjectLocations();
-
-            if (locations.TryGetValue(reference, out offset))
+            if (bruteForcedOffsets == null)
             {
-                return true;
+                bruteForcedOffsets = BruteForceSearcher.GetObjectLocations(bytes);
             }
 
-            return false;
+            return bruteForcedOffsets.TryGetValue(reference, out offset);
         }
 
         public void UpdateOffset(IndirectReference reference, long offset)
diff --git a/src/UglyToad.PdfPig/Writer/PdfMerger.cs b/src/UglyToad.PdfPig/Writer/PdfMerger.cs
index 8d6e04cc..3588d389 100644
--- a/src/UglyToad.PdfPig/Writer/PdfMerger.cs
+++ b/src/UglyToad.PdfPig/Writer/PdfMerger.cs
@@ -73,14 +73,13 @@
 
                 var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log);
 
-                var bruteForceSearcher = new BruteForceSearcher(inputBytes);
-                var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log), new XrefCosOffsetChecker(Log, bruteForceSearcher), 
+                var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log),
                     new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider));
 
                 CrossReferenceTable crossReference = null;
 
                 // ReSharper disable once AccessToModifiedClosure
-                var locationProvider = new ObjectLocationProvider(() => crossReference, bruteForceSearcher);
+                var locationProvider = new ObjectLocationProvider(() => crossReference, inputBytes);
 
                 var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, FilterProvider, NoOpEncryptionHandler.Instance);