remove all old parsing logic

2025-10-15 19:54:52 +08:00 · 2018-01-21 14:48:49 +00:00
parent da7d83d863
commit e24a306c31
12 changed files with 141 additions and 1021 deletions
--- a/src/UglyToad.PdfPig.Tests/Parser/Parts/CosDictionaryParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/CosDictionaryParserTests.cs
@@ -1,57 +0,0 @@
-// ReSharper disable ObjectCreationAsStatement
-
-namespace UglyToad.PdfPig.Tests.Parser.Parts
-{
-    using System;
-    using IO;
-    using PdfPig.Cos;
-    using PdfPig.Parser.Parts;
-    using Xunit;
-
-    public class CosDictionaryParserTests
-    {
-        private readonly CosNameParser nameParser = new CosNameParser();
-        private readonly CosDictionaryParser parser;
-
-        public CosDictionaryParserTests()
-        {
-            parser = new CosDictionaryParser(nameParser, new TestingLog());
-        }
-
-        [Fact]
-        public void NameParserIsNull_Throws()
-        {
-            Action action = () => new CosDictionaryParser(null, new TestingLog());
-
-            Assert.Throws<ArgumentNullException>(action);
-        }
-
-        [Fact]
-        public void RandomAccessReadIsNull_Throws()
-        {
-            var baseParser = new CosBaseParser(nameParser, new CosStringParser(), parser, new CosArrayParser());
-
-            Action action = () => parser.Parse(null, baseParser, new CosObjectPool());
-
-            Assert.Throws<ArgumentNullException>(action);
-        }
-
-        [Fact]
-        public void BaseParserIsNull_Throws()
-        {
-            Action action = () => parser.Parse(new RandomAccessBuffer(), null, new CosObjectPool());
-
-            Assert.Throws<ArgumentNullException>(action);
-        }
-
-        [Fact]
-        public void DocumentIsNull_Throws()
-        {
-            var baseParser = new CosBaseParser(nameParser, new CosStringParser(), parser, new CosArrayParser());
-
-            Action action = () => parser.Parse(new RandomAccessBuffer(), baseParser, null);
-
-            Assert.Throws<ArgumentNullException>(action);
-        }
-    }
-}
--- a/src/UglyToad.PdfPig.Tests/TestDictionaryParser.cs
+++ b/src/UglyToad.PdfPig.Tests/TestDictionaryParser.cs
@@ -1,23 +0,0 @@
-namespace UglyToad.PdfPig.Tests
-{
-    using IO;
-    using PdfPig.ContentStream;
-    using PdfPig.Cos;
-    using PdfPig.Parser.Parts;
-
-    internal class TestDictionaryParser : IDictionaryParser
-    {
-        public PdfDictionary Parse(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
-        {
-            return new PdfDictionary();
-        }
-    }
-
-    internal class TestBaseParser : IBaseParser
-    {
-        public CosBase Parse(IRandomAccessRead reader, CosObjectPool pool)
-        {
-            return CosNull.Null;
-        }
-    }
-}
--- a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceOffsetValidator.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceOffsetValidator.cs
@@ -1,5 +1,8 @@
 namespace UglyToad.PdfPig.Parser.FileStructure
 {
+    using IO;
+    using Tokenization.Scanner;
+
    internal class CrossReferenceOffsetValidator
    {
        private readonly XrefOffsetValidator offsetValidator;
@@ -9,9 +12,9 @@
            this.offsetValidator = offsetValidator;
        }

-        public long Validate(long crossReferenceOffset, bool isLenientParsing)
+        public long Validate(long crossReferenceOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader, bool isLenientParsing)
        {
-            long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceOffset, isLenientParsing);
+            long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceOffset, scanner, reader, isLenientParsing);
            if (fixedOffset > -1)
            {
                crossReferenceOffset = fixedOffset;
--- a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs
@@ -6,7 +6,6 @@
    using Exceptions;
    using IO;
    using Logging;
-    using Parts;
    using Parts.CrossReference;
    using Tokenization.Scanner;
    using Tokenization.Tokens;
@@ -14,19 +13,17 @@
    internal class CrossReferenceParser
    {
        private readonly ILog log;
-        private readonly CosDictionaryParser dictionaryParser;
-        private readonly CosBaseParser baseParser;
+        private readonly XrefOffsetValidator offsetValidator;
        private readonly CrossReferenceStreamParser crossReferenceStreamParser;
        private readonly CrossReferenceTableParser crossReferenceTableParser;
        private readonly XrefCosOffsetChecker xrefCosChecker;

-        public CrossReferenceParser(ILog log, CosDictionaryParser dictionaryParser, CosBaseParser baseParser,
+        public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator,
            CrossReferenceStreamParser crossReferenceStreamParser,
            CrossReferenceTableParser crossReferenceTableParser)
        {
            this.log = log;
-            this.dictionaryParser = dictionaryParser;
-            this.baseParser = baseParser;
+            this.offsetValidator = offsetValidator;
            this.crossReferenceStreamParser = crossReferenceStreamParser;
            this.crossReferenceTableParser = crossReferenceTableParser;

@@ -36,8 +33,7 @@
        public CrossReferenceTable Parse(IRandomAccessRead reader, bool isLenientParsing, long xrefLocation,
            CosObjectPool pool, IPdfTokenScanner pdfScanner, ISeekableTokenScanner tokenScanner)
        {
-            var xrefOffsetValidator = new XrefOffsetValidator(log, reader, dictionaryParser, baseParser, pool);
-            long fixedOffset = xrefOffsetValidator.CheckXRefOffset(xrefLocation, isLenientParsing);
+            long fixedOffset = offsetValidator.CheckXRefOffset(xrefLocation, tokenScanner, reader, isLenientParsing);
            if (fixedOffset > -1)
            {
                xrefLocation = fixedOffset;
@@ -81,7 +77,7 @@
                        int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;

                        // check the xref stream reference
-                        fixedOffset = xrefOffsetValidator.CheckXRefOffset(streamOffset, isLenientParsing);
+                        fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, reader, isLenientParsing);
                        if (fixedOffset > -1 && fixedOffset != streamOffset)
                        {
                            log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");
@@ -148,7 +144,7 @@
                    if (previousCrossReferenceLocation > 0)
                    {
                        // check the xref table reference
-                        fixedOffset = xrefOffsetValidator.CheckXRefOffset(previousCrossReferenceLocation, isLenientParsing);
+                        fixedOffset = offsetValidator.CheckXRefOffset(previousCrossReferenceLocation, tokenScanner, reader, isLenientParsing);
                        if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
                        {
                            previousCrossReferenceLocation = fixedOffset;
--- a/src/UglyToad.PdfPig/Parser/FileStructure/XrefOffsetValidator.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/XrefOffsetValidator.cs
@@ -2,37 +2,27 @@
 {
    using System;
    using System.Collections.Generic;
-    using ContentStream;
-    using Cos;
    using IO;
    using Logging;
    using Parts;
+    using Tokenization.Scanner;
+    using Tokenization.Tokens;

    internal class XrefOffsetValidator
    {
        private static readonly long MinimumSearchOffset = 6;

        private readonly ILog log;
-        private readonly IRandomAccessRead source;
-        private readonly CosDictionaryParser dictionaryParser;
-        private readonly CosBaseParser baseParser;
-        private readonly CosObjectPool pool;

-        private List<long> bfSearchXRefTablesOffsets = null;
-        private List<long> bfSearchXRefStreamsOffsets = null;
+        private List<long> bfSearchXRefTablesOffsets;
+        private List<long> bfSearchXRefStreamsOffsets;

-        public XrefOffsetValidator(ILog log, IRandomAccessRead source, CosDictionaryParser dictionaryParser, 
-            CosBaseParser baseParser,
-            CosObjectPool pool)
+        public XrefOffsetValidator(ILog log)
        {
            this.log = log;
-            this.source = source;
-            this.dictionaryParser = dictionaryParser;
-            this.baseParser = baseParser;
-            this.pool = pool;
        }

-        public long CheckXRefOffset(long startXRefOffset, bool isLenientParsing)
+        public long CheckXRefOffset(long startXRefOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader, bool isLenientParsing)
        {
            // repair mode isn't available in non-lenient mode
            if (!isLenientParsing)
@@ -40,127 +30,133 @@
                return startXRefOffset;
            }

-            source.Seek(startXRefOffset);
+            reader.Seek(startXRefOffset);

-            ReadHelper.SkipSpaces(source);
+            ReadHelper.SkipSpaces(reader);

-            if (source.Peek() == 'x' && ReadHelper.IsString(source, "xref"))
+            if (reader.Peek() == 'x' && ReadHelper.IsString(reader, "xref"))
            {
                return startXRefOffset;
            }
            if (startXRefOffset > 0)
            {
-                if (CheckXRefStreamOffset(source, startXRefOffset, true, pool))
+                if (CheckXRefStreamOffset(startXRefOffset, scanner, true))
                {
                    return startXRefOffset;
                }

-                return CalculateXRefFixedOffset(startXRefOffset);
+                return CalculateXRefFixedOffset(startXRefOffset, scanner, reader);
            }
+
            // can't find a valid offset
            return -1;
        }
-        
-        private long CalculateXRefFixedOffset(long objectOffset)
+
+        private long CalculateXRefFixedOffset(long objectOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader)
        {
            if (objectOffset < 0)
            {
-                // LOG.error("Invalid object offset " + objectOffset + " when searching for a xref table/stream");
+                log.Error($"Invalid object offset {objectOffset} when searching for a xref table/stream");
                return 0;
            }
+
            // start a brute force search for all xref tables and try to find the offset we are looking for
-            long newOffset = BfSearchForXRef(objectOffset);
+            long newOffset = BfSearchForXRef(objectOffset, scanner, reader);
            if (newOffset > -1)
            {
-                // LOG.debug("Fixed reference for xref table/stream " + objectOffset + " -> " + newOffset);
+                log.Debug($"Fixed reference for xref table/stream {objectOffset} -> {newOffset}");
                return newOffset;
            }
-            // LOG.error("Can't find the object xref table/stream at offset " + objectOffset);
+
+            log.Error($"Can\'t find the object xref table/stream at offset {objectOffset}");
+
            return 0;
        }

-        private void BfSearchForXRefStreams()
+        private void BfSearchForXRefStreams(IRandomAccessRead reader)
        {
-            if (bfSearchXRefStreamsOffsets == null)
+            if (bfSearchXRefStreamsOffsets != null)
            {
-                // a pdf may contain more than one /XRef entry
-                bfSearchXRefStreamsOffsets = new List<long>();
-                long originOffset = source.GetPosition();
-                source.Seek(MinimumSearchOffset);
-                // search for XRef streams
-                var objString = " obj";
-                while (!source.IsEof())
+                return;
+            }
+
+            // a pdf may contain more than one /XRef entry
+            bfSearchXRefStreamsOffsets = new List<long>();
+            long originOffset = reader.GetPosition();
+            reader.Seek(MinimumSearchOffset);
+            // search for XRef streams
+            var objString = " obj";
+            while (!reader.IsEof())
+            {
+                if (ReadHelper.IsString(reader, "xref"))
                {
-                    if (ReadHelper.IsString(source, "xref"))
+                    // search backwards for the beginning of the stream
+                    long newOffset = -1;
+                    long xrefOffset = reader.GetPosition();
+                    bool objFound = false;
+                    for (int i = 1; i < 40 && !objFound; i++)
                    {
-                        // search backwards for the beginning of the stream
-                        long newOffset = -1;
-                        long xrefOffset = source.GetPosition();
-                        bool objFound = false;
-                        for (int i = 1; i < 40 && !objFound; i++)
+                        long currentOffset = xrefOffset - (i * 10);
+                        if (currentOffset > 0)
                        {
-                            long currentOffset = xrefOffset - (i * 10);
-                            if (currentOffset > 0)
+                            reader.Seek(currentOffset);
+                            for (int j = 0; j < 10; j++)
                            {
-                                source.Seek(currentOffset);
-                                for (int j = 0; j < 10; j++)
+                                if (ReadHelper.IsString(reader, objString))
                                {
-                                    if (ReadHelper.IsString(source, objString))
+                                    long tempOffset = currentOffset - 1;
+                                    reader.Seek(tempOffset);
+                                    int genId = reader.Peek();
+                                    // is the next char a digit?
+                                    if (ReadHelper.IsDigit(genId))
                                    {
-                                        long tempOffset = currentOffset - 1;
-                                        source.Seek(tempOffset);
-                                        int genId = source.Peek();
-                                        // is the next char a digit?
-                                        if (ReadHelper.IsDigit(genId))
+                                        tempOffset--;
+                                        reader.Seek(tempOffset);
+                                        if (ReadHelper.IsSpace(reader))
                                        {
-                                            tempOffset--;
-                                            source.Seek(tempOffset);
-                                            if (ReadHelper.IsSpace(source))
+                                            int length = 0;
+                                            reader.Seek(--tempOffset);
+                                            while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(reader))
                                            {
-                                                int length = 0;
-                                                source.Seek(--tempOffset);
-                                                while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(source))
-                                                {
-                                                    source.Seek(--tempOffset);
-                                                    length++;
-                                                }
-                                                if (length > 0)
-                                                {
-                                                    source.Read();
-                                                    newOffset = source.GetPosition();
-                                                }
+                                                reader.Seek(--tempOffset);
+                                                length++;
+                                            }
+                                            if (length > 0)
+                                            {
+                                                reader.Read();
+                                                newOffset = reader.GetPosition();
                                            }
                                        }
-                                        objFound = true;
-                                        break;
-                                    }
-                                    else
-                                    {
-                                        currentOffset++;
-                                        source.Read();
                                    }
+                                    objFound = true;
+                                    break;
+                                }
+                                else
+                                {
+                                    currentOffset++;
+                                    reader.Read();
                                }
                            }
                        }
-                        if (newOffset > -1)
-                        {
-                            bfSearchXRefStreamsOffsets.Add(newOffset);
-                        }
-                        source.Seek(xrefOffset + 5);
                    }
-                    source.Read();
+                    if (newOffset > -1)
+                    {
+                        bfSearchXRefStreamsOffsets.Add(newOffset);
+                    }
+                    reader.Seek(xrefOffset + 5);
                }
-                source.Seek(originOffset);
+                reader.Read();
            }
+            reader.Seek(originOffset);
        }

-        private long BfSearchForXRef(long xrefOffset)
+        private long BfSearchForXRef(long xrefOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader)
        {
            long newOffset = -1;
            long newOffsetTable = -1;
            long newOffsetStream = -1;
-            BfSearchForXRefTables();
-            BfSearchForXRefStreams();
+            BfSearchForXRefTables(reader);
+            BfSearchForXRefStreams(reader);
            if (bfSearchXRefTablesOffsets != null)
            {
                // TODO to be optimized, this won't work in every case
@@ -200,31 +196,31 @@
            return newOffset;
        }

-        private void BfSearchForXRefTables()
+        private void BfSearchForXRefTables(IRandomAccessRead reader)
        {
            if (bfSearchXRefTablesOffsets == null)
            {
                // a pdf may contain more than one xref entry
                bfSearchXRefTablesOffsets = new List<long>();
-                long originOffset = source.GetPosition();
-                source.Seek(MinimumSearchOffset);
+                long originOffset = reader.GetPosition();
+                reader.Seek(MinimumSearchOffset);
                // search for xref tables
-                while (!source.IsEof())
+                while (!reader.IsEof())
                {
-                    if (ReadHelper.IsString(source, "xref"))
+                    if (ReadHelper.IsString(reader, "xref"))
                    {
-                        long newOffset = source.GetPosition();
-                        source.Seek(newOffset - 1);
+                        long newOffset = reader.GetPosition();
+                        reader.Seek(newOffset - 1);
                        // ensure that we don't read "startxref" instead of "xref"
-                        if (ReadHelper.IsWhitespace(source))
+                        if (ReadHelper.IsWhitespace(reader))
                        {
                            bfSearchXRefTablesOffsets.Add(newOffset);
                        }
-                        source.Seek(newOffset + 4);
+                        reader.Seek(newOffset + 4);
                    }
-                    source.Read();
+                    reader.Read();
                }
-                source.Seek(originOffset);
+                reader.Seek(originOffset);
            }
        }

@@ -252,7 +248,7 @@
            return newValue;
        }

-        private bool CheckXRefStreamOffset(IRandomAccessRead source, long startXRefOffset, bool isLenient, CosObjectPool pool)
+        private bool CheckXRefStreamOffset(long startXRefOffset, ISeekableTokenScanner scanner, bool isLenient)
        {
            // repair mode isn't available in non-lenient mode
            if (!isLenient || startXRefOffset == 0)
@@ -260,37 +256,41 @@
                return true;
            }
            // seek to offset-1 
-            source.Seek(startXRefOffset - 1);
-            int nextValue = source.Read();
-            // the first character has to be a whitespace, and then a digit
-            if (ReadHelper.IsWhitespace(nextValue))
+            scanner.Seek(startXRefOffset - 1);
+            if (scanner.TryReadToken(out NumericToken objectNumber))
             {
-                ReadHelper.SkipSpaces(source);
-                if (ReadHelper.IsDigit(source))
+                try
                 {
-                    try
+                    if (!scanner.TryReadToken(out NumericToken generation))
                     {
-                        // it's a XRef stream
-                        ObjectHelper.ReadObjectNumber(source);
-                        ObjectHelper.ReadGenerationNumber(source);
-
-                        ReadHelper.ReadExpectedString(source, "obj", true);
-
-                        // check the dictionary to avoid false positives
-                        PdfDictionary dict = dictionaryParser.Parse(source, baseParser, pool);
-                        source.Seek(startXRefOffset);
-                        
-                        if (dict.IsType(CosName.XREF))
-                        {
-                            return true;
-                        }
+                        log.Debug($"When checking offset at {startXRefOffset} did not find the generation number. Got: {objectNumber} {generation}.");
                     }
-                    catch (Exception ex)
+                    
+                    scanner.MoveNext();
+
+                    var obj = scanner.CurrentToken;
+
+                    if (!ReferenceEquals(obj, OperatorToken.StartObject))
                     {
-                        log.Error("Couldn't read the xref stream object.", ex);
-                        // there wasn't an object of a xref stream
-                        source.Seek(startXRefOffset);
+                        scanner.Seek(startXRefOffset);
+                        return false;
                     }
+
+                    // check the dictionary to avoid false positives; without a dictionary this cannot be an xref stream
+                    if (!scanner.TryReadToken(out DictionaryToken dictionary))
+                    {
+                        scanner.Seek(startXRefOffset);
+                        return false;
+                    }
+
+                    if (dictionary.TryGet(NameToken.Type, out var type) && NameToken.Xref.Equals(type))
+                    {
+                        return true;
+                    }
+                }
+                catch (Exception ex)
+                {
+                    log.Error("Couldn't read the xref stream object.", ex);
                 }
             }
             return false;
--- a/src/UglyToad.PdfPig/Parser/Parts/CosArrayParser.cs
+++ b/src/UglyToad.PdfPig/Parser/Parts/CosArrayParser.cs
@@ -1,71 +0,0 @@
-namespace UglyToad.PdfPig.Parser.Parts
-{
-    using ContentStream;
-    using Cos;
-    using IO;
-    using Util;
-
-    internal class CosArrayParser
-    {
-        public COSArray Parse(IRandomAccessRead reader, CosBaseParser baseParser, CosObjectPool pool)
-        {
-            ReadHelper.ReadExpectedChar(reader, '[');
-            var po = new COSArray();
-            CosBase pbo;
-            ReadHelper.SkipSpaces(reader);
-            int i;
-            while (((i = reader.Peek()) > 0) && ((char)i != ']'))
-            {
-                pbo = baseParser.Parse(reader, pool);
-                if (pbo is CosObject)
-                {
-                    // We have to check if the expected values are there or not PDFBOX-385
-                    if (po.get(po.size() - 1) is CosInt)
-                    {
-                        var genNumber = (CosInt)po.remove(po.size() - 1);
-                        if (po.get(po.size() - 1) is CosInt)
-                        {
-                            var number = (CosInt)po.remove(po.size() - 1);
-                            IndirectReference key = new IndirectReference(number.AsLong(), genNumber.AsInt());
-                            pbo = pool.Get(key);
-                        }
-                        else
-                        {
-                            // the object reference is somehow wrong
-                            pbo = null;
-                        }
-                    }
-                    else
-                    {
-                        pbo = null;
-                    }
-                }
-                if (pbo != null)
-                {
-                    po.add(pbo);
-                }
-                else
-                {
-                    //it could be a bad object in the array which is just skipped
-                    // LOG.warn("Corrupt object reference at offset " + seqSource.getPosition());
-
-                    // This could also be an "endobj" or "endstream" which means we can assume that
-                    // the array has ended.
-                    string isThisTheEnd = ReadHelper.ReadString(reader);
-                    reader.Unread(OtherEncodings.StringAsLatin1Bytes(isThisTheEnd));
-                    if (string.Equals(isThisTheEnd, "endobj") || string.Equals(isThisTheEnd, "endstream"))
-                    {
-                        return po;
-                    }
-                }
-
-                ReadHelper.SkipSpaces(reader);
-            }
-            // read ']'
-            reader.Read();
-            ReadHelper.SkipSpaces(reader);
-            return po;
-        }
-    }
-}
-
--- a/src/UglyToad.PdfPig/Parser/Parts/CosBaseParser.cs
+++ b/src/UglyToad.PdfPig/Parser/Parts/CosBaseParser.cs
@@ -1,166 +0,0 @@
-namespace UglyToad.PdfPig.Parser.Parts
-{
-    using System.IO;
-    using System.Text;
-    using ContentStream;
-    using Cos;
-    using IO;
-    using Util;
-
-    internal interface IBaseParser
-    {
-        CosBase Parse(IRandomAccessRead reader, CosObjectPool pool);
-    }
-
-    internal class CosBaseParser : IBaseParser
-    {
-        private readonly CosNameParser nameParser;
-        private readonly CosStringParser stringParser;
-        private readonly CosDictionaryParser dictionaryParser;
-        private readonly CosArrayParser arrayParser;
-
-        public CosBaseParser(CosNameParser nameParser, CosStringParser stringParser, 
-            CosDictionaryParser dictionaryParser, CosArrayParser arrayParser)
-        {
-            this.nameParser = nameParser;
-            this.stringParser = stringParser;
-            this.dictionaryParser = dictionaryParser;
-            this.arrayParser = arrayParser;
-        }
-
-        public CosBase Parse(IRandomAccessRead reader, CosObjectPool pool)
-        {
-            CosBase retval = null;
-
-            ReadHelper.SkipSpaces(reader);
-            int nextByte = reader.Peek();
-
-            if (nextByte == -1)
-            {
-                return null;
-            }
-
-            char c = (char)nextByte;
-            switch (c)
-            {
-                case '<':
-                    {
-                        // pull off first left bracket
-                        int leftBracket = reader.Read();
-                        // check for second left bracket
-                        c = (char)reader.Peek();
-                        reader.Unread(leftBracket);
-                        if (c == '<')
-                        {
-                            retval = dictionaryParser.Parse(reader, this, pool);
-                            ReadHelper.SkipSpaces(reader);
-                        }
-                        else
-                        {
-                            retval = stringParser.Parse(reader);
-                        }
-                        break;
-                    }
-                case '[':
-                    {
-                        // array
-                        retval = arrayParser.Parse(reader, this, pool);
-                        break;
-                    }
-                case '(':
-                    retval = stringParser.Parse(reader);
-                    break;
-                case '/':
-                    // name
-                    retval = nameParser.Parse(reader);
-                    break;
-                case 'n':
-                    {
-                        // null
-                        ReadHelper.ReadExpectedString(reader, "null");
-                        retval = CosNull.Null;
-                        break;
-                    }
-                case 't':
-                    {
-                        string truestring = OtherEncodings.BytesAsLatin1String(reader.ReadFully(4));
-                        if (truestring.Equals("true"))
-                        {
-                            retval = PdfBoolean.True;
-                        }
-                        else
-                        {
-                            throw new IOException("expected true actual='" + truestring + "' " + reader +
-                            "' at offset " + reader.GetPosition());
-                        }
-                        break;
-                    }
-                case 'f':
-                    {
-                        string falsestring = OtherEncodings.BytesAsLatin1String(reader.ReadFully(5));
-                        if (falsestring.Equals("false"))
-                        {
-                            retval = PdfBoolean.False;
-                        }
-                        else
-                        {
-                            throw new IOException("expected false actual='" + falsestring + "' " + reader +
-                            "' at offset " + reader.GetPosition());
-                        }
-                        break;
-                    }
-                case 'R':
-                    reader.Read();
-                    retval = new CosObject(null);
-                    break;
-                default:
-
-                    if (char.IsDigit(c) || c == '-' || c == '+' || c == '.')
-                    {
-                        StringBuilder buf = new StringBuilder();
-                        int ic = reader.Read();
-                        c = (char)ic;
-                        while (char.IsDigit(c) ||
-                        c == '-' ||
-                        c == '+' ||
-                        c == '.' ||
-                        c == 'E' ||
-                        c == 'e')
-                        {
-                            buf.Append(c);
-                            ic = reader.Read();
-                            c = (char)ic;
-                        }
-                        if (ic != -1)
-                        {
-                            reader.Unread(ic);
-                        }
-                        retval = CosNumberFactory.get(buf.ToString()) as CosBase;
-                    }
-                    else
-                    {
-                        //This is not suppose to happen, but we will allow for it
-                        //so we are more compatible with POS writers that don't
-                        //follow the spec
-                        string badstring = ReadHelper.ReadString(reader);
-                        if (badstring == string.Empty)
-                        {
-                            int peek = reader.Peek();
-                            // we can end up in an infinite loop otherwise
-                            throw new IOException("Unknown dir object c='" + c +
-                            "' cInt=" + (int)c + " peek='" + (char)peek
-                            + "' peekInt=" + peek + " at offset " + reader.GetPosition());
-                        }
-
-                        // if it's an endstream/endobj, we want to put it back so the caller will see it
-                        if (string.Equals("endobj", badstring) || string.Equals("endstream", badstring))
-                        {
-                            reader.Unread(OtherEncodings.StringAsLatin1Bytes(badstring));
-                        }
-                    }
-                    break;
-            }
-            return retval;
-        }
-    }
-}
--- a/src/UglyToad.PdfPig/Parser/Parts/CosDictionaryParser.cs
+++ b/src/UglyToad.PdfPig/Parser/Parts/CosDictionaryParser.cs
@@ -1,205 +0,0 @@
-namespace UglyToad.PdfPig.Parser.Parts
-{
-    using System;
-    using ContentStream;
-    using Cos;
-    using IO;
-    using Logging;
-    using Util;
-    using Util.JetBrains.Annotations;
-
-    internal interface IDictionaryParser
-    {
-        PdfDictionary Parse(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool);
-    }
-
-    internal class CosDictionaryParser : IDictionaryParser
-    {
-        private readonly ILog log;
-        private readonly CosNameParser nameParser;
-
-        protected static readonly int E = 'e';
-        protected static readonly int N = 'n';
-        protected static readonly int D = 'd';
-
-        protected static readonly int S = 's';
-        protected static readonly int T = 't';
-        protected static readonly int R = 'r';
-        protected static readonly int A = 'a';
-        protected static readonly int M = 'm';
-
-        protected static readonly int O = 'o';
-        protected static readonly int B = 'b';
-        protected static readonly int J = 'j';
-
-        public CosDictionaryParser(CosNameParser nameParser, ILog log)
-        {
-            this.log = log;
-            this.nameParser = nameParser ?? throw new ArgumentNullException();
-        }
-
-        public PdfDictionary Parse(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
-        {
-            if (reader == null)
-            {
-                throw new ArgumentNullException(nameof(reader));
-            }
-
-            if (baseParser == null)
-            {
-                throw new ArgumentNullException(nameof(baseParser));
-            }
-
-            if (pool == null)
-            {
-                throw new ArgumentNullException(nameof(pool));
-            }
-
-            ReadHelper.ReadExpectedChar(reader, '<');
-            ReadHelper.ReadExpectedChar(reader, '<');
-            ReadHelper.SkipSpaces(reader);
-            
-            var dictionary = new PdfDictionary();
-
-            var done = false;
-            while (!done)
-            {
-                ReadHelper.SkipSpaces(reader);
-
-                var c = (char)reader.Peek();
-
-                switch (c)
-                {
-                    case '>':
-                        done = true;
-                        break;
-                    case '/':
-                        var nameValue = ParseCosDictionaryNameValuePair(reader, baseParser, pool);
-
-                        if (nameValue.key != null && nameValue.value != null)
-                        {
-                            dictionary.Set(nameValue.key, nameValue.value);
-                        }
-
-                        break;
-                    default:
-                        if (ReadUntilEnd(reader))
-                        {
-                            return new PdfDictionary();
-                        }
-                        break;
-                }
-            }
-
-            ReadHelper.ReadExpectedString(reader, ">>");
-            
-            return dictionary;
-        }
-
-        [ItemCanBeNull]
-        private (CosName key, CosBase value) ParseCosDictionaryNameValuePair(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
-        {
-            var key = nameParser.Parse(reader);
-            var value = ParseValue(reader, baseParser, pool);
-            ReadHelper.SkipSpaces(reader);
-
-            if ((char)reader.Peek() == 'd')
-            {
-                // if the next string is 'def' then we are parsing a cmap stream
-                // and want to ignore it, otherwise throw an exception.
-                var potentialDef = ReadHelper.ReadString(reader);
-                if (!potentialDef.Equals("def"))
-                {
-                    reader.Unread(OtherEncodings.StringAsLatin1Bytes(potentialDef));
-                }
-                else
-                {
-                    ReadHelper.SkipSpaces(reader);
-                }
-            }
-
-            if (value == null)
-            {
-                log?.Warn("Bad Dictionary Declaration " + ReadHelper.ReadString(reader));
-                return (null, null);
-            }
-            
-            // label this item as direct, to avoid signature problems.
-            value.Direct = true;
-
-            return (key, value);
-        }
-        
-        private static CosBase ParseValue(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
-        {
-            var numOffset = reader.GetPosition();
-            var value = baseParser.Parse(reader, pool);
-
-            ReadHelper.SkipSpaces(reader);
-
-            // proceed if the given object is a number and the following is a number as well
-            if (!(value is ICosNumber) || !ReadHelper.IsDigit(reader))
-            {
-                return value;
-            }
-            // read the remaining information of the object number
-            var genOffset = reader.GetPosition();
-            var generationNumber = baseParser.Parse(reader, pool);
-            ReadHelper.SkipSpaces(reader);
-            ReadHelper.ReadExpectedChar(reader, 'R');
-            if (!(value is CosInt))
-            {
-                throw new InvalidOperationException("expected number, actual=" + value + " at offset " + numOffset);
-            }
-            if (!(generationNumber is CosInt))
-            {
-                throw new InvalidOperationException("expected number, actual=" + value + " at offset " + genOffset);
-            }
-
-            var key = new IndirectReference(((CosInt)value).AsLong(), ((CosInt)generationNumber).AsInt());
-
-            // dereference the object
-            return pool.Get(key);
-        }
-
-        private static bool ReadUntilEnd(IRandomAccessRead reader)
-        {
-            var c = reader.Read();
-            while (c != -1 && c != '/' && c != '>')
-            {
-                // in addition to stopping when we find / or >, we also want
-                // to stop when we find endstream or endobj.
-                if (c == E)
-                {
-                    c = reader.Read();
-                    if (c == N)
-                    {
-                        c = reader.Read();
-                        if (c == D)
-                        {
-                            c = reader.Read();
-                            var isStream = c == S && reader.Read() == T && reader.Read() == R
-                                           && reader.Read() == E && reader.Read() == A && reader.Read() == M;
-                            var isObj = !isStream && c == O && reader.Read() == B && reader.Read() == J;
-                            if (isStream || isObj)
-                            {
-                                // we're done reading this object!
-                                return true;
-                            }
-                        }
-                    }
-                }
-                c = reader.Read();
-            }
-            if (c == -1)
-            {
-                return true;
-            }
-            reader.Unread(c);
-            return false;
-        }
-    }
-}
-
-
-
--- a/src/UglyToad.PdfPig/Parser/Parts/CosNameParser.cs
+++ b/src/UglyToad.PdfPig/Parser/Parts/CosNameParser.cs
@@ -1,88 +0,0 @@
-namespace UglyToad.PdfPig.Parser.Parts
-{
-    using System;
-    using System.IO;
-    using System.Text;
-    using Cos;
-    using IO;
-    using Util.JetBrains.Annotations;
-
-    internal class CosNameParser
-    {
-        [NotNull]
-        public CosName Parse([NotNull]IRandomAccessRead reader)
-        {
-            if (reader == null)
-            {
-                throw new ArgumentNullException(nameof(reader));
-            }
-
-            ReadHelper.ReadExpectedChar(reader, '/');
-
-            using (var memoryStream = new MemoryStream())
-            using (var writer = new BinaryWriter(memoryStream))
-            {
-                int c = reader.Read();
-                while (c != -1)
-                {
-                    byte ch = (byte)c;
-                    if (ch == '#')
-                    {
-                        int ch1 = reader.Read();
-                        int ch2 = reader.Read();
-                        // Prior to PDF v1.2, the # was not a special character.  Also,
-                        // it has been observed that various PDF tools do not follow the
-                        // spec with respect to the # escape, even though they report
-                        // PDF versions of 1.2 or later.  The solution here is that we
-                        // interpret the # as an escape only when it is followed by two
-                        // valid hex digits.
-                        if (ReadHelper.IsHexDigit((char)ch1) && ReadHelper.IsHexDigit((char)ch2))
-                        {
-                            string hex = "" + (char)ch1 + (char)ch2;
-                            try
-                            {
-                                var byteToWrite = (byte)Convert.ToInt32(hex, 16);
-                                writer.Write(byteToWrite);
-                            }
-                            catch (FormatException e)
-                            {
-                                throw new IOException("Error: expected hex digit, actual='" + hex + "'", e);
-                            }
-                            c = reader.Read();
-                        }
-                        else
-                        {
-                            // check for premature EOF
-                            if (ch2 == -1 || ch1 == -1)
-                            {
-                                //LOG.error("Premature EOF in BaseParser#parseCosName");
-                                c = -1;
-                                break;
-                            }
-                            reader.Unread(ch2);
-                            c = ch1;
-                            writer.Write(ch);
-                        }
-                    }
-                    else if (ReadHelper.IsEndOfName(ch))
-                    {
-                        break;
-                    }
-                    else
-                    {
-                        writer.Write(ch);
-                        c = reader.Read();
-                    }
-                }
-                if (c != -1)
-                {
-                    reader.Unread(c);
-                }
-
-                byte[] bytes = memoryStream.ToArray();
-                var str = ReadHelper.IsValidUtf8(bytes) ? Encoding.UTF8.GetString(memoryStream.ToArray()) : Encoding.GetEncoding("windows-1252").GetString(memoryStream.ToArray());
-                return CosName.Create(str);
-            }
-        }
-    }
-}
--- a/src/UglyToad.PdfPig/Parser/Parts/CosStringParser.cs
+++ b/src/UglyToad.PdfPig/Parser/Parts/CosStringParser.cs
@@ -1,260 +0,0 @@
-namespace UglyToad.PdfPig.Parser.Parts
-{
-    using System;
-    using System.IO;
-    using System.Text;
-    using Cos;
-    using IO;
-
-    internal class CosStringParser
-    {
-        public CosString Parse(IRandomAccessRead seqSource)
-        {
-            char nextChar = (char)seqSource.Read();
-            if (nextChar == '<')
-            {
-                return ParseHexString(seqSource);
-            }
-
-            if (nextChar != '(')
-            {
-                throw new IOException("parseCOSstring string should start with '(' or '<' and not '" +
-                nextChar + "' " + seqSource);
-            }
-            
-            using (var memoryStream = new MemoryStream())
-            using (var writer = new StreamWriter(memoryStream))
-            {
-                // This is the number of braces read
-                int braces = 1;
-                int c = seqSource.Read();
-                while (braces > 0 && c != -1)
-                {
-                    char ch = (char) c;
-                    int nextc = -2; // not yet read
-
-                    if (ch == ')')
-                    {
-
-                        braces--;
-                        braces = CheckForEndOfString(seqSource, braces);
-                        if (braces != 0)
-                        {
-                            writer.Write(ch);
-                        }
-                    }
-                    else if (ch == '(')
-                    {
-                        braces++;
-                        writer.Write(ch);
-                    }
-                    else if (ch == '\\')
-                    {
-                        //patched by ram
-                        char next = (char) seqSource.Read();
-                        switch (next)
-                        {
-                            case 'n':
-                                writer.Write('\n');
-                                break;
-                            case 'r':
-                                writer.Write('\r');
-                                break;
-                            case 't':
-                                writer.Write('\t');
-                                break;
-                            case 'b':
-                                writer.Write('\b');
-                                break;
-                            case 'f':
-                                writer.Write('\f');
-                                break;
-                            case ')':
-                                // PDFBox 276 /Title (c:\)
-                                braces = CheckForEndOfString(seqSource, braces);
-                                if (braces != 0)
-                                {
-                                    writer.Write(next);
-                                }
-                                else
-                                {
-                                    writer.Write('\\');
-                                }
-                                break;
-                            case '(':
-                            case '\\':
-                                writer.Write(next);
-                                break;
-                            case '0':
-                            case '1':
-                            case '2':
-                            case '3':
-                            case '4':
-                            case '5':
-                            case '6':
-                            case '7':
-                            {
-                                var octal = new StringBuilder();
-                                octal.Append(next);
-                                c = seqSource.Read();
-                                char digit = (char) c;
-                                if (digit >= '0' && digit <= '7')
-                                {
-                                    octal.Append(digit);
-                                    c = seqSource.Read();
-                                    digit = (char) c;
-                                    if (digit >= '0' && digit <= '7')
-                                    {
-                                        octal.Append(digit);
-                                    }
-                                    else
-                                    {
-                                        nextc = c;
-                                    }
-                                }
-                                else
-                                {
-                                    nextc = c;
-                                }
-
-                                int character;
-                                try
-                                {
-                                    character = Convert.ToInt32(octal.ToString(), 8);
-                                }
-                                catch (FormatException e)
-                                {
-                                    throw new IOException("Error: Expected octal character, actual='" + octal + "'", e);
-                                }
-
-                                writer.Write(character);
-                                break;
-                            }
-                            default:
-                                if (c == ReadHelper.AsciiCarriageReturn || c == ReadHelper.AsciiLineFeed)
-                                {
-                                    // this is a break in the line so ignore it and the newline and continue
-                                    c = seqSource.Read();
-                                    while (ReadHelper.IsEndOfLine(c) && c != -1)
-                                    {
-                                        c = seqSource.Read();
-                                    }
-
-                                    nextc = c;
-
-                                    break;
-                                }
-                                // dropping the backslash
-                                // see 7.3.4.2 Literal strings for further information
-                                writer.Write(next);
-                                break;
-
-                        }
-                    }
-                    else
-                    {
-                        writer.Write(ch);
-                    }
-                    if (nextc != -2)
-                    {
-                        c = nextc;
-                    }
-                    else
-                    {
-                        c = seqSource.Read();
-                    }
-                }
-                if (c != -1)
-                {
-                    seqSource.Unread(c);
-                }
-                writer.Flush();
-                return new CosString(memoryStream.ToArray());
-            }
-        }
-
-        private static int CheckForEndOfString(IRandomAccessRead reader, int bracesParameter)
-        {
-            int braces = bracesParameter;
-            byte[] nextThreeBytes = new byte[3];
-            int amountRead = reader.Read(nextThreeBytes);
-
-            // Check the next 3 bytes if available
-            // The following cases are valid indicators for the end of the string
-            // 1. Next line contains another COSObject: CR + LF + '/'
-            // 2. CosDictionary ends in the next line: CR + LF + '>'
-            // 3. Next line contains another COSObject: CR + '/'
-            // 4. CosDictionary ends in the next line: CR + '>'
-            if (amountRead == 3 && nextThreeBytes[0] == ReadHelper.AsciiCarriageReturn)
-            {
-                if (nextThreeBytes[1] == ReadHelper.AsciiLineFeed && nextThreeBytes[2] == '/' || nextThreeBytes[2] == '>'
-                    || nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>')
-                {
-                    braces = 0;
-                }
-            }
-            if (amountRead > 0)
-            {
-                reader.Unread(nextThreeBytes, 0, amountRead);
-            }
-            return braces;
-        }
-
-        /// <summary>
-        /// This will parse a PDF HEX string with fail fast semantic meaning that we stop if a not allowed character is found.
-        /// This is necessary in order to detect malformed input and be able to skip to next object start.
-        /// We assume starting '&lt;' was already read.
-        /// </summary>
-        private static CosString ParseHexString(IRandomAccessRead reader)
-        {
-            var sBuf = new StringBuilder();
-            while (true)
-            {
-                int c = reader.Read();
-                if (ReadHelper.IsHexDigit((char)c))
-                {
-                    sBuf.Append((char)c);
-                }
-                else if (c == '>')
-                {
-                    break;
-                }
-                else if (c < 0)
-                {
-                    throw new IOException("Missing closing bracket for hex string. Reached EOS.");
-                }
-                else if (c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\b' || c == '\f')
-                {
-                }
-                else
-                {
-                    // if invalid chars was found: discard last
-                    // hex character if it is not part of a pair
-                    if (sBuf.Length % 2 != 0)
-                    {
-                        sBuf.Remove(sBuf.Length - 1, 1);
-                    }
-
-                    // read till the closing bracket was found
-                    do
-                    {
-                        c = reader.Read();
-                    }
-                    while (c != '>' && c >= 0);
-
-                    // might have reached EOF while looking for the closing bracket
-                    // this can happen for malformed PDFs only. Make sure that there is
-                    // no endless loop.
-                    if (c < 0)
-                    {
-                        throw new IOException("Missing closing bracket for hex string. Reached EOS.");
-                    }
-
-                    // exit loop
-                    break;
-                }
-            }
-            return CosString.ParseHex(sBuf.ToString());
-        }
-    }
-}
--- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
@@ -17,6 +17,7 @@
    using IO;
    using Logging;
    using Parts;
+    using Parts.CrossReference;
    using Tokenization.Scanner;
    using Tokenization.Tokens;
    using Util;
@@ -64,18 +65,21 @@
            var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, pool, bruteForceSearcher);
            var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider);

+            var xrefValidator = new XrefOffsetValidator(log);
+
+            var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
+            var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser, new CrossReferenceTableParser());
+            
            var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
            
            var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);
            
            // TODO: make this use the scanner.
-            var validator = new CrossReferenceOffsetValidator(new XrefOffsetValidator(log, reader, container.Get<CosDictionaryParser>(),
-                container.Get<CosBaseParser>(), pool));
+            var validator = new CrossReferenceOffsetValidator(xrefValidator);

-            crossReferenceOffset = validator.Validate(crossReferenceOffset, isLenientParsing);
+            crossReferenceOffset = validator.Validate(crossReferenceOffset, scanner, reader, isLenientParsing);
            
-            crossReferenceTable = container.Get<CrossReferenceParser>()
-                .Parse(reader, isLenientParsing, crossReferenceOffset, pool, pdfScanner, scanner);
+            crossReferenceTable = crossReferenceParser.Parse(reader, isLenientParsing, crossReferenceOffset, pool, pdfScanner, scanner);
            
            var trueTypeFontParser = new TrueTypeFontParser();
            var fontDescriptorFactory = new FontDescriptorFactory();
--- a/src/UglyToad.PdfPig/Util/Bootstrapper.cs
+++ b/src/UglyToad.PdfPig/Util/Bootstrapper.cs
@@ -4,8 +4,6 @@
    using Fonts.Parser;
    using Logging;
    using Parser.FileStructure;
-    using Parser.Parts;
-    using Parser.Parts.CrossReference;

    internal static class Bootstrapper
    {
@@ -30,13 +28,7 @@

            var headerParser = new FileHeaderParser(logger);
            var trailerParser = new FileTrailerParser();
-            var nameParser = new CosNameParser();
-            var dictionaryParser = new CosDictionaryParser(nameParser, logger);
-            var baseParser = new CosBaseParser(nameParser, new CosStringParser(), dictionaryParser, new CosArrayParser());
            var filterProvider = new MemoryFilterProvider(new DecodeParameterResolver(logger), new PngPredictor(), logger);
-            var crossReferenceParser = new CrossReferenceStreamParser(filterProvider);
-
-            var crossReferenceTableParser = new CrossReferenceParser(logger, dictionaryParser, baseParser, crossReferenceParser, new CrossReferenceTableParser());
            
            var cmapParser = new CMapParser();
            var afmParser = new AdobeFontMetricsParser();
@@ -44,11 +36,6 @@
            var container = new Container();
            container.Register(headerParser);
            container.Register(trailerParser);
-            container.Register(nameParser);
-            container.Register(dictionaryParser);
-            container.Register(baseParser);
-            container.Register(crossReferenceParser);
-            container.Register(crossReferenceTableParser);
            container.Register(filterProvider);
            container.Register(cmapParser);
            container.Register(afmParser);