src/UglyToad.Pdf/Tokenization/Scanner/CoreTokenScanner.cs

namespace UglyToad.Pdf.Tokenization.Scanner
{
    using System;
    using System.Collections.Generic;
    using IO;
    using Parser.Parts;
    using Text.Operators;
    using Tokenization;
    using Tokens;

    /// <summary>
    /// Identifies the enclosing structure a <see cref="CoreTokenScanner"/> is reading inside of,
    /// which determines the closing delimiter that ends the scan.
    /// </summary>
    internal enum ScannerScope
    {
        /// <summary>
        /// No enclosing structure; this is the scanner's default scope.
        /// </summary>
        None = 0,

        /// <summary>
        /// Scanning the contents of an array; a <c>]</c> at the start of a symbol ends the scan.
        /// </summary>
        Array = 1,

        /// <summary>
        /// Scanning the contents of a dictionary; the scanner counts <c>&gt;</c> bytes and ends the scan on the second.
        /// </summary>
        Dictionary = 2
    }

    /// <summary>
    /// Reads tokens one at a time from a source of input bytes. The first byte of each symbol
    /// selects the tokenizer which is asked to read the rest of it.
    /// </summary>
    /// <remarks>
    /// Symbols which cannot be tokenized are skipped up to the next whitespace byte.
    /// When constructed with <see cref="ScannerScope.Array"/> a <c>]</c> ends the scan; with
    /// <see cref="ScannerScope.Dictionary"/> the second <c>&gt;</c> ends the scan.
    /// </remarks>
    public class CoreTokenScanner : ITokenScanner
    {
        private static readonly HexTokenizer HexTokenizer = new HexTokenizer();
        private static readonly StringTokenizer StringTokenizer = new StringTokenizer();
        private static readonly Tokenization.NumericTokenizer NumericTokenizer = new Tokenization.NumericTokenizer();
        private static readonly NameTokenizer NameTokenizer = new NameTokenizer();
        private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer();
        private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer();

        private readonly ScannerScope scope;
        private readonly IInputBytes inputBytes;

        /// <summary>
        /// The token read by the most recent call to <see cref="MoveNext"/> which returned <c>true</c>.
        /// It is not reset when <see cref="MoveNext"/> returns <c>false</c>.
        /// </summary>
        public IToken CurrentToken { get; private set; }

        // True when the tokenizer which produced the last token had to consume the first byte of the
        // following symbol, in which case the next scan starts from the current byte rather than advancing.
        private bool hasBytePreRead;

        /// <summary>
        /// Creates a scanner over the given bytes.
        /// </summary>
        /// <param name="inputBytes">The bytes to read tokens from.</param>
        /// <param name="scope">The enclosing structure being scanned, which decides the delimiter that ends the scan.</param>
        /// <exception cref="ArgumentNullException"><paramref name="inputBytes"/> is <c>null</c>.</exception>
        internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None)
        {
            this.scope = scope;
            this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
        }

        /// <summary>
        /// Advances to the next token, storing it in <see cref="CurrentToken"/>.
        /// </summary>
        /// <returns>
        /// <c>true</c> if a token was read; <c>false</c> if the input ran out or the closing
        /// delimiter for the current <see cref="ScannerScope"/> was reached.
        /// </returns>
        public bool MoveNext()
        {
            var endAngleBracesRead = 0;

            bool isSkippingSymbol = false;
            while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext())
            {
                hasBytePreRead = false;
                var currentByte = inputBytes.CurrentByte;
                var c = (char) currentByte;
                
                if (BaseTextComponentApproach.IsEmpty(currentByte)
                    || ReadHelper.IsWhitespace(currentByte))
                {
                    // Whitespace terminates any symbol we were skipping.
                    isSkippingSymbol = false;
                    continue;
                }

                /*
                 * If we failed to read the symbol for whatever reason we pass over it.
                 * A '>' inside a dictionary is exempt: the first '>' of the closing '>>' has no tokenizer
                 * so it starts a skip, if the second '>' were skipped as well the end of the dictionary
                 * could never be detected.
                 */
                var isDictionaryCloseByte = scope == ScannerScope.Dictionary && c == '>';
                if (isSkippingSymbol && !isDictionaryCloseByte)
                {
                    continue;
                }

                ITokenizer tokenizer = null;
                switch (c)
                {
                    case '(':
                        tokenizer = StringTokenizer;
                        break;
                    case '<':
                        var following = inputBytes.Peek();
                        if (following == '<')
                        {
                            isSkippingSymbol = true;
                            // TODO: Dictionary tokenizer
                        }
                        else
                        {
                            tokenizer = HexTokenizer;
                        }
                        break;
                    case '>' when scope == ScannerScope.Dictionary:
                        endAngleBracesRead++;
                        if (endAngleBracesRead == 2)
                        {
                            return false;
                        }
                        // A lone '>' has no tokenizer, it falls through to the skip handling below.
                        break;
                    case '[':
                        tokenizer = ArrayTokenizer;
                        break;
                    case ']' when scope == ScannerScope.Array:
                        return false;
                    case '/':
                        tokenizer = NameTokenizer;
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    case '8':
                    case '9':
                    case '-':
                    case '+':
                    case '.':
                        tokenizer = NumericTokenizer;
                        break;
                    default:
                        tokenizer = PlainTokenizer;
                        break;
                }

                if (tokenizer == null || !tokenizer.TryTokenize(currentByte, inputBytes, out var token))
                {
                    isSkippingSymbol = true;
                    hasBytePreRead = false;
                    continue;
                }

                CurrentToken = token;

                /* 
                 * Some tokenizers need to read the symbol of the next token to know if they have ended
                 * so we don't want to move on to the next byte, we would lose a byte, e.g.: /NameOne/NameTwo or /Name(string)                
                 */
                hasBytePreRead = tokenizer.ReadsNextByte;

                return true;
            }

            return false;
        }
    }
}
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								namespace UglyToad.Pdf.Tokenization.Scanner
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								{
 								    using System;
 								    using System.Collections.Generic;
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								    using IO;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								    using Parser.Parts;
 								    using Text.Operators;
 								    using Tokenization;
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								    using Tokens;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								    internal enum ScannerScope
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								    {
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								        None,
 								        Array,
 								        Dictionary
 								    }
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								    public class CoreTokenScanner : ITokenScanner
 								    {
-												rename hex tokenizer and add numeric tokenizer

											
										
										
											2017-11-10 21:13:45 +00:00
+								        private static readonly HexTokenizer HexTokenizer = new HexTokenizer();
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								        private static readonly StringTokenizer StringTokenizer = new StringTokenizer();
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								        private static readonly Tokenization.NumericTokenizer NumericTokenizer = new Tokenization.NumericTokenizer();
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								        private static readonly NameTokenizer NameTokenizer = new NameTokenizer();
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								        private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer();
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								        private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer();
 								        private readonly ScannerScope scope;
 								        private readonly IInputBytes inputBytes;
 								        private readonly List<byte> currentBuffer = new List<byte>();
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								        public IToken CurrentToken { get; private set; }
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								        private bool hasBytePreRead;
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								        internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None)
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								        {
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								            this.scope = scope;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								            this.inputBytes = inputBytes ?? throw new ArgumentNullException(nameof(inputBytes));
 								        }
 								        public bool MoveNext()
 								        {
 								            currentBuffer.Clear();
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								            var endAngleBracesRead = 0;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								            bool isSkippingSymbol = false;
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								            while ((hasBytePreRead && !inputBytes.IsAtEnd()) || inputBytes.MoveNext())
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								            {
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								                hasBytePreRead = false;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								                var currentByte = inputBytes.CurrentByte;
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								                var c = (char) currentByte;
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								                if (BaseTextComponentApproach.IsEmpty(currentByte)
 								                    || ReadHelper.IsWhitespace(currentByte))
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								                {
 								                    isSkippingSymbol = false;
 								                    continue;
 								                }
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								                // If we failed to read the symbol for whatever reason we pass over it.
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								                if (isSkippingSymbol)
 								                {
 								                    continue;
 								                }
 								                ITokenizer tokenizer = null;
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								                switch (c)
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								                {
 								                    case '(':
 								                        tokenizer = StringTokenizer;
 								                        break;
 								                    case '<':
 								                        var following = inputBytes.Peek();
 								                        if (following == '<')
 								                        {
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								                            isSkippingSymbol = true;
 								                            // TODO: Dictionary tokenizer
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								                        }
 								                        else
 								                        {
-												rename hex tokenizer and add numeric tokenizer

											
										
										
											2017-11-10 21:13:45 +00:00
+								                            tokenizer = HexTokenizer;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								                        }
 								                        break;
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								                    case '>' when scope == ScannerScope.Dictionary:
 								                        endAngleBracesRead++;
 								                        if (endAngleBracesRead == 2)
 								                        {
 								                            return false;
 								                        }
 								                        break;
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								                    case '[':
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								                        tokenizer = ArrayTokenizer;
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								                        break;
-												support for tokenizing arrays and nested arrays

											
										
										
											2017-11-12 14:42:01 +00:00
+								                    case ']' when scope == ScannerScope.Array:
 								                        return false;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								                    case '/':
 								                        tokenizer = NameTokenizer;
 								                        break;
 								                    case '0':
 								                    case '1':
 								                    case '2':
 								                    case '3':
 								                    case '4':
 								                    case '5':
 								                    case '6':
 								                    case '7':
 								                    case '8':
 								                    case '9':
 								                    case '-':
 								                    case '+':
 								                    case '.':
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								                        tokenizer = NumericTokenizer;
 								                        break;
 								                    default:
 								                        tokenizer = PlainTokenizer;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								                        break;
 								                }
 								                if (tokenizer == null || !tokenizer.TryTokenize(currentByte, inputBytes, out var token))
 								                {
 								                    isSkippingSymbol = true;
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								                    hasBytePreRead = false;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								                    continue;
 								                }
 								                CurrentToken = token;
-												add some more tokens and test the core token scanner for some initial test cases, seems like the approach should be valid.

											
										
										
											2017-11-12 01:08:09 +00:00
+								                /*
 								                 * Some tokenizers need to read the symbol of the next token to know if they have ended
 								                 * so we don't want to move on to the next byte, we would lose a byte, e.g.: /NameOne/NameTwo or /Name(string)
 								                 */
 								                hasBytePreRead = tokenizer.ReadsNextByte;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-09 19:14:09 +00:00
+								                return true;
 								            }
 								            return false;
 								        }
 								    }
 								}