Avoid a lot of seeks by making most tokenizers no longer read too far by using seek.

Optimize the FirstPassParser to just fetch a final chunk before doing things char-by-char backwards.
2025-11-24 08:47:01 +08:00 · 2025-10-16 11:36:49 +02:00
parent 40bcc22ea1
commit e11dc6bf40
19 changed files with 177 additions and 95 deletions
--- a/src/UglyToad.PdfPig.Core/ReadHelper.cs
+++ b/src/UglyToad.PdfPig.Core/ReadHelper.cs
@@ -24,12 +24,17 @@
        /// </summary>
        public const byte AsciiCarriageReturn = 13;
        /// <summary>
        /// The tab '\t' character.
        /// </summary>
        public const byte AsciiTab = 9;
        private static readonly HashSet<int> EndOfNameCharacters =
        [
            ' ',
            AsciiCarriageReturn,
            AsciiLineFeed,
-            9,
+            AsciiTab,
            '>',
            '<',
            '[',
--- a/src/UglyToad.PdfPig.Core/StreamInputBytes.cs
+++ b/src/UglyToad.PdfPig.Core/StreamInputBytes.cs
@@ -96,6 +96,17 @@
        /// <inheritdoc />
        public void Seek(long position)
        {
            var current = CurrentOffset;
            if (position == current)
            {
                return;
            }
            else if (peekByte.HasValue && position == current + 1)
            {
                MoveNext();
                return;
            }
            isAtEnd = false;
            peekByte = null;
--- a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1ArrayTokenizer.cs
+++ b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1ArrayTokenizer.cs
@@ -14,7 +14,7 @@
        /// <inheritdoc />
        public bool ReadsNextByte { get; } = false;
-        private static readonly string[] Space = [" "];
+        private static readonly char[] Space = [' '];
        /// <inheritdoc />
        public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
--- a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs
+++ b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1FontParser.cs
@@ -88,6 +88,11 @@
                        {
                            int offset = 0;
                            while (inputBytes.Peek() is { } b && ReadHelper.IsWhitespace(b))
                            {
                                inputBytes.MoveNext();
                            }
                            while (inputBytes.MoveNext())
                            {
                                if (inputBytes.CurrentByte == (byte)ClearToMark[offset])
--- a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1Tokenizer.cs
+++ b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1Tokenizer.cs
@@ -2,6 +2,7 @@
 {
    using System;
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.Globalization;
    using System.Text;
    using Core;
@@ -41,35 +42,43 @@
            do
            {
                skip = false;
-                while (bytes.MoveNext())
+                while (bytes.Peek() is { } b)
                {
                    var b = bytes.CurrentByte;
                    var c = (char)b;
                    switch (c)
                    {
                        case '%':
                            bytes.MoveNext();
                            comments.Add(ReadComment());
                            break;
                        case '(':
                            bytes.MoveNext();
                            return ReadString();
                        case ')':
                            throw new InvalidOperationException("Encountered an end of string ')' outside of string.");
                        case '[':
                            bytes.MoveNext();
                            return new Type1Token(c, Type1Token.TokenType.StartArray);
                        case ']':
                            bytes.MoveNext();
                            return new Type1Token(c, Type1Token.TokenType.EndArray);
                        case '{':
                            bytes.MoveNext();
                            return new Type1Token(c, Type1Token.TokenType.StartProc);
                        case '}':
                            bytes.MoveNext();
                            return new Type1Token(c, Type1Token.TokenType.EndProc);
                        case '/':
                            {
-                                var name = ReadLiteral();
+                                bytes.MoveNext();
                                TryReadLiteral(out var name);
                                Debug.Assert(name != null);
                                return new Type1Token(name, Type1Token.TokenType.Literal);
                            }
                        case '<':
                            {
                                bytes.MoveNext();
                                var following = bytes.Peek();
                                if (following == '<')
                                {
@@ -81,6 +90,7 @@
                            }
                        case '>':
                            {
                                bytes.MoveNext();
                                var following = bytes.Peek();
                                if (following == '>')
                                {
@@ -94,23 +104,24 @@
                            {
                                if (ReadHelper.IsWhitespace(b))
                                {
                                    bytes.MoveNext();
                                    skip = true;
                                    break;
                                }
                                if (b == 0)
                                {
                                    bytes.MoveNext();
                                    skip = true;
                                    break;
                                }
-                                if (TryReadNumber(c, out var number))
+                                if (TryReadNumber(out var number))
                                {
                                    return number;
                                }
-                                var name = ReadLiteral(c);
+                                if (!TryReadLiteral(out var name))
                                if (name == null)
                                {
                                    throw new InvalidOperationException($"The binary portion of the type 1 font was invalid at position {bytes.CurrentOffset}.");
                                }
@@ -197,12 +208,21 @@
            return null;
        }
-        private bool TryReadNumber(char c, out Type1Token numberToken)
+        private bool TryReadNumber(out Type1Token numberToken)
        {
            char GetNext()
            {
                bytes.MoveNext();
-                return (char)bytes.CurrentByte;
+                return (char)(bytes.Peek() ?? 0);
            }
            char c = (char)(bytes.Peek() ?? 0);
            if (!((c >= '0' && c <= '9') || c is '+' or '-'))
            {
                // Easy out. Not a valid number
                numberToken = null;
                return false;
            }
            numberToken = null;
@@ -251,8 +271,6 @@
            else
            {
                // integer
                bytes.Seek(bytes.CurrentOffset - 1);
                numberToken = new Type1Token(sb.ToString(), Type1Token.TokenType.Integer);
                return true;
            }
@@ -309,7 +327,6 @@
                }
            }
            bytes.Seek(bytes.CurrentOffset - 1);
            if (radix != null)
            {
                var number = Convert.ToInt32(sb.ToString(), int.Parse(radix.ToString(), CultureInfo.InvariantCulture));
@@ -323,14 +340,9 @@
            return true;
        }
-        private string ReadLiteral(char? previousCharacter = null)
+        private bool TryReadLiteral(out string? value)
        {
            literalBuffer.Clear();
            if (previousCharacter.HasValue)
            {
                literalBuffer.Append(previousCharacter);
            }
            do
            {
                var b = bytes.Peek();
@@ -350,8 +362,16 @@
                literalBuffer.Append(c);
            } while (bytes.MoveNext());
-            var literal = literalBuffer.ToString();
+            if (literalBuffer.Length > 0)
-            return literal.Length == 0 ? null : literal;
+            {
                value = literalBuffer.ToString();
                return true;
            }
            else
            {
                value = null;
                return false;
            }
        }
        private string ReadComment()
@@ -375,9 +395,10 @@
        private Type1DataToken ReadCharString(int length)
        {
            // Skip preceding space.
-            bytes.MoveNext();
+            if (bytes.Peek() is { } ws && ReadHelper.IsWhitespace(ws))
-            // TODO: may be wrong
+            {
-           // bytes.MoveNext();
+                bytes.MoveNext();
            }
            byte[] data = new byte[length];
            for (int i = 0; i < length; i++)
--- a/src/UglyToad.PdfPig.Tests/Tokenization/NumericTokenizerTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/NumericTokenizerTests.cs
@@ -91,7 +91,10 @@
            Assert.True(result);
            Assert.Equal(135.6654, AssertNumericToken(token).Data);
-            Assert.Equal('/', (char)input.Bytes.CurrentByte);
+            if (tokenizer.ReadsNextByte)
                Assert.Equal('/', (char)input.Bytes.CurrentByte);
            else
                Assert.Equal('4', (char)input.Bytes.CurrentByte);
        }
        [Fact]
--- a/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs
@@ -9,7 +9,7 @@
    {
        private readonly bool usePdfDocEncoding;
-        public bool ReadsNextByte { get; } = false;
+        public bool ReadsNextByte => false;
        public ArrayTokenizer(bool usePdfDocEncoding)
        {
--- a/src/UglyToad.PdfPig.Tokenization/CommentTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/CommentTokenizer.cs
@@ -6,7 +6,7 @@
    internal sealed class CommentTokenizer : ITokenizer
    {
-        public bool ReadsNextByte { get; } = true;
+        public bool ReadsNextByte => false;
        public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
        {
@@ -17,10 +17,11 @@
                return false;
            }
-            using var builder = new ValueStringBuilder();
+            using var builder = new ValueStringBuilder(stackalloc char[32]);
-            while (inputBytes.MoveNext() && !ReadHelper.IsEndOfLine(inputBytes.CurrentByte))
+            while (inputBytes.Peek() is { } c && !ReadHelper.IsEndOfLine(c))
            {
                inputBytes.MoveNext();
                builder.Append((char) inputBytes.CurrentByte);
            }
--- a/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/DictionaryTokenizer.cs
@@ -11,7 +11,7 @@
        private readonly IReadOnlyList<NameToken> requiredKeys;
        private readonly bool useLenientParsing;
-        public bool ReadsNextByte { get; } = false;
+        public bool ReadsNextByte => false;
        /// <summary>
        /// Create a new <see cref="DictionaryTokenizer"/>.
--- a/src/UglyToad.PdfPig.Tokenization/EndOfLineTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/EndOfLineTokenizer.cs
@@ -9,7 +9,7 @@
    public sealed class EndOfLineTokenizer : ITokenizer
    {
        /// <inheritdoc />
-        public bool ReadsNextByte { get; } = false;
+        public bool ReadsNextByte => false;
        /// <inheritdoc />
        public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
--- a/src/UglyToad.PdfPig.Tokenization/HexTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/HexTokenizer.cs
@@ -5,7 +5,7 @@
    internal sealed class HexTokenizer : ITokenizer
    {
-        public bool ReadsNextByte { get; } = false;
+        public bool ReadsNextByte => false;
        public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
        {
--- a/src/UglyToad.PdfPig.Tokenization/NameTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/NameTokenizer.cs
@@ -11,14 +11,14 @@
    internal sealed class NameTokenizer : ITokenizer
    {
 #if NET
        static NameTokenizer()
        {
 #if NET
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
 #endif
        }
 #endif
-        public bool ReadsNextByte { get; } = true;
+        public bool ReadsNextByte => false;
        public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
        {
@@ -35,10 +35,8 @@
            int postEscapeRead = 0;
            Span<char> escapedChars = stackalloc char[2];
-            while (inputBytes.MoveNext())
+            while (inputBytes.Peek() is { } b)
            {
                var b = inputBytes.CurrentByte;
                if (b == '#')
                {
                    escapeActive = true;
@@ -52,8 +50,9 @@
                        if (postEscapeRead == 2)
                        {
-                            int high = escapedChars[0] <= '9' ? escapedChars[0] - '0' : char.ToUpper(escapedChars[0]) - 'A' + 10;
+                            // We validated that the char is hex. So assume ASCII rules apply and shortcut hex decoding
-                            int low = escapedChars[1] <= '9' ? escapedChars[1] - '0' : char.ToUpper(escapedChars[1]) - 'A' + 10;
+                            int high = escapedChars[0] <= '9' ? escapedChars[0] - '0' : ((escapedChars[0] & 0xF) + 9);
                            int low = escapedChars[1] <= '9' ? escapedChars[1] - '0' : ((escapedChars[1] & 0xF) + 9);
                            byte characterToWrite = (byte)(high * 16 + low);
@@ -100,6 +99,8 @@
                {
                    bytes.Write(b);
                }
                inputBytes.MoveNext();
            }
 #if NET8_0_OR_GREATER
--- a/src/UglyToad.PdfPig.Tokenization/NumericTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/NumericTokenizer.cs
@@ -7,15 +7,7 @@ using Tokens;
 internal sealed class NumericTokenizer : ITokenizer
 {
-    private const byte Zero = 48;
+    public bool ReadsNextByte => false;
    private const byte Nine = 57;
    private const byte Negative = (byte)'-';
    private const byte Positive = (byte)'+';
    private const byte Period = (byte)'.';
    private const byte ExponentLower = (byte)'e';
    private const byte ExponentUpper = (byte)'E';
    public bool ReadsNextByte => true;
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? token)
    {
@@ -37,30 +29,50 @@ internal sealed class NumericTokenizer : ITokenizer
        var isExponentNegative = false;
        var exponentPart = 0;
-        do
+        byte? firstByte = currentByte;
        bool noRead = true;
        bool acceptSign = true;
        while (!inputBytes.IsAtEnd() || firstByte is { })
        {
-            var b = inputBytes.CurrentByte;
+            if (firstByte is { } b)
            if (b >= Zero && b <= Nine)
            {
                firstByte = null;
            }
            else if (noRead)
            {
                noRead = false;
                b = inputBytes.Peek() ?? 0;
            }
            else
            {
                inputBytes.MoveNext();
                b = inputBytes.Peek() ?? 0;
            }
            if (b >= '0' && b <= '9')
            {
                var value = b - '0';
                if (hasExponent)
                {
-                    exponentPart = (exponentPart * 10) + (b - Zero);
+                    exponentPart = (exponentPart * 10) + value;
                }
                else if (hasFraction)
                {
-                    fractionalPart = (fractionalPart * 10) + (b - Zero);
+                    fractionalPart = (fractionalPart * 10) + value;
                    fractionalCount++;
                }
                else
                {
-                    integerPart = (integerPart * 10) + (b - Zero);
+                    integerPart = (integerPart * 10) + value;
                }
                acceptSign = false;
            }
-            else if (b == Positive)
+            else if (b == '+' && acceptSign)
            {
                // Has no impact
                acceptSign = false;
            }
-            else if (b == Negative)
+            else if (b == '-' && acceptSign)
            {
                if (hasExponent)
                {
@@ -70,30 +82,17 @@ internal sealed class NumericTokenizer : ITokenizer
                {
                    isNegative = true;
                }
                // acceptSign = false; // Somehow we have a test that expects to support "--21.72" to return -21.72
            }
-            else if (b == Period)
+            else if (b == '.' && !hasExponent && !hasFraction)
            {
                if (hasExponent || hasFraction)
                {
                    return false;
                }
                hasFraction = true;
                acceptSign = false;
            }
-            else if (b == ExponentLower || b == ExponentUpper)
+            else if ((b == 'e' || b == 'E') && readBytes > 0 && !hasExponent)
            {
                // Don't allow leading exponent.
                if (readBytes == 0)
                {
                    return false;
                }
                if (hasExponent)
                {
                    return false;
                }
                hasExponent = true;
                acceptSign = true;
            }
            else
            {
@@ -107,7 +106,7 @@ internal sealed class NumericTokenizer : ITokenizer
            }
            readBytes++;
-        } while (inputBytes.MoveNext());
+        }
        if (hasExponent && !isExponentNegative)
        {
--- a/src/UglyToad.PdfPig.Tokenization/PlainTokenizer.cs
+++ b/src/UglyToad.PdfPig.Tokenization/PlainTokenizer.cs
@@ -6,7 +6,7 @@
    internal sealed class PlainTokenizer : ITokenizer
    {
-        public bool ReadsNextByte { get; } = true;
+        public bool ReadsNextByte => false;
        public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
        {
@@ -21,18 +21,11 @@
            builder.Append((char)currentByte);
-            while (inputBytes.MoveNext())
+            while (inputBytes.Peek() is { } b
                && !ReadHelper.IsWhitespace(b)
                && (char)b is not '<' and not '[' and not '/' and not ']' and not '>' and not '(' and not ')')
            {
-                if (ReadHelper.IsWhitespace(inputBytes.CurrentByte))
+                inputBytes.MoveNext();
                {
                    break;
                }
                if (inputBytes.CurrentByte is (byte)'<' or (byte)'[' or (byte)'/' or (byte)']' or (byte)'>' or (byte)'(' or (byte)')')
                {
                    break;
                }
                builder.Append((char) inputBytes.CurrentByte);
            }
--- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
+++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs
@@ -246,7 +246,7 @@
                /* 
                 * Some tokenizers need to read the symbol of the next token to know if they have ended
-                 * so we don't want to move on to the next byte, we would lose a byte, e.g.: /NameOne/NameTwo or /Name(string)                
+                 * so we don't want to move on to the next byte, we would lose a byte, e.g.: /NameOne/NameTwo or /Name(string)
                 */
                hasBytePreRead = tokenizer.ReadsNextByte;
@@ -317,12 +317,13 @@
        {
            // The ID operator should be followed by a single white-space character, and the next character is interpreted
            // as the first byte of image data. 
-            if (!ReadHelper.IsWhitespace(inputBytes.CurrentByte))
+            if (inputBytes.Peek() is { } c
                && !ReadHelper.IsWhitespace(c))
            {
                throw new PdfDocumentFormatException($"No whitespace character following the image data (ID) operator. Position: {inputBytes.CurrentOffset}.");
            }
-            var startsAt = inputBytes.CurrentOffset - 2;
+            var startsAt = inputBytes.CurrentOffset - 1;
            return ReadUntilEndImage(startsAt);
        }
--- a/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs
@@ -80,9 +80,8 @@
            }
            var atEnd = scanner.CurrentPosition == scanner.Length;
            var rewind = atEnd ? 1 : 2;
-            var commentOffset = scanner.CurrentPosition - comment.Data.Length - rewind;
+            var commentOffset = scanner.CurrentPosition - comment.Data.Length - 1;
            scanner.Seek(0);
--- a/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.StartXref.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.StartXref.cs
@@ -10,13 +10,56 @@ internal static partial class FirstPassParser
 {
    private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
    public const long EndOfFileBufferSize = 1024;
    public static StartXRefLocation GetFirstCrossReferenceOffset(
        IInputBytes bytes,
        ISeekableTokenScanner scanner,
        ILog log)
    {
        // We used to read backward through the file, but this is quite expensive for streams that directly wrap OS files.
        // Instead we fetch the last 1024 bytes of the file and do a memory search, as cheap first attempt. This is significantly faster
        // in practice, if there is no in-process caching of the file involved
        // 
        // If that fails (in practice it should never) we fall back to the old method of reading backwards.
        var fileLength = bytes.Length;
        {
            var fetchFrom = Math.Max(bytes.Length - EndOfFileBufferSize, 0L);
            bytes.Seek(fetchFrom);
            Span<byte> byteBuffer = new byte[bytes.Length - fetchFrom];   // TODO: Maybe use PoolArray?
            int n = bytes.Read(byteBuffer);
            if (n == byteBuffer.Length)
            {
                int lx = byteBuffer.LastIndexOf("startxref"u8);
                if (lx < 0)
                {
                    // See old code. We also try a mangled version
                    lx = byteBuffer.LastIndexOf("startref"u8);
                }
                if (lx >= 0)
                {
                    scanner.Seek(fetchFrom + lx);
                    if (scanner.TryReadToken(out OperatorToken startXrefOp) && (startXrefOp.Data == "startxref" || startXrefOp.Data == "startref"))
                    {
                        var pos = GetNumericTokenFollowingCurrent(scanner);
                        log.Debug($"Found startxref at {pos}");
                        return new StartXRefLocation(fetchFrom + lx, pos);
                    }
                }
            }
        }
        // Now fall through in the old code
        var buffer = new CircularByteBuffer(StartXRefBytes.Length);
        // Start from the end of the file
--- a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
+++ b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
@@ -57,7 +57,7 @@
                    {
                        var next = bytes.Peek();
-                        if (next.HasValue && next == 'n')
+                        if (next == 'n')
                        {
                            if (ReadHelper.IsString(bytes, "endobj"))
                            {
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -465,7 +465,7 @@
                read++;
            }
-            long streamDataEnd = inputBytes.CurrentOffset + 1;
+            long streamDataEnd = inputBytes.CurrentOffset;
            if (possibleEndLocation == null)
                return false;