Avoid a lot of seeks by making most tokenizers no longer read too far, removing the need to seek back.

Optimize the FirstPassParser to just fetch a final chunk before doing things char-by-char backwards.
This commit is contained in:
Bert Huijben
2025-10-16 11:36:49 +02:00
committed by BobLd
parent 40bcc22ea1
commit e11dc6bf40
19 changed files with 177 additions and 95 deletions

View File

@@ -24,12 +24,17 @@
/// </summary> /// </summary>
public const byte AsciiCarriageReturn = 13; public const byte AsciiCarriageReturn = 13;
/// <summary>
/// The tab '\t' character.
/// </summary>
public const byte AsciiTab = 9;
private static readonly HashSet<int> EndOfNameCharacters = private static readonly HashSet<int> EndOfNameCharacters =
[ [
' ', ' ',
AsciiCarriageReturn, AsciiCarriageReturn,
AsciiLineFeed, AsciiLineFeed,
9, AsciiTab,
'>', '>',
'<', '<',
'[', '[',

View File

@@ -96,6 +96,17 @@
/// <inheritdoc /> /// <inheritdoc />
public void Seek(long position) public void Seek(long position)
{ {
var current = CurrentOffset;
if (position == current)
{
return;
}
else if (peekByte.HasValue && position == current + 1)
{
MoveNext();
return;
}
isAtEnd = false; isAtEnd = false;
peekByte = null; peekByte = null;

View File

@@ -14,7 +14,7 @@
/// <inheritdoc /> /// <inheritdoc />
public bool ReadsNextByte { get; } = false; public bool ReadsNextByte { get; } = false;
private static readonly string[] Space = [" "]; private static readonly char[] Space = [' '];
/// <inheritdoc /> /// <inheritdoc />
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)

View File

@@ -88,6 +88,11 @@
{ {
int offset = 0; int offset = 0;
while (inputBytes.Peek() is { } b && ReadHelper.IsWhitespace(b))
{
inputBytes.MoveNext();
}
while (inputBytes.MoveNext()) while (inputBytes.MoveNext())
{ {
if (inputBytes.CurrentByte == (byte)ClearToMark[offset]) if (inputBytes.CurrentByte == (byte)ClearToMark[offset])

View File

@@ -2,6 +2,7 @@
{ {
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization; using System.Globalization;
using System.Text; using System.Text;
using Core; using Core;
@@ -41,35 +42,43 @@
do do
{ {
skip = false; skip = false;
while (bytes.MoveNext()) while (bytes.Peek() is { } b)
{ {
var b = bytes.CurrentByte;
var c = (char)b; var c = (char)b;
switch (c) switch (c)
{ {
case '%': case '%':
bytes.MoveNext();
comments.Add(ReadComment()); comments.Add(ReadComment());
break; break;
case '(': case '(':
bytes.MoveNext();
return ReadString(); return ReadString();
case ')': case ')':
throw new InvalidOperationException("Encountered an end of string ')' outside of string."); throw new InvalidOperationException("Encountered an end of string ')' outside of string.");
case '[': case '[':
bytes.MoveNext();
return new Type1Token(c, Type1Token.TokenType.StartArray); return new Type1Token(c, Type1Token.TokenType.StartArray);
case ']': case ']':
bytes.MoveNext();
return new Type1Token(c, Type1Token.TokenType.EndArray); return new Type1Token(c, Type1Token.TokenType.EndArray);
case '{': case '{':
bytes.MoveNext();
return new Type1Token(c, Type1Token.TokenType.StartProc); return new Type1Token(c, Type1Token.TokenType.StartProc);
case '}': case '}':
bytes.MoveNext();
return new Type1Token(c, Type1Token.TokenType.EndProc); return new Type1Token(c, Type1Token.TokenType.EndProc);
case '/': case '/':
{ {
var name = ReadLiteral(); bytes.MoveNext();
TryReadLiteral(out var name);
Debug.Assert(name != null);
return new Type1Token(name, Type1Token.TokenType.Literal); return new Type1Token(name, Type1Token.TokenType.Literal);
} }
case '<': case '<':
{ {
bytes.MoveNext();
var following = bytes.Peek(); var following = bytes.Peek();
if (following == '<') if (following == '<')
{ {
@@ -81,6 +90,7 @@
} }
case '>': case '>':
{ {
bytes.MoveNext();
var following = bytes.Peek(); var following = bytes.Peek();
if (following == '>') if (following == '>')
{ {
@@ -94,23 +104,24 @@
{ {
if (ReadHelper.IsWhitespace(b)) if (ReadHelper.IsWhitespace(b))
{ {
bytes.MoveNext();
skip = true; skip = true;
break; break;
} }
if (b == 0) if (b == 0)
{ {
bytes.MoveNext();
skip = true; skip = true;
break; break;
} }
if (TryReadNumber(c, out var number)) if (TryReadNumber(out var number))
{ {
return number; return number;
} }
var name = ReadLiteral(c); if (!TryReadLiteral(out var name))
if (name == null)
{ {
throw new InvalidOperationException($"The binary portion of the type 1 font was invalid at position {bytes.CurrentOffset}."); throw new InvalidOperationException($"The binary portion of the type 1 font was invalid at position {bytes.CurrentOffset}.");
} }
@@ -197,12 +208,21 @@
return null; return null;
} }
private bool TryReadNumber(char c, out Type1Token numberToken) private bool TryReadNumber(out Type1Token numberToken)
{ {
char GetNext() char GetNext()
{ {
bytes.MoveNext(); bytes.MoveNext();
return (char)bytes.CurrentByte; return (char)(bytes.Peek() ?? 0);
}
char c = (char)(bytes.Peek() ?? 0);
if (!((c >= '0' && c <= '9') || c is '+' or '-'))
{
// Easy out. Not a valid number
numberToken = null;
return false;
} }
numberToken = null; numberToken = null;
@@ -251,8 +271,6 @@
else else
{ {
// integer // integer
bytes.Seek(bytes.CurrentOffset - 1);
numberToken = new Type1Token(sb.ToString(), Type1Token.TokenType.Integer); numberToken = new Type1Token(sb.ToString(), Type1Token.TokenType.Integer);
return true; return true;
} }
@@ -309,7 +327,6 @@
} }
} }
bytes.Seek(bytes.CurrentOffset - 1);
if (radix != null) if (radix != null)
{ {
var number = Convert.ToInt32(sb.ToString(), int.Parse(radix.ToString(), CultureInfo.InvariantCulture)); var number = Convert.ToInt32(sb.ToString(), int.Parse(radix.ToString(), CultureInfo.InvariantCulture));
@@ -323,14 +340,9 @@
return true; return true;
} }
private string ReadLiteral(char? previousCharacter = null) private bool TryReadLiteral(out string? value)
{ {
literalBuffer.Clear(); literalBuffer.Clear();
if (previousCharacter.HasValue)
{
literalBuffer.Append(previousCharacter);
}
do do
{ {
var b = bytes.Peek(); var b = bytes.Peek();
@@ -350,8 +362,16 @@
literalBuffer.Append(c); literalBuffer.Append(c);
} while (bytes.MoveNext()); } while (bytes.MoveNext());
var literal = literalBuffer.ToString(); if (literalBuffer.Length > 0)
return literal.Length == 0 ? null : literal; {
value = literalBuffer.ToString();
return true;
}
else
{
value = null;
return false;
}
} }
private string ReadComment() private string ReadComment()
@@ -375,9 +395,10 @@
private Type1DataToken ReadCharString(int length) private Type1DataToken ReadCharString(int length)
{ {
// Skip preceding space. // Skip preceding space.
if (bytes.Peek() is { } ws && ReadHelper.IsWhitespace(ws))
{
bytes.MoveNext(); bytes.MoveNext();
// TODO: may be wrong }
// bytes.MoveNext();
byte[] data = new byte[length]; byte[] data = new byte[length];
for (int i = 0; i < length; i++) for (int i = 0; i < length; i++)

View File

@@ -91,7 +91,10 @@
Assert.True(result); Assert.True(result);
Assert.Equal(135.6654, AssertNumericToken(token).Data); Assert.Equal(135.6654, AssertNumericToken(token).Data);
if (tokenizer.ReadsNextByte)
Assert.Equal('/', (char)input.Bytes.CurrentByte); Assert.Equal('/', (char)input.Bytes.CurrentByte);
else
Assert.Equal('4', (char)input.Bytes.CurrentByte);
} }
[Fact] [Fact]

View File

@@ -9,7 +9,7 @@
{ {
private readonly bool usePdfDocEncoding; private readonly bool usePdfDocEncoding;
public bool ReadsNextByte { get; } = false; public bool ReadsNextByte => false;
public ArrayTokenizer(bool usePdfDocEncoding) public ArrayTokenizer(bool usePdfDocEncoding)
{ {

View File

@@ -6,7 +6,7 @@
internal sealed class CommentTokenizer : ITokenizer internal sealed class CommentTokenizer : ITokenizer
{ {
public bool ReadsNextByte { get; } = true; public bool ReadsNextByte => false;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{ {
@@ -17,10 +17,11 @@
return false; return false;
} }
using var builder = new ValueStringBuilder(); using var builder = new ValueStringBuilder(stackalloc char[32]);
while (inputBytes.MoveNext() && !ReadHelper.IsEndOfLine(inputBytes.CurrentByte)) while (inputBytes.Peek() is { } c && !ReadHelper.IsEndOfLine(c))
{ {
inputBytes.MoveNext();
builder.Append((char) inputBytes.CurrentByte); builder.Append((char) inputBytes.CurrentByte);
} }

View File

@@ -11,7 +11,7 @@
private readonly IReadOnlyList<NameToken> requiredKeys; private readonly IReadOnlyList<NameToken> requiredKeys;
private readonly bool useLenientParsing; private readonly bool useLenientParsing;
public bool ReadsNextByte { get; } = false; public bool ReadsNextByte => false;
/// <summary> /// <summary>
/// Create a new <see cref="DictionaryTokenizer"/>. /// Create a new <see cref="DictionaryTokenizer"/>.

View File

@@ -9,7 +9,7 @@
public sealed class EndOfLineTokenizer : ITokenizer public sealed class EndOfLineTokenizer : ITokenizer
{ {
/// <inheritdoc /> /// <inheritdoc />
public bool ReadsNextByte { get; } = false; public bool ReadsNextByte => false;
/// <inheritdoc /> /// <inheritdoc />
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)

View File

@@ -5,7 +5,7 @@
internal sealed class HexTokenizer : ITokenizer internal sealed class HexTokenizer : ITokenizer
{ {
public bool ReadsNextByte { get; } = false; public bool ReadsNextByte => false;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{ {

View File

@@ -11,14 +11,14 @@
internal sealed class NameTokenizer : ITokenizer internal sealed class NameTokenizer : ITokenizer
{ {
#if NET
static NameTokenizer() static NameTokenizer()
{ {
#if NET
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
} }
#endif
public bool ReadsNextByte { get; } = true; public bool ReadsNextByte => false;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{ {
@@ -35,10 +35,8 @@
int postEscapeRead = 0; int postEscapeRead = 0;
Span<char> escapedChars = stackalloc char[2]; Span<char> escapedChars = stackalloc char[2];
while (inputBytes.MoveNext()) while (inputBytes.Peek() is { } b)
{ {
var b = inputBytes.CurrentByte;
if (b == '#') if (b == '#')
{ {
escapeActive = true; escapeActive = true;
@@ -52,8 +50,9 @@
if (postEscapeRead == 2) if (postEscapeRead == 2)
{ {
int high = escapedChars[0] <= '9' ? escapedChars[0] - '0' : char.ToUpper(escapedChars[0]) - 'A' + 10; // We validated that the char is hex. So assume ASCII rules apply and shortcut hex decoding
int low = escapedChars[1] <= '9' ? escapedChars[1] - '0' : char.ToUpper(escapedChars[1]) - 'A' + 10; int high = escapedChars[0] <= '9' ? escapedChars[0] - '0' : ((escapedChars[0] & 0xF) + 9);
int low = escapedChars[1] <= '9' ? escapedChars[1] - '0' : ((escapedChars[1] & 0xF) + 9);
byte characterToWrite = (byte)(high * 16 + low); byte characterToWrite = (byte)(high * 16 + low);
@@ -100,6 +99,8 @@
{ {
bytes.Write(b); bytes.Write(b);
} }
inputBytes.MoveNext();
} }
#if NET8_0_OR_GREATER #if NET8_0_OR_GREATER

View File

@@ -7,15 +7,7 @@ using Tokens;
internal sealed class NumericTokenizer : ITokenizer internal sealed class NumericTokenizer : ITokenizer
{ {
private const byte Zero = 48; public bool ReadsNextByte => false;
private const byte Nine = 57;
private const byte Negative = (byte)'-';
private const byte Positive = (byte)'+';
private const byte Period = (byte)'.';
private const byte ExponentLower = (byte)'e';
private const byte ExponentUpper = (byte)'E';
public bool ReadsNextByte => true;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? token) public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? token)
{ {
@@ -37,30 +29,50 @@ internal sealed class NumericTokenizer : ITokenizer
var isExponentNegative = false; var isExponentNegative = false;
var exponentPart = 0; var exponentPart = 0;
do byte? firstByte = currentByte;
bool noRead = true;
bool acceptSign = true;
while (!inputBytes.IsAtEnd() || firstByte is { })
{ {
var b = inputBytes.CurrentByte; if (firstByte is { } b)
if (b >= Zero && b <= Nine)
{ {
firstByte = null;
}
else if (noRead)
{
noRead = false;
b = inputBytes.Peek() ?? 0;
}
else
{
inputBytes.MoveNext();
b = inputBytes.Peek() ?? 0;
}
if (b >= '0' && b <= '9')
{
var value = b - '0';
if (hasExponent) if (hasExponent)
{ {
exponentPart = (exponentPart * 10) + (b - Zero); exponentPart = (exponentPart * 10) + value;
} }
else if (hasFraction) else if (hasFraction)
{ {
fractionalPart = (fractionalPart * 10) + (b - Zero); fractionalPart = (fractionalPart * 10) + value;
fractionalCount++; fractionalCount++;
} }
else else
{ {
integerPart = (integerPart * 10) + (b - Zero); integerPart = (integerPart * 10) + value;
} }
acceptSign = false;
} }
else if (b == Positive) else if (b == '+' && acceptSign)
{ {
// Has no impact // Has no impact
acceptSign = false;
} }
else if (b == Negative) else if (b == '-' && acceptSign)
{ {
if (hasExponent) if (hasExponent)
{ {
@@ -70,30 +82,17 @@ internal sealed class NumericTokenizer : ITokenizer
{ {
isNegative = true; isNegative = true;
} }
// acceptSign = false; // Somehow we have a test that expects to support "--21.72" to return -21.72
} }
else if (b == Period) else if (b == '.' && !hasExponent && !hasFraction)
{ {
if (hasExponent || hasFraction)
{
return false;
}
hasFraction = true; hasFraction = true;
acceptSign = false;
} }
else if (b == ExponentLower || b == ExponentUpper) else if ((b == 'e' || b == 'E') && readBytes > 0 && !hasExponent)
{ {
// Don't allow leading exponent.
if (readBytes == 0)
{
return false;
}
if (hasExponent)
{
return false;
}
hasExponent = true; hasExponent = true;
acceptSign = true;
} }
else else
{ {
@@ -107,7 +106,7 @@ internal sealed class NumericTokenizer : ITokenizer
} }
readBytes++; readBytes++;
} while (inputBytes.MoveNext()); }
if (hasExponent && !isExponentNegative) if (hasExponent && !isExponentNegative)
{ {

View File

@@ -6,7 +6,7 @@
internal sealed class PlainTokenizer : ITokenizer internal sealed class PlainTokenizer : ITokenizer
{ {
public bool ReadsNextByte { get; } = true; public bool ReadsNextByte => false;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
{ {
@@ -21,18 +21,11 @@
builder.Append((char)currentByte); builder.Append((char)currentByte);
while (inputBytes.MoveNext()) while (inputBytes.Peek() is { } b
&& !ReadHelper.IsWhitespace(b)
&& (char)b is not '<' and not '[' and not '/' and not ']' and not '>' and not '(' and not ')')
{ {
if (ReadHelper.IsWhitespace(inputBytes.CurrentByte)) inputBytes.MoveNext();
{
break;
}
if (inputBytes.CurrentByte is (byte)'<' or (byte)'[' or (byte)'/' or (byte)']' or (byte)'>' or (byte)'(' or (byte)')')
{
break;
}
builder.Append((char) inputBytes.CurrentByte); builder.Append((char) inputBytes.CurrentByte);
} }

View File

@@ -317,12 +317,13 @@
{ {
// The ID operator should be followed by a single white-space character, and the next character is interpreted // The ID operator should be followed by a single white-space character, and the next character is interpreted
// as the first byte of image data. // as the first byte of image data.
if (!ReadHelper.IsWhitespace(inputBytes.CurrentByte)) if (inputBytes.Peek() is { } c
&& !ReadHelper.IsWhitespace(c))
{ {
throw new PdfDocumentFormatException($"No whitespace character following the image data (ID) operator. Position: {inputBytes.CurrentOffset}."); throw new PdfDocumentFormatException($"No whitespace character following the image data (ID) operator. Position: {inputBytes.CurrentOffset}.");
} }
var startsAt = inputBytes.CurrentOffset - 2; var startsAt = inputBytes.CurrentOffset - 1;
return ReadUntilEndImage(startsAt); return ReadUntilEndImage(startsAt);
} }

View File

@@ -80,9 +80,8 @@
} }
var atEnd = scanner.CurrentPosition == scanner.Length; var atEnd = scanner.CurrentPosition == scanner.Length;
var rewind = atEnd ? 1 : 2;
var commentOffset = scanner.CurrentPosition - comment.Data.Length - rewind; var commentOffset = scanner.CurrentPosition - comment.Data.Length - 1;
scanner.Seek(0); scanner.Seek(0);

View File

@@ -10,13 +10,56 @@ internal static partial class FirstPassParser
{ {
private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8; private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
public const long EndOfFileBufferSize = 1024;
public static StartXRefLocation GetFirstCrossReferenceOffset( public static StartXRefLocation GetFirstCrossReferenceOffset(
IInputBytes bytes, IInputBytes bytes,
ISeekableTokenScanner scanner, ISeekableTokenScanner scanner,
ILog log) ILog log)
{ {
// We used to read backward through the file, but this is quite expensive for streams that directly wrap OS files.
// Instead we fetch the last 1024 bytes of the file and do a memory search, as cheap first attempt. This is significantly faster
// in practice, if there is no in-process caching of the file involved
//
// If that fails (in practice it should never) we fall back to the old method of reading backwards.
var fileLength = bytes.Length; var fileLength = bytes.Length;
{
var fetchFrom = Math.Max(bytes.Length - EndOfFileBufferSize, 0L);
bytes.Seek(fetchFrom);
Span<byte> byteBuffer = new byte[bytes.Length - fetchFrom]; // TODO: Maybe use PoolArray?
int n = bytes.Read(byteBuffer);
if (n == byteBuffer.Length)
{
int lx = byteBuffer.LastIndexOf("startxref"u8);
if (lx < 0)
{
// See old code. We also try a mangled version
lx = byteBuffer.LastIndexOf("startref"u8);
}
if (lx >= 0)
{
scanner.Seek(fetchFrom + lx);
if (scanner.TryReadToken(out OperatorToken startXrefOp) && (startXrefOp.Data == "startxref" || startXrefOp.Data == "startref"))
{
var pos = GetNumericTokenFollowingCurrent(scanner);
log.Debug($"Found startxref at {pos}");
return new StartXRefLocation(fetchFrom + lx, pos);
}
}
}
}
// Now fall through in the old code
var buffer = new CircularByteBuffer(StartXRefBytes.Length); var buffer = new CircularByteBuffer(StartXRefBytes.Length);
// Start from the end of the file // Start from the end of the file

View File

@@ -57,7 +57,7 @@
{ {
var next = bytes.Peek(); var next = bytes.Peek();
if (next.HasValue && next == 'n') if (next == 'n')
{ {
if (ReadHelper.IsString(bytes, "endobj")) if (ReadHelper.IsString(bytes, "endobj"))
{ {

View File

@@ -465,7 +465,7 @@
read++; read++;
} }
long streamDataEnd = inputBytes.CurrentOffset + 1; long streamDataEnd = inputBytes.CurrentOffset;
if (possibleEndLocation == null) if (possibleEndLocation == null)
return false; return false;