mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-11-24 08:47:01 +08:00
Avoid a lot of seeks by making most tokenizers peek at the next byte instead of reading too far and then seeking back.
Optimize the FirstPassParser to just fetch a final chunk before doing things char-by-char backwards.
This commit is contained in:
@@ -24,12 +24,17 @@
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public const byte AsciiCarriageReturn = 13;
|
public const byte AsciiCarriageReturn = 13;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// The tab '\t' character.
|
||||||
|
/// </summary>
|
||||||
|
public const byte AsciiTab = 9;
|
||||||
|
|
||||||
private static readonly HashSet<int> EndOfNameCharacters =
|
private static readonly HashSet<int> EndOfNameCharacters =
|
||||||
[
|
[
|
||||||
' ',
|
' ',
|
||||||
AsciiCarriageReturn,
|
AsciiCarriageReturn,
|
||||||
AsciiLineFeed,
|
AsciiLineFeed,
|
||||||
9,
|
AsciiTab,
|
||||||
'>',
|
'>',
|
||||||
'<',
|
'<',
|
||||||
'[',
|
'[',
|
||||||
|
|||||||
@@ -96,6 +96,17 @@
|
|||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
public void Seek(long position)
|
public void Seek(long position)
|
||||||
{
|
{
|
||||||
|
var current = CurrentOffset;
|
||||||
|
if (position == current)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else if (peekByte.HasValue && position == current + 1)
|
||||||
|
{
|
||||||
|
MoveNext();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
isAtEnd = false;
|
isAtEnd = false;
|
||||||
peekByte = null;
|
peekByte = null;
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,7 @@
|
|||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
public bool ReadsNextByte { get; } = false;
|
public bool ReadsNextByte { get; } = false;
|
||||||
|
|
||||||
private static readonly string[] Space = [" "];
|
private static readonly char[] Space = [' '];
|
||||||
|
|
||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
||||||
|
|||||||
@@ -88,6 +88,11 @@
|
|||||||
{
|
{
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
|
|
||||||
|
while (inputBytes.Peek() is { } b && ReadHelper.IsWhitespace(b))
|
||||||
|
{
|
||||||
|
inputBytes.MoveNext();
|
||||||
|
}
|
||||||
|
|
||||||
while (inputBytes.MoveNext())
|
while (inputBytes.MoveNext())
|
||||||
{
|
{
|
||||||
if (inputBytes.CurrentByte == (byte)ClearToMark[offset])
|
if (inputBytes.CurrentByte == (byte)ClearToMark[offset])
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
{
|
{
|
||||||
using System;
|
using System;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
|
using System.Diagnostics;
|
||||||
using System.Globalization;
|
using System.Globalization;
|
||||||
using System.Text;
|
using System.Text;
|
||||||
using Core;
|
using Core;
|
||||||
@@ -41,35 +42,43 @@
|
|||||||
do
|
do
|
||||||
{
|
{
|
||||||
skip = false;
|
skip = false;
|
||||||
while (bytes.MoveNext())
|
while (bytes.Peek() is { } b)
|
||||||
{
|
{
|
||||||
var b = bytes.CurrentByte;
|
|
||||||
var c = (char)b;
|
var c = (char)b;
|
||||||
|
|
||||||
switch (c)
|
switch (c)
|
||||||
{
|
{
|
||||||
case '%':
|
case '%':
|
||||||
|
bytes.MoveNext();
|
||||||
comments.Add(ReadComment());
|
comments.Add(ReadComment());
|
||||||
break;
|
break;
|
||||||
case '(':
|
case '(':
|
||||||
|
bytes.MoveNext();
|
||||||
return ReadString();
|
return ReadString();
|
||||||
case ')':
|
case ')':
|
||||||
throw new InvalidOperationException("Encountered an end of string ')' outside of string.");
|
throw new InvalidOperationException("Encountered an end of string ')' outside of string.");
|
||||||
case '[':
|
case '[':
|
||||||
|
bytes.MoveNext();
|
||||||
return new Type1Token(c, Type1Token.TokenType.StartArray);
|
return new Type1Token(c, Type1Token.TokenType.StartArray);
|
||||||
case ']':
|
case ']':
|
||||||
|
bytes.MoveNext();
|
||||||
return new Type1Token(c, Type1Token.TokenType.EndArray);
|
return new Type1Token(c, Type1Token.TokenType.EndArray);
|
||||||
case '{':
|
case '{':
|
||||||
|
bytes.MoveNext();
|
||||||
return new Type1Token(c, Type1Token.TokenType.StartProc);
|
return new Type1Token(c, Type1Token.TokenType.StartProc);
|
||||||
case '}':
|
case '}':
|
||||||
|
bytes.MoveNext();
|
||||||
return new Type1Token(c, Type1Token.TokenType.EndProc);
|
return new Type1Token(c, Type1Token.TokenType.EndProc);
|
||||||
case '/':
|
case '/':
|
||||||
{
|
{
|
||||||
var name = ReadLiteral();
|
bytes.MoveNext();
|
||||||
|
TryReadLiteral(out var name);
|
||||||
|
Debug.Assert(name != null);
|
||||||
return new Type1Token(name, Type1Token.TokenType.Literal);
|
return new Type1Token(name, Type1Token.TokenType.Literal);
|
||||||
}
|
}
|
||||||
case '<':
|
case '<':
|
||||||
{
|
{
|
||||||
|
bytes.MoveNext();
|
||||||
var following = bytes.Peek();
|
var following = bytes.Peek();
|
||||||
if (following == '<')
|
if (following == '<')
|
||||||
{
|
{
|
||||||
@@ -81,6 +90,7 @@
|
|||||||
}
|
}
|
||||||
case '>':
|
case '>':
|
||||||
{
|
{
|
||||||
|
bytes.MoveNext();
|
||||||
var following = bytes.Peek();
|
var following = bytes.Peek();
|
||||||
if (following == '>')
|
if (following == '>')
|
||||||
{
|
{
|
||||||
@@ -94,23 +104,24 @@
|
|||||||
{
|
{
|
||||||
if (ReadHelper.IsWhitespace(b))
|
if (ReadHelper.IsWhitespace(b))
|
||||||
{
|
{
|
||||||
|
bytes.MoveNext();
|
||||||
skip = true;
|
skip = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (b == 0)
|
if (b == 0)
|
||||||
{
|
{
|
||||||
|
bytes.MoveNext();
|
||||||
skip = true;
|
skip = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (TryReadNumber(c, out var number))
|
if (TryReadNumber(out var number))
|
||||||
{
|
{
|
||||||
return number;
|
return number;
|
||||||
}
|
}
|
||||||
|
|
||||||
var name = ReadLiteral(c);
|
if (!TryReadLiteral(out var name))
|
||||||
if (name == null)
|
|
||||||
{
|
{
|
||||||
throw new InvalidOperationException($"The binary portion of the type 1 font was invalid at position {bytes.CurrentOffset}.");
|
throw new InvalidOperationException($"The binary portion of the type 1 font was invalid at position {bytes.CurrentOffset}.");
|
||||||
}
|
}
|
||||||
@@ -197,12 +208,21 @@
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private bool TryReadNumber(char c, out Type1Token numberToken)
|
private bool TryReadNumber(out Type1Token numberToken)
|
||||||
{
|
{
|
||||||
char GetNext()
|
char GetNext()
|
||||||
{
|
{
|
||||||
bytes.MoveNext();
|
bytes.MoveNext();
|
||||||
return (char)bytes.CurrentByte;
|
return (char)(bytes.Peek() ?? 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
char c = (char)(bytes.Peek() ?? 0);
|
||||||
|
|
||||||
|
if (!((c >= '0' && c <= '9') || c is '+' or '-'))
|
||||||
|
{
|
||||||
|
// Easy out. Not a valid number
|
||||||
|
numberToken = null;
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
numberToken = null;
|
numberToken = null;
|
||||||
@@ -251,8 +271,6 @@
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
// integer
|
// integer
|
||||||
bytes.Seek(bytes.CurrentOffset - 1);
|
|
||||||
|
|
||||||
numberToken = new Type1Token(sb.ToString(), Type1Token.TokenType.Integer);
|
numberToken = new Type1Token(sb.ToString(), Type1Token.TokenType.Integer);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -309,7 +327,6 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bytes.Seek(bytes.CurrentOffset - 1);
|
|
||||||
if (radix != null)
|
if (radix != null)
|
||||||
{
|
{
|
||||||
var number = Convert.ToInt32(sb.ToString(), int.Parse(radix.ToString(), CultureInfo.InvariantCulture));
|
var number = Convert.ToInt32(sb.ToString(), int.Parse(radix.ToString(), CultureInfo.InvariantCulture));
|
||||||
@@ -323,14 +340,9 @@
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private string ReadLiteral(char? previousCharacter = null)
|
private bool TryReadLiteral(out string? value)
|
||||||
{
|
{
|
||||||
literalBuffer.Clear();
|
literalBuffer.Clear();
|
||||||
if (previousCharacter.HasValue)
|
|
||||||
{
|
|
||||||
literalBuffer.Append(previousCharacter);
|
|
||||||
}
|
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
var b = bytes.Peek();
|
var b = bytes.Peek();
|
||||||
@@ -350,8 +362,16 @@
|
|||||||
literalBuffer.Append(c);
|
literalBuffer.Append(c);
|
||||||
} while (bytes.MoveNext());
|
} while (bytes.MoveNext());
|
||||||
|
|
||||||
var literal = literalBuffer.ToString();
|
if (literalBuffer.Length > 0)
|
||||||
return literal.Length == 0 ? null : literal;
|
{
|
||||||
|
value = literalBuffer.ToString();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
value = null;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private string ReadComment()
|
private string ReadComment()
|
||||||
@@ -375,9 +395,10 @@
|
|||||||
private Type1DataToken ReadCharString(int length)
|
private Type1DataToken ReadCharString(int length)
|
||||||
{
|
{
|
||||||
// Skip preceding space.
|
// Skip preceding space.
|
||||||
bytes.MoveNext();
|
if (bytes.Peek() is { } ws && ReadHelper.IsWhitespace(ws))
|
||||||
// TODO: may be wrong
|
{
|
||||||
// bytes.MoveNext();
|
bytes.MoveNext();
|
||||||
|
}
|
||||||
|
|
||||||
byte[] data = new byte[length];
|
byte[] data = new byte[length];
|
||||||
for (int i = 0; i < length; i++)
|
for (int i = 0; i < length; i++)
|
||||||
|
|||||||
@@ -91,7 +91,10 @@
|
|||||||
Assert.True(result);
|
Assert.True(result);
|
||||||
Assert.Equal(135.6654, AssertNumericToken(token).Data);
|
Assert.Equal(135.6654, AssertNumericToken(token).Data);
|
||||||
|
|
||||||
Assert.Equal('/', (char)input.Bytes.CurrentByte);
|
if (tokenizer.ReadsNextByte)
|
||||||
|
Assert.Equal('/', (char)input.Bytes.CurrentByte);
|
||||||
|
else
|
||||||
|
Assert.Equal('4', (char)input.Bytes.CurrentByte);
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
{
|
{
|
||||||
private readonly bool usePdfDocEncoding;
|
private readonly bool usePdfDocEncoding;
|
||||||
|
|
||||||
public bool ReadsNextByte { get; } = false;
|
public bool ReadsNextByte => false;
|
||||||
|
|
||||||
public ArrayTokenizer(bool usePdfDocEncoding)
|
public ArrayTokenizer(bool usePdfDocEncoding)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
internal sealed class CommentTokenizer : ITokenizer
|
internal sealed class CommentTokenizer : ITokenizer
|
||||||
{
|
{
|
||||||
public bool ReadsNextByte { get; } = true;
|
public bool ReadsNextByte => false;
|
||||||
|
|
||||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
||||||
{
|
{
|
||||||
@@ -17,10 +17,11 @@
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
using var builder = new ValueStringBuilder();
|
using var builder = new ValueStringBuilder(stackalloc char[32]);
|
||||||
|
|
||||||
while (inputBytes.MoveNext() && !ReadHelper.IsEndOfLine(inputBytes.CurrentByte))
|
while (inputBytes.Peek() is { } c && !ReadHelper.IsEndOfLine(c))
|
||||||
{
|
{
|
||||||
|
inputBytes.MoveNext();
|
||||||
builder.Append((char) inputBytes.CurrentByte);
|
builder.Append((char) inputBytes.CurrentByte);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
private readonly IReadOnlyList<NameToken> requiredKeys;
|
private readonly IReadOnlyList<NameToken> requiredKeys;
|
||||||
private readonly bool useLenientParsing;
|
private readonly bool useLenientParsing;
|
||||||
|
|
||||||
public bool ReadsNextByte { get; } = false;
|
public bool ReadsNextByte => false;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Create a new <see cref="DictionaryTokenizer"/>.
|
/// Create a new <see cref="DictionaryTokenizer"/>.
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
public sealed class EndOfLineTokenizer : ITokenizer
|
public sealed class EndOfLineTokenizer : ITokenizer
|
||||||
{
|
{
|
||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
public bool ReadsNextByte { get; } = false;
|
public bool ReadsNextByte => false;
|
||||||
|
|
||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
internal sealed class HexTokenizer : ITokenizer
|
internal sealed class HexTokenizer : ITokenizer
|
||||||
{
|
{
|
||||||
public bool ReadsNextByte { get; } = false;
|
public bool ReadsNextByte => false;
|
||||||
|
|
||||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -11,14 +11,14 @@
|
|||||||
|
|
||||||
internal sealed class NameTokenizer : ITokenizer
|
internal sealed class NameTokenizer : ITokenizer
|
||||||
{
|
{
|
||||||
|
#if NET
|
||||||
static NameTokenizer()
|
static NameTokenizer()
|
||||||
{
|
{
|
||||||
#if NET
|
|
||||||
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
|
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
public bool ReadsNextByte { get; } = true;
|
public bool ReadsNextByte => false;
|
||||||
|
|
||||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
||||||
{
|
{
|
||||||
@@ -35,10 +35,8 @@
|
|||||||
int postEscapeRead = 0;
|
int postEscapeRead = 0;
|
||||||
Span<char> escapedChars = stackalloc char[2];
|
Span<char> escapedChars = stackalloc char[2];
|
||||||
|
|
||||||
while (inputBytes.MoveNext())
|
while (inputBytes.Peek() is { } b)
|
||||||
{
|
{
|
||||||
var b = inputBytes.CurrentByte;
|
|
||||||
|
|
||||||
if (b == '#')
|
if (b == '#')
|
||||||
{
|
{
|
||||||
escapeActive = true;
|
escapeActive = true;
|
||||||
@@ -52,8 +50,9 @@
|
|||||||
|
|
||||||
if (postEscapeRead == 2)
|
if (postEscapeRead == 2)
|
||||||
{
|
{
|
||||||
int high = escapedChars[0] <= '9' ? escapedChars[0] - '0' : char.ToUpper(escapedChars[0]) - 'A' + 10;
|
// We validated that the char is hex. So assume ASCII rules apply and shortcut hex decoding
|
||||||
int low = escapedChars[1] <= '9' ? escapedChars[1] - '0' : char.ToUpper(escapedChars[1]) - 'A' + 10;
|
int high = escapedChars[0] <= '9' ? escapedChars[0] - '0' : ((escapedChars[0] & 0xF) + 9);
|
||||||
|
int low = escapedChars[1] <= '9' ? escapedChars[1] - '0' : ((escapedChars[1] & 0xF) + 9);
|
||||||
|
|
||||||
byte characterToWrite = (byte)(high * 16 + low);
|
byte characterToWrite = (byte)(high * 16 + low);
|
||||||
|
|
||||||
@@ -100,6 +99,8 @@
|
|||||||
{
|
{
|
||||||
bytes.Write(b);
|
bytes.Write(b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inputBytes.MoveNext();
|
||||||
}
|
}
|
||||||
|
|
||||||
#if NET8_0_OR_GREATER
|
#if NET8_0_OR_GREATER
|
||||||
|
|||||||
@@ -7,15 +7,7 @@ using Tokens;
|
|||||||
|
|
||||||
internal sealed class NumericTokenizer : ITokenizer
|
internal sealed class NumericTokenizer : ITokenizer
|
||||||
{
|
{
|
||||||
private const byte Zero = 48;
|
public bool ReadsNextByte => false;
|
||||||
private const byte Nine = 57;
|
|
||||||
private const byte Negative = (byte)'-';
|
|
||||||
private const byte Positive = (byte)'+';
|
|
||||||
private const byte Period = (byte)'.';
|
|
||||||
private const byte ExponentLower = (byte)'e';
|
|
||||||
private const byte ExponentUpper = (byte)'E';
|
|
||||||
|
|
||||||
public bool ReadsNextByte => true;
|
|
||||||
|
|
||||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? token)
|
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? token)
|
||||||
{
|
{
|
||||||
@@ -37,30 +29,50 @@ internal sealed class NumericTokenizer : ITokenizer
|
|||||||
var isExponentNegative = false;
|
var isExponentNegative = false;
|
||||||
var exponentPart = 0;
|
var exponentPart = 0;
|
||||||
|
|
||||||
do
|
byte? firstByte = currentByte;
|
||||||
|
bool noRead = true;
|
||||||
|
bool acceptSign = true;
|
||||||
|
while (!inputBytes.IsAtEnd() || firstByte is { })
|
||||||
{
|
{
|
||||||
var b = inputBytes.CurrentByte;
|
if (firstByte is { } b)
|
||||||
if (b >= Zero && b <= Nine)
|
|
||||||
{
|
{
|
||||||
|
firstByte = null;
|
||||||
|
}
|
||||||
|
else if (noRead)
|
||||||
|
{
|
||||||
|
noRead = false;
|
||||||
|
b = inputBytes.Peek() ?? 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
inputBytes.MoveNext();
|
||||||
|
b = inputBytes.Peek() ?? 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (b >= '0' && b <= '9')
|
||||||
|
{
|
||||||
|
var value = b - '0';
|
||||||
if (hasExponent)
|
if (hasExponent)
|
||||||
{
|
{
|
||||||
exponentPart = (exponentPart * 10) + (b - Zero);
|
exponentPart = (exponentPart * 10) + value;
|
||||||
}
|
}
|
||||||
else if (hasFraction)
|
else if (hasFraction)
|
||||||
{
|
{
|
||||||
fractionalPart = (fractionalPart * 10) + (b - Zero);
|
fractionalPart = (fractionalPart * 10) + value;
|
||||||
fractionalCount++;
|
fractionalCount++;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
integerPart = (integerPart * 10) + (b - Zero);
|
integerPart = (integerPart * 10) + value;
|
||||||
}
|
}
|
||||||
|
acceptSign = false;
|
||||||
}
|
}
|
||||||
else if (b == Positive)
|
else if (b == '+' && acceptSign)
|
||||||
{
|
{
|
||||||
// Has no impact
|
// Has no impact
|
||||||
|
acceptSign = false;
|
||||||
}
|
}
|
||||||
else if (b == Negative)
|
else if (b == '-' && acceptSign)
|
||||||
{
|
{
|
||||||
if (hasExponent)
|
if (hasExponent)
|
||||||
{
|
{
|
||||||
@@ -70,30 +82,17 @@ internal sealed class NumericTokenizer : ITokenizer
|
|||||||
{
|
{
|
||||||
isNegative = true;
|
isNegative = true;
|
||||||
}
|
}
|
||||||
|
// acceptSign = false; // Somehow we have a test that expects to support "--21.72" to return -21.72
|
||||||
}
|
}
|
||||||
else if (b == Period)
|
else if (b == '.' && !hasExponent && !hasFraction)
|
||||||
{
|
{
|
||||||
if (hasExponent || hasFraction)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
hasFraction = true;
|
hasFraction = true;
|
||||||
|
acceptSign = false;
|
||||||
}
|
}
|
||||||
else if (b == ExponentLower || b == ExponentUpper)
|
else if ((b == 'e' || b == 'E') && readBytes > 0 && !hasExponent)
|
||||||
{
|
{
|
||||||
// Don't allow leading exponent.
|
|
||||||
if (readBytes == 0)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (hasExponent)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
hasExponent = true;
|
hasExponent = true;
|
||||||
|
acceptSign = true;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -107,7 +106,7 @@ internal sealed class NumericTokenizer : ITokenizer
|
|||||||
}
|
}
|
||||||
|
|
||||||
readBytes++;
|
readBytes++;
|
||||||
} while (inputBytes.MoveNext());
|
}
|
||||||
|
|
||||||
if (hasExponent && !isExponentNegative)
|
if (hasExponent && !isExponentNegative)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
internal sealed class PlainTokenizer : ITokenizer
|
internal sealed class PlainTokenizer : ITokenizer
|
||||||
{
|
{
|
||||||
public bool ReadsNextByte { get; } = true;
|
public bool ReadsNextByte => false;
|
||||||
|
|
||||||
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
|
||||||
{
|
{
|
||||||
@@ -21,18 +21,11 @@
|
|||||||
|
|
||||||
builder.Append((char)currentByte);
|
builder.Append((char)currentByte);
|
||||||
|
|
||||||
while (inputBytes.MoveNext())
|
while (inputBytes.Peek() is { } b
|
||||||
|
&& !ReadHelper.IsWhitespace(b)
|
||||||
|
&& (char)b is not '<' and not '[' and not '/' and not ']' and not '>' and not '(' and not ')')
|
||||||
{
|
{
|
||||||
if (ReadHelper.IsWhitespace(inputBytes.CurrentByte))
|
inputBytes.MoveNext();
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (inputBytes.CurrentByte is (byte)'<' or (byte)'[' or (byte)'/' or (byte)']' or (byte)'>' or (byte)'(' or (byte)')')
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
builder.Append((char) inputBytes.CurrentByte);
|
builder.Append((char) inputBytes.CurrentByte);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -246,7 +246,7 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Some tokenizers need to read the symbol of the next token to know if they have ended
|
* Some tokenizers need to read the symbol of the next token to know if they have ended
|
||||||
* so we don't want to move on to the next byte, we would lose a byte, e.g.: /NameOne/NameTwo or /Name(string)
|
* so we don't want to move on to the next byte, we would lose a byte, e.g.: /NameOne/NameTwo or /Name(string)
|
||||||
*/
|
*/
|
||||||
hasBytePreRead = tokenizer.ReadsNextByte;
|
hasBytePreRead = tokenizer.ReadsNextByte;
|
||||||
|
|
||||||
@@ -317,12 +317,13 @@
|
|||||||
{
|
{
|
||||||
// The ID operator should be followed by a single white-space character, and the next character is interpreted
|
// The ID operator should be followed by a single white-space character, and the next character is interpreted
|
||||||
// as the first byte of image data.
|
// as the first byte of image data.
|
||||||
if (!ReadHelper.IsWhitespace(inputBytes.CurrentByte))
|
if (inputBytes.Peek() is { } c
|
||||||
|
&& !ReadHelper.IsWhitespace(c))
|
||||||
{
|
{
|
||||||
throw new PdfDocumentFormatException($"No whitespace character following the image data (ID) operator. Position: {inputBytes.CurrentOffset}.");
|
throw new PdfDocumentFormatException($"No whitespace character following the image data (ID) operator. Position: {inputBytes.CurrentOffset}.");
|
||||||
}
|
}
|
||||||
|
|
||||||
var startsAt = inputBytes.CurrentOffset - 2;
|
var startsAt = inputBytes.CurrentOffset - 1;
|
||||||
|
|
||||||
return ReadUntilEndImage(startsAt);
|
return ReadUntilEndImage(startsAt);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -80,9 +80,8 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
var atEnd = scanner.CurrentPosition == scanner.Length;
|
var atEnd = scanner.CurrentPosition == scanner.Length;
|
||||||
var rewind = atEnd ? 1 : 2;
|
|
||||||
|
|
||||||
var commentOffset = scanner.CurrentPosition - comment.Data.Length - rewind;
|
var commentOffset = scanner.CurrentPosition - comment.Data.Length - 1;
|
||||||
|
|
||||||
scanner.Seek(0);
|
scanner.Seek(0);
|
||||||
|
|
||||||
|
|||||||
@@ -10,13 +10,56 @@ internal static partial class FirstPassParser
|
|||||||
{
|
{
|
||||||
private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
|
private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
|
||||||
|
|
||||||
|
public const long EndOfFileBufferSize = 1024;
|
||||||
|
|
||||||
public static StartXRefLocation GetFirstCrossReferenceOffset(
|
public static StartXRefLocation GetFirstCrossReferenceOffset(
|
||||||
IInputBytes bytes,
|
IInputBytes bytes,
|
||||||
ISeekableTokenScanner scanner,
|
ISeekableTokenScanner scanner,
|
||||||
ILog log)
|
ILog log)
|
||||||
{
|
{
|
||||||
|
// We used to read backward through the file, but this is quite expensive for streams that directly wrap OS files.
|
||||||
|
// Instead we fetch the last 1024 bytes of the file and do a memory search, as cheap first attempt. This is significantly faster
|
||||||
|
// in practice, if there is no in-process caching of the file involved
|
||||||
|
//
|
||||||
|
// If that fails (in practice it should never) we fall back to the old method of reading backwards.
|
||||||
var fileLength = bytes.Length;
|
var fileLength = bytes.Length;
|
||||||
|
{
|
||||||
|
var fetchFrom = Math.Max(bytes.Length - EndOfFileBufferSize, 0L);
|
||||||
|
|
||||||
|
bytes.Seek(fetchFrom);
|
||||||
|
|
||||||
|
Span<byte> byteBuffer = new byte[bytes.Length - fetchFrom]; // TODO: Maybe use PoolArray?
|
||||||
|
|
||||||
|
int n = bytes.Read(byteBuffer);
|
||||||
|
|
||||||
|
if (n == byteBuffer.Length)
|
||||||
|
{
|
||||||
|
int lx = byteBuffer.LastIndexOf("startxref"u8);
|
||||||
|
|
||||||
|
if (lx < 0)
|
||||||
|
{
|
||||||
|
// See old code. We also try a mangled version
|
||||||
|
lx = byteBuffer.LastIndexOf("startref"u8);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lx >= 0)
|
||||||
|
{
|
||||||
|
scanner.Seek(fetchFrom + lx);
|
||||||
|
|
||||||
|
if (scanner.TryReadToken(out OperatorToken startXrefOp) && (startXrefOp.Data == "startxref" || startXrefOp.Data == "startref"))
|
||||||
|
{
|
||||||
|
var pos = GetNumericTokenFollowingCurrent(scanner);
|
||||||
|
|
||||||
|
log.Debug($"Found startxref at {pos}");
|
||||||
|
|
||||||
|
return new StartXRefLocation(fetchFrom + lx, pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now fall through in the old code
|
||||||
var buffer = new CircularByteBuffer(StartXRefBytes.Length);
|
var buffer = new CircularByteBuffer(StartXRefBytes.Length);
|
||||||
|
|
||||||
// Start from the end of the file
|
// Start from the end of the file
|
||||||
|
|||||||
@@ -57,7 +57,7 @@
|
|||||||
{
|
{
|
||||||
var next = bytes.Peek();
|
var next = bytes.Peek();
|
||||||
|
|
||||||
if (next.HasValue && next == 'n')
|
if (next == 'n')
|
||||||
{
|
{
|
||||||
if (ReadHelper.IsString(bytes, "endobj"))
|
if (ReadHelper.IsString(bytes, "endobj"))
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -465,7 +465,7 @@
|
|||||||
read++;
|
read++;
|
||||||
}
|
}
|
||||||
|
|
||||||
long streamDataEnd = inputBytes.CurrentOffset + 1;
|
long streamDataEnd = inputBytes.CurrentOffset;
|
||||||
|
|
||||||
if (possibleEndLocation == null)
|
if (possibleEndLocation == null)
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
Reference in New Issue
Block a user