cache some more common values and improve performance of tokenizers

This commit is contained in:
Eliot Jones
2019-12-22 23:34:33 +00:00
parent e048bb8c2c
commit ba9fe40bc1
4 changed files with 118 additions and 44 deletions

View File

@@ -1,6 +1,7 @@
namespace UglyToad.PdfPig.Tokenization namespace UglyToad.PdfPig.Tokenization
{ {
using System; using System;
using System.Collections.Generic;
using System.Globalization; using System.Globalization;
using System.Text; using System.Text;
using IO; using IO;
@@ -8,6 +9,11 @@
internal class NumericTokenizer : ITokenizer internal class NumericTokenizer : ITokenizer
{ {
private const byte Zero = 48;
private const byte Nine = 57;
private readonly Dictionary<string, NumericToken> cachedTokens = new Dictionary<string, NumericToken>();
public bool ReadsNextByte { get; } = true; public bool ReadsNextByte { get; } = true;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
@@ -16,7 +22,7 @@
StringBuilder characters; StringBuilder characters;
if ((currentByte >= '0' && currentByte <= '9') || currentByte == '-' || currentByte == '+' || currentByte == '.') if ((currentByte >= Zero && currentByte <= Nine) || currentByte == '-' || currentByte == '+' || currentByte == '.')
{ {
characters = new StringBuilder(); characters = new StringBuilder();
characters.Append((char)currentByte); characters.Append((char)currentByte);
@@ -29,16 +35,15 @@
while (inputBytes.MoveNext()) while (inputBytes.MoveNext())
{ {
var b = inputBytes.CurrentByte; var b = inputBytes.CurrentByte;
var c = (char) b;
if (char.IsDigit(c) || if ((b >= Zero && b <= Nine) ||
c == '-' || b == '-' ||
c == '+' || b == '+' ||
c == '.' || b == '.' ||
c == 'E' || b == 'E' ||
c == 'e') b == 'e')
{ {
characters.Append(c); characters.Append((char)b);
} }
else else
{ {
@@ -56,7 +61,38 @@
} }
else else
{ {
value = decimal.Parse(characters.ToString(), NumberStyles.Any, CultureInfo.InvariantCulture); var str = characters.ToString();
switch (str)
{
case "0":
token = NumericToken.Zero;
return true;
case "1":
token = NumericToken.One;
return true;
case "2":
token = NumericToken.Two;
return true;
case "3":
token = NumericToken.Three;
return true;
case "8":
token = NumericToken.Eight;
return true;
default:
{
if (!cachedTokens.TryGetValue(str, out var result))
{
value = decimal.Parse(str, NumberStyles.Any, CultureInfo.InvariantCulture);
result = new NumericToken(value);
cachedTokens[str] = result;
}
token = result;
return true;
}
}
} }
} }
catch (FormatException) catch (FormatException)

View File

@@ -11,13 +11,13 @@
{ {
private static readonly HexTokenizer HexTokenizer = new HexTokenizer(); private static readonly HexTokenizer HexTokenizer = new HexTokenizer();
private static readonly StringTokenizer StringTokenizer = new StringTokenizer(); private static readonly StringTokenizer StringTokenizer = new StringTokenizer();
private static readonly NumericTokenizer NumericTokenizer = new NumericTokenizer();
private static readonly NameTokenizer NameTokenizer = new NameTokenizer(); private static readonly NameTokenizer NameTokenizer = new NameTokenizer();
private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer(); private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer();
private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer(); private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer();
private static readonly DictionaryTokenizer DictionaryTokenizer = new DictionaryTokenizer(); private static readonly DictionaryTokenizer DictionaryTokenizer = new DictionaryTokenizer();
private static readonly CommentTokenizer CommentTokenizer = new CommentTokenizer(); private static readonly CommentTokenizer CommentTokenizer = new CommentTokenizer();
private readonly NumericTokenizer numericTokenizer = new NumericTokenizer();
private readonly ScannerScope scope; private readonly ScannerScope scope;
private readonly IInputBytes inputBytes; private readonly IInputBytes inputBytes;
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>(); private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
@@ -83,7 +83,7 @@
if (tokenizer == null) if (tokenizer == null)
{ {
if (IsEmpty(currentByte) || ReadHelper.IsWhitespace(currentByte)) if (ReadHelper.IsWhitespace(currentByte))
{ {
isSkippingSymbol = false; isSkippingSymbol = false;
continue; continue;
@@ -143,7 +143,7 @@
case '-': case '-':
case '+': case '+':
case '.': case '.':
tokenizer = NumericTokenizer; tokenizer = numericTokenizer;
break; break;
default: default:
tokenizer = PlainTokenizer; tokenizer = PlainTokenizer;
@@ -284,10 +284,5 @@
throw new PdfDocumentFormatException($"No end of inline image data (EI) was found for image data at position {startsAt}."); throw new PdfDocumentFormatException($"No end of inline image data (EI) was found for image data at position {startsAt}.");
} }
private static bool IsEmpty(byte b)
{
return b == ' ' || b == '\r' || b == '\n' || b == 0;
}
} }
} }

View File

@@ -10,6 +10,12 @@
/// </summary> /// </summary>
public class NumericToken : IDataToken<decimal> public class NumericToken : IDataToken<decimal>
{ {
internal static readonly NumericToken Zero = new NumericToken(0);
internal static readonly NumericToken One = new NumericToken(1);
internal static readonly NumericToken Two = new NumericToken(2);
internal static readonly NumericToken Three = new NumericToken(3);
internal static readonly NumericToken Eight = new NumericToken(8);
/// <inheritdoc /> /// <inheritdoc />
public decimal Data { get; } public decimal Data { get; }
@@ -21,17 +27,17 @@
/// <summary> /// <summary>
/// The value of this number as an <see langword="int"/>. /// The value of this number as an <see langword="int"/>.
/// </summary> /// </summary>
public int Int => (int) Data; public int Int => (int)Data;
/// <summary> /// <summary>
/// The value of this number as a <see langword="long"/>. /// The value of this number as a <see langword="long"/>.
/// </summary> /// </summary>
public long Long => (long) Data; public long Long => (long)Data;
/// <summary> /// <summary>
/// The value of this number as a <see langword="double"/>. /// The value of this number as a <see langword="double"/>.
/// </summary> /// </summary>
public double Double => (double) Data; public double Double => (double)Data;
/// <summary> /// <summary>
/// Create a <see cref="NumericToken"/>. /// Create a <see cref="NumericToken"/>.
@@ -42,6 +48,12 @@
Data = value; Data = value;
} }
/// <inheritdoc />
public override int GetHashCode()
{
return Data.GetHashCode();
}
/// <inheritdoc /> /// <inheritdoc />
public override string ToString() public override string ToString()
{ {

View File

@@ -7,18 +7,26 @@
private static readonly object Lock = new object(); private static readonly object Lock = new object();
private static readonly Dictionary<string, string> PooledNames = new Dictionary<string, string>(); private static readonly Dictionary<string, string> PooledNames = new Dictionary<string, string>();
public static readonly OperatorToken R = new OperatorToken("R"); public static readonly OperatorToken Bt = new OperatorToken("BT");
public static readonly OperatorToken StartObject = new OperatorToken("obj");
public static readonly OperatorToken EndObject = new OperatorToken("endobj");
public static readonly OperatorToken StartStream = new OperatorToken("stream");
public static readonly OperatorToken EndStream = new OperatorToken("endstream");
public static readonly OperatorToken Eexec = new OperatorToken("eexec");
public static readonly OperatorToken Def = new OperatorToken("def"); public static readonly OperatorToken Def = new OperatorToken("def");
public static readonly OperatorToken Dict = new OperatorToken("dict"); public static readonly OperatorToken Dict = new OperatorToken("dict");
public static readonly OperatorToken Readonly = new OperatorToken("readonly");
public static readonly OperatorToken Dup = new OperatorToken("dup"); public static readonly OperatorToken Dup = new OperatorToken("dup");
public static readonly OperatorToken Eexec = new OperatorToken("eexec");
public static readonly OperatorToken EndObject = new OperatorToken("endobj");
public static readonly OperatorToken EndStream = new OperatorToken("endstream");
public static readonly OperatorToken Et = new OperatorToken("ET");
public static readonly OperatorToken For = new OperatorToken("for"); public static readonly OperatorToken For = new OperatorToken("for");
public static readonly OperatorToken N = new OperatorToken("n");
public static readonly OperatorToken Put = new OperatorToken("put"); public static readonly OperatorToken Put = new OperatorToken("put");
public static readonly OperatorToken QPop = new OperatorToken("Q");
public static readonly OperatorToken QPush = new OperatorToken("q");
public static readonly OperatorToken R = new OperatorToken("R");
public static readonly OperatorToken Re = new OperatorToken("re");
public static readonly OperatorToken Readonly = new OperatorToken("readonly");
public static readonly OperatorToken StartObject = new OperatorToken("obj");
public static readonly OperatorToken StartStream = new OperatorToken("stream");
public static readonly OperatorToken Tf = new OperatorToken("Tf");
public static readonly OperatorToken WStar = new OperatorToken("W*");
public static readonly OperatorToken Xref = new OperatorToken("xref"); public static readonly OperatorToken Xref = new OperatorToken("xref");
public string Data { get; } public string Data { get; }
@@ -43,30 +51,46 @@
{ {
switch (data) switch (data)
{ {
case "R": case "BT":
return R; return Bt;
case "obj":
return StartObject;
case "endobj":
return EndObject;
case "stream":
return StartStream;
case "endstream":
return EndStream;
case "eexec": case "eexec":
return Eexec; return Eexec;
case "endobj":
return EndObject;
case "endstream":
return EndStream;
case "ET":
return Et;
case "def": case "def":
return Def; return Def;
case "dict": case "dict":
return Dict; return Dict;
case "readonly":
return Readonly;
case "dup":
return Dup;
case "for": case "for":
return For; return For;
case "dup":
return Dup;
case "n":
return N;
case "obj":
return StartObject;
case "put": case "put":
return Put; return Put;
case "Q":
return QPop;
case "q":
return QPush;
case "R":
return R;
case "re":
return Re;
case "readonly":
return Readonly;
case "stream":
return StartStream;
case "Tf":
return Tf;
case "W*":
return WStar;
case "xref": case "xref":
return Xref; return Xref;
default: default:
@@ -74,6 +98,13 @@
} }
} }
/// <inheritdoc />
public override int GetHashCode()
{
return Data.GetHashCode();
}
/// <inheritdoc />
public override string ToString() public override string ToString()
{ {
return Data; return Data;