cache some more common values and improve performance of tokenizers

This commit is contained in:
Eliot Jones
2019-12-22 23:34:33 +00:00
parent e048bb8c2c
commit ba9fe40bc1
4 changed files with 118 additions and 44 deletions

View File

@@ -1,6 +1,7 @@
namespace UglyToad.PdfPig.Tokenization
{
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using IO;
@@ -8,6 +9,11 @@
internal class NumericTokenizer : ITokenizer
{
private const byte Zero = 48;
private const byte Nine = 57;
private readonly Dictionary<string, NumericToken> cachedTokens = new Dictionary<string, NumericToken>();
public bool ReadsNextByte { get; } = true;
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
@@ -16,7 +22,7 @@
StringBuilder characters;
if ((currentByte >= '0' && currentByte <= '9') || currentByte == '-' || currentByte == '+' || currentByte == '.')
if ((currentByte >= Zero && currentByte <= Nine) || currentByte == '-' || currentByte == '+' || currentByte == '.')
{
characters = new StringBuilder();
characters.Append((char)currentByte);
@@ -29,16 +35,15 @@
while (inputBytes.MoveNext())
{
var b = inputBytes.CurrentByte;
var c = (char) b;
if (char.IsDigit(c) ||
c == '-' ||
c == '+' ||
c == '.' ||
c == 'E' ||
c == 'e')
if ((b >= Zero && b <= Nine) ||
b == '-' ||
b == '+' ||
b == '.' ||
b == 'E' ||
b == 'e')
{
characters.Append(c);
characters.Append((char)b);
}
else
{
@@ -56,7 +61,38 @@
}
else
{
value = decimal.Parse(characters.ToString(), NumberStyles.Any, CultureInfo.InvariantCulture);
var str = characters.ToString();
switch (str)
{
case "0":
token = NumericToken.Zero;
return true;
case "1":
token = NumericToken.One;
return true;
case "2":
token = NumericToken.Two;
return true;
case "3":
token = NumericToken.Three;
return true;
case "8":
token = NumericToken.Eight;
return true;
default:
{
if (!cachedTokens.TryGetValue(str, out var result))
{
value = decimal.Parse(str, NumberStyles.Any, CultureInfo.InvariantCulture);
result = new NumericToken(value);
cachedTokens[str] = result;
}
token = result;
return true;
}
}
}
}
catch (FormatException)

View File

@@ -11,13 +11,13 @@
{
private static readonly HexTokenizer HexTokenizer = new HexTokenizer();
private static readonly StringTokenizer StringTokenizer = new StringTokenizer();
private static readonly NumericTokenizer NumericTokenizer = new NumericTokenizer();
private static readonly NameTokenizer NameTokenizer = new NameTokenizer();
private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer();
private static readonly ArrayTokenizer ArrayTokenizer = new ArrayTokenizer();
private static readonly DictionaryTokenizer DictionaryTokenizer = new DictionaryTokenizer();
private static readonly CommentTokenizer CommentTokenizer = new CommentTokenizer();
private readonly NumericTokenizer numericTokenizer = new NumericTokenizer();
private readonly ScannerScope scope;
private readonly IInputBytes inputBytes;
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
@@ -83,7 +83,7 @@
if (tokenizer == null)
{
if (IsEmpty(currentByte) || ReadHelper.IsWhitespace(currentByte))
if (ReadHelper.IsWhitespace(currentByte))
{
isSkippingSymbol = false;
continue;
@@ -143,7 +143,7 @@
case '-':
case '+':
case '.':
tokenizer = NumericTokenizer;
tokenizer = numericTokenizer;
break;
default:
tokenizer = PlainTokenizer;
@@ -284,10 +284,5 @@
throw new PdfDocumentFormatException($"No end of inline image data (EI) was found for image data at position {startsAt}.");
}
// Reports whether the byte is one of the four values this helper treats as empty:
// NUL (0), line feed, carriage return or space.
private static bool IsEmpty(byte b)
{
    switch (b)
    {
        case 0:
        case (byte)'\n':
        case (byte)'\r':
        case (byte)' ':
            return true;
        default:
            return false;
    }
}
}
}

View File

@@ -10,6 +10,12 @@
/// </summary>
public class NumericToken : IDataToken<decimal>
{
/// <summary>
/// Shared instance constructed with the value 0.
/// </summary>
internal static readonly NumericToken Zero = new NumericToken(0);
/// <summary>
/// Shared instance constructed with the value 1.
/// </summary>
internal static readonly NumericToken One = new NumericToken(1);
/// <summary>
/// Shared instance constructed with the value 2.
/// </summary>
internal static readonly NumericToken Two = new NumericToken(2);
/// <summary>
/// Shared instance constructed with the value 3.
/// </summary>
internal static readonly NumericToken Three = new NumericToken(3);
/// <summary>
/// Shared instance constructed with the value 8.
/// </summary>
internal static readonly NumericToken Eight = new NumericToken(8);
/// <inheritdoc />
public decimal Data { get; }
@@ -42,6 +48,12 @@
Data = value;
}
/// <inheritdoc />
public override int GetHashCode() => Data.GetHashCode(); // the hash is exactly that of the wrapped decimal value
/// <inheritdoc />
public override string ToString()
{

View File

@@ -7,18 +7,26 @@
private static readonly object Lock = new object();
private static readonly Dictionary<string, string> PooledNames = new Dictionary<string, string>();
public static readonly OperatorToken R = new OperatorToken("R");
public static readonly OperatorToken StartObject = new OperatorToken("obj");
public static readonly OperatorToken EndObject = new OperatorToken("endobj");
public static readonly OperatorToken StartStream = new OperatorToken("stream");
public static readonly OperatorToken EndStream = new OperatorToken("endstream");
public static readonly OperatorToken Eexec = new OperatorToken("eexec");
public static readonly OperatorToken Bt = new OperatorToken("BT");
public static readonly OperatorToken Def = new OperatorToken("def");
public static readonly OperatorToken Dict = new OperatorToken("dict");
public static readonly OperatorToken Readonly = new OperatorToken("readonly");
public static readonly OperatorToken Dup = new OperatorToken("dup");
public static readonly OperatorToken Eexec = new OperatorToken("eexec");
public static readonly OperatorToken EndObject = new OperatorToken("endobj");
public static readonly OperatorToken EndStream = new OperatorToken("endstream");
public static readonly OperatorToken Et = new OperatorToken("ET");
public static readonly OperatorToken For = new OperatorToken("for");
public static readonly OperatorToken N = new OperatorToken("n");
public static readonly OperatorToken Put = new OperatorToken("put");
public static readonly OperatorToken QPop = new OperatorToken("Q");
public static readonly OperatorToken QPush = new OperatorToken("q");
public static readonly OperatorToken R = new OperatorToken("R");
public static readonly OperatorToken Re = new OperatorToken("re");
public static readonly OperatorToken Readonly = new OperatorToken("readonly");
public static readonly OperatorToken StartObject = new OperatorToken("obj");
public static readonly OperatorToken StartStream = new OperatorToken("stream");
public static readonly OperatorToken Tf = new OperatorToken("Tf");
public static readonly OperatorToken WStar = new OperatorToken("W*");
public static readonly OperatorToken Xref = new OperatorToken("xref");
public string Data { get; }
@@ -43,30 +51,46 @@
{
switch (data)
{
case "R":
return R;
case "obj":
return StartObject;
case "endobj":
return EndObject;
case "stream":
return StartStream;
case "endstream":
return EndStream;
case "BT":
return Bt;
case "eexec":
return Eexec;
case "endobj":
return EndObject;
case "endstream":
return EndStream;
case "ET":
return Et;
case "def":
return Def;
case "dict":
return Dict;
case "readonly":
return Readonly;
case "dup":
return Dup;
case "for":
return For;
case "dup":
return Dup;
case "n":
return N;
case "obj":
return StartObject;
case "put":
return Put;
case "Q":
return QPop;
case "q":
return QPush;
case "R":
return R;
case "re":
return Re;
case "readonly":
return Readonly;
case "stream":
return StartStream;
case "Tf":
return Tf;
case "W*":
return WStar;
case "xref":
return Xref;
default:
@@ -74,6 +98,13 @@
}
}
/// <inheritdoc />
public override int GetHashCode() => Data.GetHashCode(); // the hash is exactly that of the operator text held in Data
/// <inheritdoc />
public override string ToString()
{
return Data;