2025-07-24 21:16:25 -05:00
|
|
|
|
#nullable enable
|
|
|
|
|
|
namespace UglyToad.PdfPig.Tokenization;
|
|
|
|
|
|
|
|
|
|
|
|
using System;
|
|
|
|
|
|
using Core;
|
|
|
|
|
|
using Tokens;
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Reads a numeric token from the input: an optional sign, integer digits, an optional
/// fractional part and an optional exponent (scientific notation).
/// </summary>
internal sealed class NumericTokenizer : ITokenizer
{
    private const byte Zero = 48;
    private const byte Nine = 57;
    private const byte Negative = (byte)'-';
    private const byte Positive = (byte)'+';
    private const byte Period = (byte)'.';
    private const byte ExponentLower = (byte)'e';
    private const byte ExponentUpper = (byte)'E';

    // Largest accumulated fraction that can still take one more decimal digit without
    // overflowing a long. Digits beyond this point are below double precision anyway.
    private const long MaxFractionBeforeOverflow = (long.MaxValue - 9) / 10;

    // Once the exponent reaches this magnitude the result is already saturated for a double
    // (infinity or zero), so further digits are ignored instead of letting the int wrap around.
    private const int MaxExponentBeforeSaturation = 100_000_000;

    /// <summary>
    /// Always <see langword="true"/> for this tokenizer.
    /// NOTE(review): presumably signals that the scan stops on the byte following the number - confirm against <see cref="ITokenizer"/>.
    /// </summary>
    public bool ReadsNextByte => true;

    /// <summary>
    /// Attempts to read a number starting at the current byte of <paramref name="inputBytes"/>.
    /// </summary>
    /// <param name="currentByte">
    /// Not read by this method; scanning starts from the current byte of <paramref name="inputBytes"/>.
    /// </param>
    /// <param name="inputBytes">The input to read from. It is advanced while bytes belong to the number.</param>
    /// <param name="token">
    /// The parsed numeric token when the method returns <see langword="true"/>, otherwise <see langword="null"/>.
    /// </param>
    /// <returns>
    /// <see langword="false"/> when the first byte cannot start a number, when a second decimal point
    /// or a decimal point after the exponent is found, or when an exponent marker is the first byte
    /// or is repeated. In the latter cases the input has already been advanced past the bytes read so far.
    /// Otherwise <see langword="true"/>.
    /// </returns>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken? token)
    {
        token = null;

        var readBytes = 0;

        // Everything before the decimal point.
        var isNegative = false;
        double integerPart = 0;

        // Everything after the decimal point.
        var hasFraction = false;
        long fractionalPart = 0;
        var fractionalCount = 0;

        // Support scientific notation in some font files.
        var hasExponent = false;
        var isExponentNegative = false;
        var exponentPart = 0;

        do
        {
            var b = inputBytes.CurrentByte;

            if (b >= Zero && b <= Nine)
            {
                var digit = b - Zero;

                if (hasExponent)
                {
                    // Saturate rather than overflow on absurdly long exponents.
                    if (exponentPart < MaxExponentBeforeSaturation)
                    {
                        exponentPart = (exponentPart * 10) + digit;
                    }
                }
                else if (hasFraction)
                {
                    // Drop digits that no longer fit; once one digit is dropped the accumulator
                    // never changes again, so every following digit is dropped as well.
                    if (fractionalPart <= MaxFractionBeforeOverflow)
                    {
                        fractionalPart = (fractionalPart * 10) + digit;
                        fractionalCount++;
                    }
                }
                else
                {
                    integerPart = (integerPart * 10) + digit;
                }
            }
            else if (b == Positive)
            {
                // Has no impact.
            }
            else if (b == Negative)
            {
                // A minus sign after the exponent marker negates the exponent, otherwise the number.
                if (hasExponent)
                {
                    isExponentNegative = true;
                }
                else
                {
                    isNegative = true;
                }
            }
            else if (b == Period)
            {
                // Only a single decimal point is allowed and it must precede the exponent.
                if (hasExponent || hasFraction)
                {
                    return false;
                }

                hasFraction = true;
            }
            else if (b == ExponentLower || b == ExponentUpper)
            {
                // Don't allow a leading exponent or more than one exponent marker.
                if (readBytes == 0 || hasExponent)
                {
                    return false;
                }

                hasExponent = true;
            }
            else
            {
                // No valid first character.
                if (readBytes == 0)
                {
                    return false;
                }

                // First byte that is not part of the number: stop without counting it.
                break;
            }

            readBytes++;
        } while (inputBytes.MoveNext());

        var value = ComposeValue(
            isNegative,
            integerPart,
            fractionalPart,
            fractionalCount,
            hasExponent,
            isExponentNegative,
            exponentPart);

        // Reuse the shared zero token (this also covers negative zero).
        if (value == 0)
        {
            token = NumericToken.Zero;
        }
        else
        {
            token = new NumericToken(value);
        }

        return true;
    }

    /// <summary>
    /// Combines the scanned parts of a number into a single signed double.
    /// </summary>
    private static double ComposeValue(
        bool isNegative,
        double integerPart,
        long fractionalPart,
        int fractionalCount,
        bool hasExponent,
        bool isExponentNegative,
        int exponentPart)
    {
        double value;

        if (hasExponent && !isExponentNegative)
        {
            // Apply the multiplication before any fraction logic to avoid loss of precision.
            // E.g. 1.53E3 should be exactly 1,530.

            // Move the whole number to the left of the decimal point: 1.53 becomes 153.
            var combined = (integerPart * Pow10(fractionalCount)) + fractionalPart;

            // For 1.53E3 two fractional digits were already shifted, so (3 - 2) = 1 power of 10 remains.
            var shift = exponentPart - fractionalCount;

            // A negative shift means the exponent was too small to shift every fractional digit:
            // 1.457E2 became 1,457 with a shift of (2 - 3) = -1, so the outcome is 145.7.
            value = shift >= 0
                ? combined * Pow10(shift)
                : combined / Pow10(-shift);
        }
        else
        {
            value = integerPart;

            if (fractionalCount > 0)
            {
                value += fractionalPart / Pow10(fractionalCount);
            }

            // Only a negative exponent can reach this branch.
            if (hasExponent)
            {
                value *= Math.Pow(10, -exponentPart);
            }
        }

        return isNegative ? -value : value;
    }

    /// <summary>
    /// Returns 10 raised to <paramref name="exp"/>, using exact literals for exponents 0 to 9
    /// and <see cref="Math.Pow(double, double)"/> for everything else.
    /// </summary>
    private static double Pow10(int exp) => exp switch
    {
        0 => 1,
        1 => 10,
        2 => 100,
        3 => 1000,
        4 => 10000,
        5 => 100000,
        6 => 1000000,
        7 => 10000000,
        8 => 100000000,
        9 => 1000000000,
        _ => Math.Pow(10, exp)
    };
}
|