2018-01-10 19:49:32 +00:00
|
|
|
|
namespace UglyToad.PdfPig.Tokenization
|
2017-11-10 23:58:28 +00:00
|
|
|
|
{
|
|
|
|
|
|
using System;
|
|
|
|
|
|
using System.Text;
|
2020-01-04 16:38:18 +00:00
|
|
|
|
using Core;
|
2017-11-10 23:58:28 +00:00
|
|
|
|
using Tokens;
|
|
|
|
|
|
|
2024-05-02 23:36:19 -07:00
|
|
|
|
#if NET
|
2024-04-18 11:58:40 -07:00
|
|
|
|
using System.Text.Unicode;
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
2024-04-28 10:55:58 -07:00
|
|
|
|
/// <summary>
/// Tokenizes a name: a solidus ('/') followed by the bytes of the name, in which
/// <c>#xx</c> (a number sign and two hexadecimal digits) encodes a single byte.
/// </summary>
/// <remarks>
/// The collected bytes are decoded as UTF-8 when they form valid UTF-8 and as
/// windows-1252 otherwise. A number sign that is not followed by two hexadecimal
/// digits is kept literally, together with any hexadecimal digit consumed after it.
/// </remarks>
internal sealed class NameTokenizer : ITokenizer
{
    static NameTokenizer()
    {
#if NET
        // Registers the code pages provider so the "windows-1252" fallback requested
        // in TryTokenize can be resolved by Encoding.GetEncoding on these targets.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
    }

    /// <inheritdoc />
    public bool ReadsNextByte { get; } = true;

    /// <summary>
    /// Reads a name from <paramref name="inputBytes"/> when <paramref name="currentByte"/> is '/'.
    /// </summary>
    /// <param name="currentByte">The byte the caller has already read; must be '/' for a name to be read.</param>
    /// <param name="inputBytes">The input that is advanced until an end-of-name byte or the end of the input is reached.</param>
    /// <param name="token">The resulting name token, or <c>null</c> when <paramref name="currentByte"/> is not '/'.</param>
    /// <returns><c>false</c> when <paramref name="currentByte"/> is not '/' (no input is consumed); otherwise <c>true</c>.</returns>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
    {
        token = null;

        if (currentByte != '/')
        {
            return false;
        }

        using var bytes = new ArrayPoolBufferWriter<byte>();

        // escapeActive: a '#' has been read and its two hex digits are still being collected.
        // postEscapeRead: how many hex digits of the current escape are stored in escapedChars.
        bool escapeActive = false;
        int postEscapeRead = 0;
        Span<char> escapedChars = stackalloc char[2];

        while (inputBytes.MoveNext())
        {
            var b = inputBytes.CurrentByte;

            if (escapeActive)
            {
                if (ReadHelper.IsHex((char)b))
                {
                    escapedChars[postEscapeRead] = (char)b;
                    postEscapeRead++;

                    if (postEscapeRead == 2)
                    {
                        // Both digits are available: emit the single byte they encode.
                        bytes.Write((byte)((HexValue(escapedChars[0]) * 16) + HexValue(escapedChars[1])));

                        escapeActive = false;
                        postEscapeRead = 0;
                    }

                    continue;
                }

                // The escape was not completed (for example "#m", "#A#41" or "##"):
                // keep what was consumed as literal bytes, then treat the current byte
                // as if no escape had been pending. This also covers a '#' that
                // immediately starts a new escape, so no stale digit can leak into it.
                WriteIncompleteEscape(bytes, escapedChars.Slice(0, postEscapeRead));
                escapeActive = false;
                postEscapeRead = 0;
            }

            if (b == '#')
            {
                escapeActive = true;
            }
            else if (ReadHelper.IsEndOfName(b))
            {
                break;
            }
            else
            {
                bytes.Write(b);
            }
        }

        if (escapeActive)
        {
            // The input ended in the middle of an escape. Keep the consumed bytes so the
            // result matches the case where a delimiter, rather than the end of the input,
            // terminates the name.
            WriteIncompleteEscape(bytes, escapedChars.Slice(0, postEscapeRead));
        }

#if NET8_0_OR_GREATER
        var byteArray = bytes.WrittenSpan;
        bool isValidUtf8 = Utf8.IsValid(byteArray);
#else
        var byteArray = bytes.WrittenSpan.ToArray();
        bool isValidUtf8 = ReadHelper.IsValidUtf8(byteArray);
#endif

        var str = isValidUtf8
            ? Encoding.UTF8.GetString(byteArray)
            : Encoding.GetEncoding("windows-1252").GetString(byteArray);

        token = NameToken.Create(str);

        return true;
    }

    /// <summary>
    /// Writes a '#' followed by the hexadecimal digits that were consumed after it as literal bytes.
    /// </summary>
    private static void WriteIncompleteEscape(ArrayPoolBufferWriter<byte> bytes, ReadOnlySpan<char> consumedHexDigits)
    {
        bytes.Write((byte)'#');

        foreach (var digit in consumedHexDigits)
        {
            bytes.Write((byte)digit);
        }
    }

    /// <summary>
    /// Converts a character already known to be a hexadecimal digit to its numeric value (0-15)
    /// using ordinal arithmetic, independent of the current culture.
    /// </summary>
    private static int HexValue(char hexDigit)
    {
        if (hexDigit <= '9')
        {
            return hexDigit - '0';
        }

        if (hexDigit <= 'F')
        {
            return hexDigit - 'A' + 10;
        }

        return hexDigit - 'a' + 10;
    }
}
|
|
|
|
|
|
}
|