Files
PdfPig/src/UglyToad.PdfPig.Tokenization/NameTokenizer.cs
Jason Nelson da44e1a540 Improve code quality (#825)
* Avoid encoding ASCII in more cases

* Make Space a const

* Use WriteWhiteSpace extension to eliminate possible virtual call

* Use ASCII when encoding constrained character subset

* Simplify pragmas

* Revert Whitespace rename

* Fix using statement order

* Remove obsolete serialization support on .NET

* Remove obsolete serialization support on .NET (part 2)
2024-05-03 07:36:19 +01:00

122 lines
3.5 KiB
C#

namespace UglyToad.PdfPig.Tokenization
{
using System;
using System.Text;
using Core;
using Tokens;
#if NET
using System.Text.Unicode;
#endif
/// <summary>
/// Tokenizes a name: a '/' followed by the bytes of the name, in which '#' followed by
/// two hexadecimal digits stands for the single byte with that value.
/// </summary>
internal sealed class NameTokenizer : ITokenizer
{
    static NameTokenizer()
    {
#if NET
        // Registers the code pages provider so the "windows-1252" lookup in TryTokenize can resolve.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
    }

    /// <summary>
    /// Always <c>true</c>: reading stops on the terminating byte after the name, which has
    /// therefore already been consumed from the input when <see cref="TryTokenize"/> returns.
    /// </summary>
    public bool ReadsNextByte { get; } = true;

    /// <summary>
    /// Attempts to read a name starting at <paramref name="currentByte"/>.
    /// </summary>
    /// <param name="currentByte">The byte at the current position; must be '/' for a name to be read.</param>
    /// <param name="inputBytes">The input positioned on <paramref name="currentByte"/>; advanced while reading.</param>
    /// <param name="token">The resulting name token, or <c>null</c> when no name starts here.</param>
    /// <returns><c>true</c> if a name was read; <c>false</c> if <paramref name="currentByte"/> is not '/'.</returns>
    /// <remarks>
    /// An incomplete or invalid escape ('#' not followed by two hex digits) is kept literally,
    /// whether it is cut short by an ordinary byte, another '#', a terminator or the end of the input.
    /// The collected bytes are decoded as UTF-8 when valid, otherwise as windows-1252.
    /// </remarks>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
    {
        token = null;

        if (currentByte != '/')
        {
            return false;
        }

        using var bytes = new ArrayPoolBufferWriter<byte>();

        bool escapeActive = false;
        int postEscapeRead = 0;
        char firstHexDigit = '\0';

        while (inputBytes.MoveNext())
        {
            var b = inputBytes.CurrentByte;

            // The escape state must be examined before the '#' check below; otherwise a '#'
            // that interrupts an unfinished escape would silently discard the pending bytes.
            if (escapeActive)
            {
                if (ReadHelper.IsHex((char)b))
                {
                    if (postEscapeRead == 0)
                    {
                        firstHexDigit = (char)b;
                        postEscapeRead = 1;
                    }
                    else
                    {
                        bytes.Write((byte)(HexValue(firstHexDigit) * 16 + HexValue((char)b)));
                        escapeActive = false;
                        postEscapeRead = 0;
                    }

                    continue;
                }

                // Not a valid escape: emit the swallowed '#' (and hex digit, if any) literally,
                // then fall through so the current byte is handled like any unescaped byte.
                bytes.Write((byte)'#');

                if (postEscapeRead == 1)
                {
                    bytes.Write((byte)firstHexDigit);
                }

                escapeActive = false;
                postEscapeRead = 0;
            }

            if (b == '#')
            {
                escapeActive = true;
            }
            else if (ReadHelper.IsEndOfName(b))
            {
                break;
            }
            else
            {
                bytes.Write(b);
            }
        }

        if (escapeActive)
        {
            // The input ended in the middle of an escape; keep what was read, exactly as is
            // done when a terminator cuts an escape short.
            bytes.Write((byte)'#');

            if (postEscapeRead == 1)
            {
                bytes.Write((byte)firstHexDigit);
            }
        }

#if NET8_0_OR_GREATER
        var byteArray = bytes.WrittenSpan;
        bool isValidUtf8 = Utf8.IsValid(byteArray);
#else
        var byteArray = bytes.WrittenSpan.ToArray();
        bool isValidUtf8 = ReadHelper.IsValidUtf8(byteArray);
#endif

        var str = isValidUtf8
            ? Encoding.UTF8.GetString(byteArray)
            : Encoding.GetEncoding("windows-1252").GetString(byteArray);

        token = NameToken.Create(str);

        return true;
    }

    /// <summary>
    /// Converts a hexadecimal digit character to its numeric value without any culture-sensitive casing.
    /// Callers must pass a character already accepted by <c>ReadHelper.IsHex</c>.
    /// </summary>
    private static int HexValue(char hexDigit)
    {
        if (hexDigit <= '9')
        {
            return hexDigit - '0';
        }

        // Setting bit 0x20 folds 'A'-'F' onto 'a'-'f'.
        return (hexDigit | 0x20) - 'a' + 10;
    }
}
}