Skip control chars in CoreTokenScanner.MoveNext() and fix #1048

This commit is contained in:
BobLd 2025-05-27 19:30:37 +01:00
parent 67d3dde04a
commit ca9f70ffb0
3 changed files with 19 additions and 1 deletion

View File

@ -7,6 +7,24 @@
public class GithubIssuesTests
{
[Fact]
public void Issue1048()
{
    // Regression test for GitHub issue #1048: with lenient parsing enabled,
    // "InvalidCast.pdf" must open and page 1 must produce exactly one text
    // block with the expected content.
    var documentPath = IntegrationHelpers.GetSpecificTestDocumentPath("InvalidCast.pdf");
    var lenientOptions = new ParsingOptions { UseLenientParsing = true };

    using (var pdf = PdfDocument.Open(documentPath, lenientOptions))
    {
        var firstPage = pdf.GetPage(1);
        Assert.NotNull(firstPage.Letters);

        // Layout pipeline: letters -> words -> blocks.
        var extractedWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);
        var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(extractedWords);

        Assert.Single(textBlocks);
        Assert.Equal("hey, i'm a bug.", textBlocks[0].Text);
    }
}
[Fact]
public void Issue554()
{

View File

@ -113,7 +113,7 @@
if (tokenizer == null)
{
if (ReadHelper.IsWhitespace(currentByte))
if (ReadHelper.IsWhitespace(currentByte) || char.IsControl(c))
{
isSkippingSymbol = false;
continue;