修复WordTree.addWord末尾为特殊字符导致的无法匹配问题(pr#4092@Github)

This commit is contained in:
Looly
2025-10-10 22:47:20 +08:00
parent 665b0b8298
commit bf2509f9bc
2 changed files with 59 additions and 24 deletions

View File

@@ -22,6 +22,7 @@ import cn.hutool.v7.core.map.MapUtil;
import cn.hutool.v7.core.stream.EasyStream;
import cn.hutool.v7.core.text.StrUtil;
import java.io.Serial;
import java.util.*;
import java.util.function.Predicate;
@@ -43,6 +44,7 @@ import java.util.function.Predicate;
* @author Looly
*/
public class WordTree extends HashMap<Character, WordTree> {
@Serial
private static final long serialVersionUID = -4646423269465809276L;
/**
@@ -145,20 +147,23 @@ public class WordTree extends HashMap<Character, WordTree> {
WordTree parent = null;
WordTree current = this;
WordTree child;
char currentChar = 0;
final int length = word.length();
for (int i = 0; i < length; i++) {
currentChar = word.charAt(i);
Character lastAcceptedChar = null;
for (final char c : word.toCharArray()) {
//只处理合法字符
if (charFilter.test(currentChar)) {
if (charFilter.test(c)) {
//无子节点时,新建一个子节点,用于存放下一个字符;子节点的同级节点不会太多,默认容量为1
child = current.computeIfAbsent(currentChar, c -> new WordTree(1));
child = current.computeIfAbsent(c, character -> new WordTree(1));
parent = current;
current = child;
lastAcceptedChar = c;
}
}
// 仅当存在父节点且存在非停顿词时,才设置词尾标记
// 当 null != parent 条件成立时,lastAcceptedChar != null 必然成立,故也可以省去
if (null != parent) {
parent.setEnd(currentChar);
parent.setEnd(lastAcceptedChar);
}
return this;
}
@@ -302,7 +307,7 @@ public class WordTree extends HashMap<Character, WordTree> {
for (int j = i; j < length; j++) {
currentChar = text.charAt(j);
if (!charFilter.test(currentChar)) {
if (wordBuffer.length() > 0) {
if (!wordBuffer.isEmpty()) {
//做为关键词中间的停顿词被当作关键词的一部分被返回
wordBuffer.append(currentChar);
} else {

View File

@@ -24,6 +24,8 @@ import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
* DFA单元测试
*
@@ -44,7 +46,7 @@ public class DfaTest {
// 匹配到【大】,就不再继续匹配了,因此【大土豆】不匹配
// 匹配到【刚出锅】,就跳过这三个字了,因此【出锅】不匹配(由于刚首先被匹配,因此长的被匹配,最短匹配只针对第一个字相同选最短)
final List<String> matchAll = tree.matchAll(text, -1, false, false);
Assertions.assertEquals(matchAll, ListUtil.of("", "土^豆", "刚出锅"));
assertEquals(matchAll, ListUtil.of("", "土^豆", "刚出锅"));
}
/**
@@ -60,7 +62,7 @@ public class DfaTest {
// 【大】被匹配,最短匹配原则【大土豆】被跳过,【土豆】继续被匹配
// 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配
final List<String> matchAll = tree.matchAll(text, -1, true, false);
Assertions.assertEquals(ListUtil.of("", "土^豆", "刚出锅", "出锅"), matchAll);
assertEquals(ListUtil.of("", "土^豆", "刚出锅", "出锅"), matchAll);
}
/**
@@ -78,7 +80,7 @@ public class DfaTest {
// 匹配到【大】和【大土豆】,最长匹配则保留【大土豆】,非密集匹配,【土豆】跳过。
// 由于【刚出锅】被匹配,由于非密集匹配,【出锅】被跳过
final List<String> matchAll = tree.matchAll(text, -1, false, true);
Assertions.assertEquals(ListUtil.of("大土^豆", "刚出锅"), matchAll);
assertEquals(ListUtil.of("大土^豆", "刚出锅"), matchAll);
}
/**
@@ -96,7 +98,7 @@ public class DfaTest {
// 匹配到【大】和【大土豆】,由于到最长匹配,因此【大土豆】保留,由于不跳过已经匹配的关键词,【土豆】继续被匹配
// 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配
final List<String> matchAll = tree.matchAll(text, -1, true, true);
Assertions.assertEquals(ListUtil.of("大土^豆", "土^豆", "刚出锅", "出锅"), matchAll);
assertEquals(ListUtil.of("大土^豆", "土^豆", "刚出锅", "出锅"), matchAll);
}
@@ -111,11 +113,11 @@ public class DfaTest {
tree.addWord("赵阿三");
final List<FoundWord> result = tree.matchAllWords("赵阿三在做什么", -1, true, true);
Assertions.assertEquals(1, result.size());
assertEquals(1, result.size());
Assertions.assertEquals("赵阿三", result.get(0).getWord());
Assertions.assertEquals(0, result.get(0).getBeginIndex().intValue());
Assertions.assertEquals(2, result.get(0).getEndIndex().intValue());
assertEquals("赵阿三", result.get(0).getWord());
assertEquals(0, result.get(0).getBeginIndex().intValue());
assertEquals(2, result.get(0).getEndIndex().intValue());
}
/**
@@ -127,7 +129,7 @@ public class DfaTest {
tree.addWord("tio");
final List<String> all = tree.matchAll("AAAAAAAt-ioBBBBBBB");
Assertions.assertEquals(all, ListUtil.of("t-io"));
assertEquals(all, ListUtil.of("t-io"));
}
@Test
@@ -136,7 +138,7 @@ public class DfaTest {
tree.addWord("women");
final String text = "a WOMEN todo.".toLowerCase();
final List<String> matchAll = tree.matchAll(text, -1, false, false);
Assertions.assertEquals("[women]", matchAll.toString());
assertEquals("[women]", matchAll.toString());
}
@Test
@@ -188,7 +190,7 @@ public class DfaTest {
final String text = "This is test Service: UserServiceImpl UserServiceTest...";
final List<String> strings = wordTree.matchAll(text, -1, false, true);
Assertions.assertEquals("[UserServiceImpl, UserService]", strings.toString());
assertEquals("[UserServiceImpl, UserService]", strings.toString());
}
/**
@@ -201,19 +203,19 @@ public class DfaTest {
// 非密集,非贪婪
List<String> strings = wordTree.matchAll("abab", -1, false, false);
Assertions.assertEquals("[ab, ab]", strings.toString());
assertEquals("[ab, ab]", strings.toString());
// 密集,非贪婪
strings = wordTree.matchAll("abab", -1, true, false);
Assertions.assertEquals("[ab, b, ab, b]", strings.toString());
assertEquals("[ab, b, ab, b]", strings.toString());
// 非密集,贪婪
strings = wordTree.matchAll("abab", -1, false, true);
Assertions.assertEquals("[ab, ab]", strings.toString());
assertEquals("[ab, ab]", strings.toString());
// 密集,贪婪
strings = wordTree.matchAll("abab", -1, true, true);
Assertions.assertEquals("[ab, b, ab, b]", strings.toString());
assertEquals("[ab, b, ab, b]", strings.toString());
}
@Test
@@ -223,6 +225,34 @@ public class DfaTest {
wordTree.addWords(list);
final List<String> flattened = wordTree.flatten();
flattened.sort(Comparator.comparingInt(list::indexOf));
Assertions.assertEquals(list, flattened);
assertEquals(list, flattened);
}
/**
 * Regression test for GitHub issue #4091.
 * Verifies that when a keyword ends with a stop character, its valid prefix
 * can still be matched.
 */
@Test
public void addWordWithTrailingFilteredCharTest() {
	// The keyword ends with the stop character '('.
	final WordTree dictionary = new WordTree();
	dictionary.addWord("hello(");

	final List<String> found = dictionary.matchAll("hello", -1);

	// Exactly one hit is expected, and it must be the prefix without the stop character.
	assertEquals(1, found.size());
	assertEquals("hello", found.get(0));
}
/**
 * Regression test for GitHub issue #4091.
 * Covers a keyword that contains a stop character in the middle.
 */
@Test
public void addWordWithMiddleFilteredCharTest() {
	// The '(' in the middle of the keyword is filtered out when the word is added.
	final WordTree dictionary = new WordTree();
	dictionary.addWord("he(llo");

	final List<String> found = dictionary.matchAll("hello", -1);

	// Exactly one hit is expected, equal to the text that was searched.
	assertEquals(1, found.size());
	assertEquals("hello", found.get(0));
}
}