修复WordTree.addWord末尾为特殊字符导致的无法匹配问题（pr#4092@Github）

2026-02-09 09:16:26 +08:00 · 2025-10-10 22:47:20 +08:00
parent 665b0b8298
commit bf2509f9bc
2 changed files with 59 additions and 24 deletions
--- a/hutool-core/src/main/java/cn/hutool/v7/core/text/dfa/WordTree.java
+++ b/hutool-core/src/main/java/cn/hutool/v7/core/text/dfa/WordTree.java
@@ -22,6 +22,7 @@ import cn.hutool.v7.core.map.MapUtil;
 import cn.hutool.v7.core.stream.EasyStream;
 import cn.hutool.v7.core.text.StrUtil;

+import java.io.Serial;
 import java.util.*;
 import java.util.function.Predicate;

@@ -43,6 +44,7 @@ import java.util.function.Predicate;
 * @author Looly
 */
 public class WordTree extends HashMap<Character, WordTree> {
+	@Serial
 	private static final long serialVersionUID = -4646423269465809276L;

 	/**
@@ -145,20 +147,23 @@ public class WordTree extends HashMap<Character, WordTree> {
 		WordTree parent = null;
 		WordTree current = this;
 		WordTree child;
-		char currentChar = 0;
-		final int length = word.length();
-		for (int i = 0; i < length; i++) {
-			currentChar = word.charAt(i);
+		Character lastAcceptedChar = null;
+
+		for (final char c : word.toCharArray()) {
 			//只处理合法字符
-			if (charFilter.test(currentChar)) {
+			if (charFilter.test(c)) {
 				//无子节点，新建一个子节点后存放下一个字符，子节点的同级节点不会有太多同级节点，默认1个
-				child = current.computeIfAbsent(currentChar, c -> new WordTree(1));
+				child = current.computeIfAbsent(c, character -> new WordTree(1));
 				parent = current;
 				current = child;
+				lastAcceptedChar = c;
 			}
 		}
+
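+		// Mark the end of the word only if at least one accepted (non-stop) character was inserted.
+		// parent and lastAcceptedChar are assigned in the same branch, so parent != null implies lastAcceptedChar != null; no separate null check on lastAcceptedChar is needed.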
 		if (null != parent) {
-			parent.setEnd(currentChar);
+			parent.setEnd(lastAcceptedChar);
 		}
 		return this;
 	}
@@ -302,7 +307,7 @@ public class WordTree extends HashMap<Character, WordTree> {
 			for (int j = i; j < length; j++) {
 				currentChar = text.charAt(j);
 				if (!charFilter.test(currentChar)) {
-					if (wordBuffer.length() > 0) {
+					if (!wordBuffer.isEmpty()) {
 						//做为关键词中间的停顿词被当作关键词的一部分被返回
 						wordBuffer.append(currentChar);
 					} else {
--- a/hutool-core/src/test/java/cn/hutool/v7/core/text/dfa/DfaTest.java
+++ b/hutool-core/src/test/java/cn/hutool/v7/core/text/dfa/DfaTest.java
@@ -24,6 +24,8 @@ import java.util.Arrays;
 import java.util.Comparator;
 import java.util.List;

+import static org.junit.jupiter.api.Assertions.assertEquals;
+
 /**
 * DFA单元测试
 *
@@ -44,7 +46,7 @@ public class DfaTest {
 		// 匹配到【大】，就不再继续匹配了，因此【大土豆】不匹配
 		// 匹配到【刚出锅】，就跳过这三个字了，因此【出锅】不匹配（由于刚首先被匹配，因此长的被匹配，最短匹配只针对第一个字相同选最短）
 		final List<String> matchAll = tree.matchAll(text, -1, false, false);
-		Assertions.assertEquals(matchAll, ListUtil.of("大", "土^豆", "刚出锅"));
+		assertEquals(matchAll, ListUtil.of("大", "土^豆", "刚出锅"));
 	}

 	/**
@@ -60,7 +62,7 @@ public class DfaTest {
 		// 【大】被匹配，最短匹配原则【大土豆】被跳过，【土豆继续被匹配】
 		// 【刚出锅】被匹配，由于不跳过已经匹配的词，【出锅】被匹配
 		final List<String> matchAll = tree.matchAll(text, -1, true, false);
-		Assertions.assertEquals(ListUtil.of("大", "土^豆", "刚出锅", "出锅"), matchAll);
+		assertEquals(ListUtil.of("大", "土^豆", "刚出锅", "出锅"), matchAll);
 	}

 	/**
@@ -78,7 +80,7 @@ public class DfaTest {
 		// 匹配到【大】和【大土豆】，最长匹配则保留【大土豆】，非密集匹配，【土豆】跳过。
 		// 由于【刚出锅】被匹配，由于非密集匹配，【出锅】被跳过
 		final List<String> matchAll = tree.matchAll(text, -1, false, true);
-		Assertions.assertEquals(ListUtil.of("大土^豆", "刚出锅"), matchAll);
+		assertEquals(ListUtil.of("大土^豆", "刚出锅"), matchAll);
 	}

 	/**
@@ -96,7 +98,7 @@ public class DfaTest {
 		// 匹配到【大】和【大土豆】，由于到最长匹配，因此【大土豆】保留，由于不跳过已经匹配的关键词，【土豆】继续被匹配
 		// 【刚出锅】被匹配，由于不跳过已经匹配的词，【出锅】被匹配
 		final List<String> matchAll = tree.matchAll(text, -1, true, true);
-		Assertions.assertEquals(ListUtil.of("大土^豆", "土^豆", "刚出锅", "出锅"), matchAll);
+		assertEquals(ListUtil.of("大土^豆", "土^豆", "刚出锅", "出锅"), matchAll);

 	}

@@ -111,11 +113,11 @@ public class DfaTest {
 		tree.addWord("赵阿三");

 		final List<FoundWord> result = tree.matchAllWords("赵阿三在做什么", -1, true, true);
-		Assertions.assertEquals(1, result.size());
+		assertEquals(1, result.size());

-		Assertions.assertEquals("赵阿三", result.get(0).getWord());
-		Assertions.assertEquals(0, result.get(0).getBeginIndex().intValue());
-		Assertions.assertEquals(2, result.get(0).getEndIndex().intValue());
+		assertEquals("赵阿三", result.get(0).getWord());
+		assertEquals(0, result.get(0).getBeginIndex().intValue());
+		assertEquals(2, result.get(0).getEndIndex().intValue());
 	}

 	/**
@@ -127,7 +129,7 @@ public class DfaTest {
 		tree.addWord("tio");

 		final List<String> all = tree.matchAll("AAAAAAAt-ioBBBBBBB");
-		Assertions.assertEquals(all, ListUtil.of("t-io"));
+		assertEquals(all, ListUtil.of("t-io"));
 	}

 	@Test
@@ -136,7 +138,7 @@ public class DfaTest {
 		tree.addWord("women");
 		final String text = "a WOMEN todo.".toLowerCase();
 		final List<String> matchAll = tree.matchAll(text, -1, false, false);
-		Assertions.assertEquals("[women]", matchAll.toString());
+		assertEquals("[women]", matchAll.toString());
 	}

 	@Test
@@ -188,7 +190,7 @@ public class DfaTest {

 		final String text = "This is test Service: UserServiceImpl UserServiceTest...";
 		final List<String> strings = wordTree.matchAll(text, -1, false, true);
-		Assertions.assertEquals("[UserServiceImpl, UserService]", strings.toString());
+		assertEquals("[UserServiceImpl, UserService]", strings.toString());
 	}

 	/**
@@ -201,19 +203,19 @@ public class DfaTest {

 		// 非密集，非贪婪
 		List<String> strings = wordTree.matchAll("abab", -1, false, false);
-		Assertions.assertEquals("[ab, ab]", strings.toString());
+		assertEquals("[ab, ab]", strings.toString());

 		// 密集，非贪婪
 		strings = wordTree.matchAll("abab", -1, true, false);
-		Assertions.assertEquals("[ab, b, ab, b]", strings.toString());
+		assertEquals("[ab, b, ab, b]", strings.toString());

 		// 非密集，贪婪
 		strings = wordTree.matchAll("abab", -1, false, true);
-		Assertions.assertEquals("[ab, ab]", strings.toString());
+		assertEquals("[ab, ab]", strings.toString());

 		// 密集，贪婪
 		strings = wordTree.matchAll("abab", -1, true, true);
-		Assertions.assertEquals("[ab, b, ab, b]", strings.toString());
+		assertEquals("[ab, b, ab, b]", strings.toString());
 	}

 	@Test
@@ -223,6 +225,34 @@ public class DfaTest {
 		wordTree.addWords(list);
 		final List<String> flattened = wordTree.flatten();
 		flattened.sort(Comparator.comparingInt(list::indexOf));
-		Assertions.assertEquals(list, flattened);
+		assertEquals(list, flattened);
+	}
+
+	/**
+	 * Github Issue #4091
+	 * 测试当关键词以停顿词结尾时，其合法前缀是否能被正确匹配
+	 */
+	@Test
+	public void addWordWithTrailingFilteredCharTest() {
+		final WordTree tree = new WordTree();
+		tree.addWord("hello("); // 以停顿词 '(' 结尾
+
+		final List<String> matches = tree.matchAll("hello", -1);
+		assertEquals(1, matches.size());
+		assertEquals("hello", matches.get(0));
+	}
+
+	/**
+	 * Github Issue #4091
+	 * 测试关键词中间包含停顿词的情况
+	 */
+	@Test
+	public void addWordWithMiddleFilteredCharTest() {
+		final WordTree tree = new WordTree();
+		tree.addWord("he(llo"); // 中间 '(' 被过滤
+
+		final List<String> matches = tree.matchAll("hello", -1);
+		assertEquals(1, matches.size());
+		assertEquals("hello", matches.get(0));
 	}
 }