mirror of
				https://gitee.com/dromara/hutool.git
				synced 2025-10-25 02:09:19 +08:00 
			
		
		
		
	AC自动机实现
This commit is contained in:
		| @@ -43,10 +43,7 @@ import java.util.HashSet; | ||||
| import java.util.LinkedList; | ||||
| import java.util.List; | ||||
| import java.util.Set; | ||||
| import java.util.function.Function; | ||||
| import java.util.function.Predicate; | ||||
| import java.util.function.Supplier; | ||||
| import java.util.function.UnaryOperator; | ||||
| import java.util.function.*; | ||||
| import java.util.regex.Matcher; | ||||
| import java.util.regex.Pattern; | ||||
|  | ||||
| @@ -1476,9 +1473,9 @@ public class CharSequenceUtil extends StrValidator { | ||||
| 	 * } | ||||
| 	 * </pre> | ||||
| 	 * | ||||
| 	 * @param str    被处理的字符串 | ||||
| 	 * @param prefix 前缀 | ||||
| 	 * @param suffix 后缀 | ||||
| 	 * @param str        被处理的字符串 | ||||
| 	 * @param prefix     前缀 | ||||
| 	 * @param suffix     后缀 | ||||
| 	 * @param ignoreCase 是否忽略大小写 | ||||
| 	 * @return 处理后的字符串 | ||||
| 	 * @since 3.1.2 | ||||
| @@ -1494,17 +1491,17 @@ public class CharSequenceUtil extends StrValidator { | ||||
|  | ||||
| 		if (startWith(str2, prefix, ignoreCase)) { | ||||
| 			from = prefix.length(); | ||||
| 			if(from == to){ | ||||
| 			if (from == to) { | ||||
| 				// "a", "a", "a"  -> "" | ||||
| 				return EMPTY; | ||||
| 			} | ||||
| 		} | ||||
| 		if (endWith(str2, suffix, ignoreCase)) { | ||||
| 			to -= suffix.length(); | ||||
| 			if(from == to){ | ||||
| 			if (from == to) { | ||||
| 				// "a", "a", "a"  -> "" | ||||
| 				return EMPTY; | ||||
| 			} else if(to < from){ | ||||
| 			} else if (to < from) { | ||||
| 				// pre去除后和suffix有重叠,如 ("aba", "ab", "ba") -> "a" | ||||
| 				to += suffix.length(); | ||||
| 			} | ||||
| @@ -1574,22 +1571,22 @@ public class CharSequenceUtil extends StrValidator { | ||||
| 		int from = 0; | ||||
| 		int to = str2.length(); | ||||
|  | ||||
| 		if(!prefixStr.isEmpty()){ | ||||
| 		if (!prefixStr.isEmpty()) { | ||||
| 			while (str2.startsWith(prefixStr, from)) { | ||||
| 				from += prefix.length(); | ||||
| 				if(from == to){ | ||||
| 				if (from == to) { | ||||
| 					// "a", "a", "a"  -> "" | ||||
| 					return EMPTY; | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 		if(!suffixStr.isEmpty()){ | ||||
| 		if (!suffixStr.isEmpty()) { | ||||
| 			while (str2.startsWith(suffixStr, to - suffixStr.length())) { | ||||
| 				to -= suffixStr.length(); | ||||
| 				if(from == to){ | ||||
| 				if (from == to) { | ||||
| 					// "a", "a", "a"  -> "" | ||||
| 					return EMPTY; | ||||
| 				}else if(to < from){ | ||||
| 				} else if (to < from) { | ||||
| 					// pre去除后和suffix有重叠,如 ("aba", "ab", "ba") -> "a" | ||||
| 					to += suffixStr.length(); | ||||
| 					break; | ||||
| @@ -1730,7 +1727,7 @@ public class CharSequenceUtil extends StrValidator { | ||||
|  | ||||
| 		final StringBuilder sb = new StringBuilder(); | ||||
| 		final int subLen = toIndex - fromIndex; | ||||
| 		str.toString().codePoints().skip(fromIndex).limit(subLen).forEach(v -> sb.append(Character.toChars(v))); | ||||
| 		str.codePoints().skip(fromIndex).limit(subLen).forEach(v -> sb.append(Character.toChars(v))); | ||||
| 		return sb.toString(); | ||||
| 	} | ||||
|  | ||||
| @@ -4131,4 +4128,28 @@ public class CharSequenceUtil extends StrValidator { | ||||
| 		} | ||||
| 		return (isCodePoint ? str.codePoints() : str.chars()).toArray(); | ||||
| 	} | ||||
|  | ||||
| 	/** | ||||
| 	 * 遍历字符串的每个字符,并处理 | ||||
| 	 * | ||||
| 	 * @param str      字符串 | ||||
| 	 * @param consumer 字符处理 | ||||
| 	 */ | ||||
| 	public static void forEach(final CharSequence str, final Consumer<Character> consumer) { | ||||
| 		forEach(str, false, (cInt)-> consumer.accept((char) cInt)); | ||||
| 	} | ||||
|  | ||||
| 	/** | ||||
| 	 * 遍历字符串的每个字符,并处理 | ||||
| 	 * | ||||
| 	 * @param str         字符串 | ||||
| 	 * @param isCodePoint 是否为Unicode码点(即支持emoji等多char字符) | ||||
| 	 * @param consumer    字符处理 | ||||
| 	 */ | ||||
| 	public static void forEach(final CharSequence str, final boolean isCodePoint, final IntConsumer consumer) { | ||||
| 		if (null == str) { | ||||
| 			return; | ||||
| 		} | ||||
| 		(isCodePoint ? str.codePoints() : str.chars()).forEach(consumer); | ||||
| 	} | ||||
| } | ||||
|   | ||||
| @@ -1,51 +1,56 @@ | ||||
| package org.dromara.hutool.core.text.finder; | ||||
|  | ||||
| import org.dromara.hutool.core.text.StrUtil; | ||||
|  | ||||
| import java.util.*; | ||||
|  | ||||
| /** | ||||
|  * 多字符串查询器 底层思路 使用 AC 自动机实现 | ||||
|  * | ||||
|  * @author newshiJ | ||||
|  * @date 2024/8/2 上午10:07 | ||||
|  */ | ||||
| public class MultiStrFinder { | ||||
|  | ||||
| 	// 字符索引 | ||||
| 	protected final Map<Character,Integer> charIndex = new HashMap<>(); | ||||
| 	/** | ||||
| 	 * 创建多字符串查询器 | ||||
| 	 * @param source 字符串集合 | ||||
| 	 * @return 多字符串查询器 | ||||
| 	 */ | ||||
| 	public static MultiStrFinder of(final Collection<String> source) { | ||||
| 		return new MultiStrFinder(source); | ||||
| 	} | ||||
|  | ||||
| 	// 字符索引 | ||||
| 	protected final Map<Character, Integer> charIndexMap = new HashMap<>(); | ||||
| 	// 全部字符数量 | ||||
| 	protected final int allCharSize; | ||||
|  | ||||
| 	// 根节点 | ||||
| 	protected final Node root; | ||||
|  | ||||
| 	// 全部节点数量 | ||||
| 	int nodeSize; | ||||
|  | ||||
| 	/** | ||||
| 	 * 构建多字符串查询器 | ||||
| 	 * @param source | ||||
| 	 * | ||||
| 	 * @param source 字符串集合 | ||||
| 	 */ | ||||
| 	public MultiStrFinder(Collection<String> source){ | ||||
| 	public MultiStrFinder(final Collection<String> source) { | ||||
| 		// 待匹配的字符串 | ||||
| 		final Set<String> stringSet = new HashSet<>(); | ||||
|  | ||||
| 		// 所有字符 | ||||
| 		final Set<Character> charSet = new HashSet<>(); | ||||
| 		for (String string : source) { | ||||
| 		for (final String string : source) { | ||||
| 			stringSet.add(string); | ||||
| 			char[] charArray = string.toCharArray(); | ||||
| 			for (char c : charArray) { | ||||
| 				charSet.add(c); | ||||
| 			} | ||||
| 			StrUtil.forEach(string, charSet::add); | ||||
| 		} | ||||
| 		allCharSize = charSet.size(); | ||||
| 		int index = 0; | ||||
| 		for (Character c : charSet) { | ||||
| 			charIndex.put(c,index); | ||||
| 		for (final Character c : charSet) { | ||||
| 			charIndexMap.put(c,index); | ||||
| 			index ++; | ||||
| 		} | ||||
|  | ||||
| 		root = Node.createRoot(allCharSize); | ||||
| 		this.root = Node.createRoot(index); | ||||
|  | ||||
| 		buildPrefixTree(stringSet); | ||||
| 		buildFail(); | ||||
| @@ -53,19 +58,18 @@ public class MultiStrFinder { | ||||
|  | ||||
| 	/** | ||||
| 	 * 构建前缀树 | ||||
| 	 * | ||||
| 	 * @param stringSst 待匹配的字符串 | ||||
| 	 */ | ||||
| 	protected void buildPrefixTree(Collection<String> stringSst){ | ||||
| 	protected void buildPrefixTree(final Collection<String> stringSst) { | ||||
| 		// 节点编号 根节点已经是0了 所以从 1开始编号 | ||||
| 		int nodeIndex = 1; | ||||
| 		for (String string : stringSst) { | ||||
| 		for (final String string : stringSst) { | ||||
| 			Node node = root; | ||||
| 			char[] charArray = string.toCharArray(); | ||||
| 			for (int i = 0; i < charArray.length; i++) { | ||||
| 				char c = charArray[i]; | ||||
| 				boolean addValue = node.addValue(c, nodeIndex, charIndex); | ||||
| 				if(addValue){ | ||||
| 					nodeIndex ++; | ||||
| 			for (final char c : string.toCharArray()) { | ||||
| 				final boolean addValue = node.addValue(c, nodeIndex, charIndexMap); | ||||
| 				if (addValue) { | ||||
| 					nodeIndex++; | ||||
| 				} | ||||
| 				node = node.directRouter[getIndex(c)]; | ||||
| 			} | ||||
| @@ -78,11 +82,11 @@ public class MultiStrFinder { | ||||
| 	 * 构建 fail指针过程 | ||||
| 	 * 构建 directRouter 直接访问路由表 减少跳fail次数 直接跳 router 边 | ||||
| 	 */ | ||||
| 	protected void buildFail(){ | ||||
| 		LinkedList<Node> nodeQueue = new LinkedList<>(); | ||||
| 	protected void buildFail() { | ||||
| 		final LinkedList<Node> nodeQueue = new LinkedList<>(); | ||||
| 		for (int i = 0; i < root.directRouter.length; i++) { | ||||
| 			Node nextNode = root.directRouter[i]; | ||||
| 			if(nextNode == null){ | ||||
| 			final Node nextNode = root.directRouter[i]; | ||||
| 			if (nextNode == null) { | ||||
| 				root.directRouter[i] = root; | ||||
| 				continue; | ||||
| 			} | ||||
| @@ -91,13 +95,13 @@ public class MultiStrFinder { | ||||
| 		} | ||||
|  | ||||
| 		// 进行广度优先遍历 | ||||
| 		while (!nodeQueue.isEmpty()){ | ||||
| 			Node parent = nodeQueue.removeFirst(); | ||||
| 		while (!nodeQueue.isEmpty()) { | ||||
| 			final Node parent = nodeQueue.removeFirst(); | ||||
| 			// 因为 使用了 charIndexMap 进行字符到下标的映射 i 可以直接认为就是对应字符 char | ||||
| 			for (int i = 0; i < parent.directRouter.length; i++) { | ||||
| 				Node child = parent.directRouter[i]; | ||||
| 				final Node child = parent.directRouter[i]; | ||||
| 				// child 为 null 表示没有子节点 | ||||
| 				if(child == null){ | ||||
| 				if (child == null) { | ||||
| 					parent.directRouter[i] = parent.fail.directRouter[i]; | ||||
| 					continue; | ||||
| 				} | ||||
| @@ -110,27 +114,28 @@ public class MultiStrFinder { | ||||
|  | ||||
| 	/** | ||||
| 	 * 查询匹配的字符串 | ||||
| 	 * | ||||
| 	 * @param text 待查询的文本;返回结果中 key 是每个匹配到的字符串,value 是其首字符在文本中的下标列表 | ||||
| 	 * @return | ||||
| 	 * @return 匹配结果 | ||||
| 	 */ | ||||
| 	public Map<String,List<Integer>> findMatch(String text){ | ||||
| 	public Map<String, List<Integer>> findMatch(final String text) { | ||||
| 		// 节点经过次数 放在方法内部声明变量 希望可以一个构建对象 进行多次匹配 | ||||
| 		HashMap<String, List<Integer>> resultMap = new HashMap<>(); | ||||
| 		final HashMap<String, List<Integer>> resultMap = new HashMap<>(); | ||||
|  | ||||
| 		char[] chars = text.toCharArray(); | ||||
| 		final char[] chars = text.toCharArray(); | ||||
| 		Node currentNode = root; | ||||
| 		for (int i = 0; i < chars.length; i++) { | ||||
| 			char c = chars[i]; | ||||
| 			Integer index = charIndex.get(c); | ||||
| 			final char c = chars[i]; | ||||
| 			final Integer index = charIndexMap.get(c); | ||||
| 			// 找不到字符索引 认为一定不在匹配字符中存在 直接从根节点开始重新计算 | ||||
| 			if(index == null){ | ||||
| 			if (index == null) { | ||||
| 				currentNode = root; | ||||
| 				continue; | ||||
| 			} | ||||
| 			// 进入下一跳 可能是正常下一跳 也可能是fail加上后的 下一跳 | ||||
| 			currentNode = currentNode.directRouter[index]; | ||||
| 			// 判断是否尾部节点 是尾节点 说明已经匹配到了完整的字符串 将匹配结果写入返回对象 | ||||
| 			if(currentNode.isEnd){ | ||||
| 			if (currentNode.isEnd) { | ||||
| 				resultMap.computeIfAbsent(currentNode.tagetString, k -> new ArrayList<>()) | ||||
| 					.add(i - currentNode.tagetString.length() + 1); | ||||
| 			} | ||||
| @@ -143,22 +148,19 @@ public class MultiStrFinder { | ||||
|  | ||||
| 	/** | ||||
| 	 * 获取字符 下标 | ||||
| 	 * @param c | ||||
| 	 * @return | ||||
| 	 * | ||||
| 	 * @param c 字符 | ||||
| 	 * @return 下标 | ||||
| 	 */ | ||||
| 	protected int getIndex(char c){ | ||||
| 		Integer i = charIndex.get(c); | ||||
| 		if(i == null){ | ||||
| 	protected int getIndex(final char c) { | ||||
| 		final Integer i = charIndexMap.get(c); | ||||
| 		if (i == null) { | ||||
| 			return -1; | ||||
| 		} | ||||
| 		return i; | ||||
| 	} | ||||
|  | ||||
|  | ||||
| 	public static MultiStrFinder create(Collection<String> source){ | ||||
| 		return new MultiStrFinder(source); | ||||
| 	} | ||||
|  | ||||
| 	/** | ||||
| 	 * AC 自动机节点 | ||||
| 	 */ | ||||
| @@ -189,19 +191,21 @@ public class MultiStrFinder { | ||||
| 		// fail指针来源 | ||||
| 		public List<Node> failPre = new ArrayList<>(); | ||||
|  | ||||
| 		public Node(){} | ||||
| 		public Node() { | ||||
| 		} | ||||
|  | ||||
| 		/** | ||||
| 		 * 新增子节点 | ||||
| 		 * @param c 字符 | ||||
| 		 * | ||||
| 		 * @param c         字符 | ||||
| 		 * @param nodeIndex 节点编号 | ||||
| 		 * @param charIndex 字符索引 | ||||
| 		 * @return 如果已经存在子节点 false 新增 true | ||||
| 		 */ | ||||
| 		public boolean addValue(char c, int nodeIndex ,Map<Character,Integer> charIndex){ | ||||
| 			Integer index = charIndex.get(c); | ||||
| 		public boolean addValue(final char c, final int nodeIndex, final Map<Character, Integer> charIndex) { | ||||
| 			final Integer index = charIndex.get(c); | ||||
| 			Node node = directRouter[index]; | ||||
| 			if(node != null){ | ||||
| 			if (node != null) { | ||||
| 				return false; | ||||
| 			} | ||||
| 			node = new Node(); | ||||
| @@ -214,22 +218,24 @@ public class MultiStrFinder { | ||||
|  | ||||
| 		/** | ||||
| 		 * 标记当前节点为 字符串尾节点 | ||||
| 		 * | ||||
| 		 * @param string | ||||
| 		 */ | ||||
| 		public void setEnd(String string){ | ||||
| 		public void setEnd(final String string) { | ||||
| 			tagetString = string; | ||||
| 			isEnd = true; | ||||
| 		} | ||||
|  | ||||
| 		/** | ||||
| 		 * 获取下一跳 | ||||
| 		 * @param c 字符 | ||||
| 		 * | ||||
| 		 * @param c         字符 | ||||
| 		 * @param charIndex 字符索引 | ||||
| 		 * @return | ||||
| 		 */ | ||||
| 		public Node getNext(char c,Map<Character,Integer> charIndex){ | ||||
| 			Integer index = charIndex.get(c); | ||||
| 			if(index == null){ | ||||
| 		public Node getNext(final char c, final Map<Character, Integer> charIndex) { | ||||
| 			final Integer index = charIndex.get(c); | ||||
| 			if (index == null) { | ||||
| 				return null; | ||||
| 			} | ||||
| 			return directRouter[index]; | ||||
| @@ -237,11 +243,12 @@ public class MultiStrFinder { | ||||
|  | ||||
| 		/** | ||||
| 		 * 构建根节点 | ||||
| 		 * | ||||
| 		 * @param allCharSize 全部字符数量 | ||||
| 		 * @return | ||||
| 		 */ | ||||
| 		public static Node createRoot(int allCharSize){ | ||||
| 			Node node = new Node(); | ||||
| 		public static Node createRoot(final int allCharSize) { | ||||
| 			final Node node = new Node(); | ||||
| 			node.nodeIndex = 0; | ||||
| 			node.fail = node; | ||||
| 			node.directRouter = new Node[allCharSize]; | ||||
|   | ||||
| @@ -14,9 +14,9 @@ import java.util.*; | ||||
|  *  3、"abc", "bc"  会优先替换"abc" | ||||
|  * | ||||
|  * @author newshiJ | ||||
|  * @date 2024/8/2 下午3:41 | ||||
|  */ | ||||
| public class HighMultiReplacerV2 extends StrReplacer { | ||||
| 	private static final long serialVersionUID = 1L; | ||||
|  | ||||
| 	private final AhoCorasickAutomaton ahoCorasickAutomaton; | ||||
|  | ||||
| @@ -49,19 +49,17 @@ public class HighMultiReplacerV2 extends StrReplacer { | ||||
| 	protected static class AhoCorasickAutomaton extends MultiStrFinder{ | ||||
| 		protected final Map<String,String> replaceMap; | ||||
|  | ||||
| 		public AhoCorasickAutomaton(Map<String,String> replaceMap){ | ||||
| 		public AhoCorasickAutomaton(final Map<String,String> replaceMap){ | ||||
| 			super(replaceMap.keySet()); | ||||
| 			this.replaceMap = replaceMap; | ||||
| 		} | ||||
|  | ||||
|  | ||||
| 		public void replace(final CharSequence text, final StringBuilder stringBuilder){ | ||||
| 			Node currentNode = root; | ||||
| 			// 临时字符串存储空间 | ||||
| 			StringBuilder temp = new StringBuilder(); | ||||
| 			final StringBuilder temp = new StringBuilder(); | ||||
| 			for (int i = 0; i < text.length(); i++) { | ||||
| 				char ch = text.charAt(i); | ||||
| 				Integer index = charIndex.get(ch); | ||||
| 				final char ch = text.charAt(i); | ||||
| 				final Integer index = charIndexMap.get(ch); | ||||
| 				// 下一个字符在候选转换字符串中都不存在 ch字符一定不会被替换 | ||||
| 				if(index < 0){ | ||||
| 					// 临时缓存空间中的数据写入到输出的 StringBuilder | ||||
| @@ -94,7 +92,7 @@ public class HighMultiReplacerV2 extends StrReplacer { | ||||
|  | ||||
| 				// 表示匹配到 现在进行字符串替换工作 | ||||
| 				if(currentNode.isEnd){ | ||||
| 					int length = currentNode.tagetString.length(); | ||||
| 					final int length = currentNode.tagetString.length(); | ||||
| 					// 先清理匹配到的字符 最后一个字符未加入临时空间 | ||||
| 					temp.delete(temp.length() - length + 1,length - 1); | ||||
| 					if(temp.length() > 0){ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Looly
					Looly