AC自动机实现

This commit is contained in:
Looly
2024-08-10 09:00:14 +08:00
parent f78f9569e6
commit 1578a46761
3 changed files with 111 additions and 85 deletions

View File

@@ -43,10 +43,7 @@ import java.util.HashSet;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.function.Function; import java.util.function.*;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.function.UnaryOperator;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@@ -1494,17 +1491,17 @@ public class CharSequenceUtil extends StrValidator {
if (startWith(str2, prefix, ignoreCase)) { if (startWith(str2, prefix, ignoreCase)) {
from = prefix.length(); from = prefix.length();
if(from == to){ if (from == to) {
// "a", "a", "a" -> "" // "a", "a", "a" -> ""
return EMPTY; return EMPTY;
} }
} }
if (endWith(str2, suffix, ignoreCase)) { if (endWith(str2, suffix, ignoreCase)) {
to -= suffix.length(); to -= suffix.length();
if(from == to){ if (from == to) {
// "a", "a", "a" -> "" // "a", "a", "a" -> ""
return EMPTY; return EMPTY;
} else if(to < from){ } else if (to < from) {
// pre去除后和suffix有重叠如 ("aba", "ab", "ba") -> "a" // pre去除后和suffix有重叠如 ("aba", "ab", "ba") -> "a"
to += suffix.length(); to += suffix.length();
} }
@@ -1574,22 +1571,22 @@ public class CharSequenceUtil extends StrValidator {
int from = 0; int from = 0;
int to = str2.length(); int to = str2.length();
if(!prefixStr.isEmpty()){ if (!prefixStr.isEmpty()) {
while (str2.startsWith(prefixStr, from)) { while (str2.startsWith(prefixStr, from)) {
from += prefix.length(); from += prefix.length();
if(from == to){ if (from == to) {
// "a", "a", "a" -> "" // "a", "a", "a" -> ""
return EMPTY; return EMPTY;
} }
} }
} }
if(!suffixStr.isEmpty()){ if (!suffixStr.isEmpty()) {
while (str2.startsWith(suffixStr, to - suffixStr.length())) { while (str2.startsWith(suffixStr, to - suffixStr.length())) {
to -= suffixStr.length(); to -= suffixStr.length();
if(from == to){ if (from == to) {
// "a", "a", "a" -> "" // "a", "a", "a" -> ""
return EMPTY; return EMPTY;
}else if(to < from){ } else if (to < from) {
// pre去除后和suffix有重叠如 ("aba", "ab", "ba") -> "a" // pre去除后和suffix有重叠如 ("aba", "ab", "ba") -> "a"
to += suffixStr.length(); to += suffixStr.length();
break; break;
@@ -1730,7 +1727,7 @@ public class CharSequenceUtil extends StrValidator {
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();
final int subLen = toIndex - fromIndex; final int subLen = toIndex - fromIndex;
str.toString().codePoints().skip(fromIndex).limit(subLen).forEach(v -> sb.append(Character.toChars(v))); str.codePoints().skip(fromIndex).limit(subLen).forEach(v -> sb.append(Character.toChars(v)));
return sb.toString(); return sb.toString();
} }
@@ -4131,4 +4128,28 @@ public class CharSequenceUtil extends StrValidator {
} }
return (isCodePoint ? str.codePoints() : str.chars()).toArray(); return (isCodePoint ? str.codePoints() : str.chars()).toArray();
} }
/**
 * Walks the given character sequence one UTF-16 {@code char} at a time and passes
 * each value to the supplied handler.
 * <p>
 * Delegates to {@link #forEach(CharSequence, boolean, IntConsumer)} with code point
 * mode switched off, narrowing every {@code int} value back to a {@code char}.
 *
 * @param str      the character sequence to traverse
 * @param consumer the handler invoked for every {@code char}
 */
public static void forEach(final CharSequence str, final Consumer<Character> consumer) {
	// Adapt the boxed-Character handler to the primitive int callback used by the overload.
	final IntConsumer charAdapter = codeUnit -> consumer.accept((char) codeUnit);
	forEach(str, false, charAdapter);
}
/**
 * Walks the given character sequence and passes every element to the supplied handler.
 * <p>
 * Nothing happens when {@code str} is {@code null}.
 *
 * @param str         the character sequence to traverse
 * @param isCodePoint {@code true} to iterate over Unicode code points (so that characters
 *                    made of several {@code char}s, such as emoji, are delivered as one value);
 *                    {@code false} to iterate over individual {@code char} values
 * @param consumer    the handler invoked for every element
 */
public static void forEach(final CharSequence str, final boolean isCodePoint, final IntConsumer consumer) {
	if (null != str) {
		if (isCodePoint) {
			str.codePoints().forEach(consumer);
		} else {
			str.chars().forEach(consumer);
		}
	}
}
} }

View File

@@ -1,51 +1,56 @@
package org.dromara.hutool.core.text.finder; package org.dromara.hutool.core.text.finder;
import org.dromara.hutool.core.text.StrUtil;
import java.util.*; import java.util.*;
/** /**
* 多字符串查询器 底层思路 使用 AC 自动机实现 * 多字符串查询器 底层思路 使用 AC 自动机实现
*
* @author newshiJ * @author newshiJ
* @date 2024/8/2 上午10:07
*/ */
public class MultiStrFinder { public class MultiStrFinder {
// 字符索引 /**
protected final Map<Character,Integer> charIndex = new HashMap<>(); * 创建多字符串查询器
* @param source 字符串集合
* @return 多字符串查询器
*/
public static MultiStrFinder of(final Collection<String> source) {
	// Static factory: simply forwards the pattern collection to the public constructor.
	final MultiStrFinder finder = new MultiStrFinder(source);
	return finder;
}
// 字符索引
protected final Map<Character, Integer> charIndexMap = new HashMap<>();
// 全部字符数量 // 全部字符数量
protected final int allCharSize; protected final int allCharSize;
// 根节点 // 根节点
protected final Node root; protected final Node root;
// 全部节点数量 // 全部节点数量
int nodeSize; int nodeSize;
/** /**
* 构建多字符串查询器 * 构建多字符串查询器
* @param source *
* @param source 字符串集合
*/ */
public MultiStrFinder(Collection<String> source){ public MultiStrFinder(final Collection<String> source) {
// 待匹配的字符串 // 待匹配的字符串
final Set<String> stringSet = new HashSet<>(); final Set<String> stringSet = new HashSet<>();
// 所有字符 // 所有字符
final Set<Character> charSet = new HashSet<>(); final Set<Character> charSet = new HashSet<>();
for (String string : source) { for (final String string : source) {
stringSet.add(string); stringSet.add(string);
char[] charArray = string.toCharArray(); StrUtil.forEach(string, charSet::add);
for (char c : charArray) {
charSet.add(c);
}
} }
allCharSize = charSet.size(); allCharSize = charSet.size();
int index = 0; int index = 0;
for (Character c : charSet) { for (final Character c : charSet) {
charIndex.put(c,index); charIndexMap.put(c,index);
index ++; index ++;
} }
this.root = Node.createRoot(index);
root = Node.createRoot(allCharSize);
buildPrefixTree(stringSet); buildPrefixTree(stringSet);
buildFail(); buildFail();
@@ -53,19 +58,18 @@ public class MultiStrFinder {
/** /**
* 构建前缀树 * 构建前缀树
*
* @param stringSst 待匹配的字符串 * @param stringSst 待匹配的字符串
*/ */
protected void buildPrefixTree(Collection<String> stringSst){ protected void buildPrefixTree(final Collection<String> stringSst) {
// 节点编号 根节点已经是0了 所以从 1开始编号 // 节点编号 根节点已经是0了 所以从 1开始编号
int nodeIndex = 1; int nodeIndex = 1;
for (String string : stringSst) { for (final String string : stringSst) {
Node node = root; Node node = root;
char[] charArray = string.toCharArray(); for (final char c : string.toCharArray()) {
for (int i = 0; i < charArray.length; i++) { final boolean addValue = node.addValue(c, nodeIndex, charIndexMap);
char c = charArray[i]; if (addValue) {
boolean addValue = node.addValue(c, nodeIndex, charIndex); nodeIndex++;
if(addValue){
nodeIndex ++;
} }
node = node.directRouter[getIndex(c)]; node = node.directRouter[getIndex(c)];
} }
@@ -78,11 +82,11 @@ public class MultiStrFinder {
* 构建 fail指针过程 * 构建 fail指针过程
* 构建 directRouter 直接访问路由表 减少跳fail次数 直接跳 router 边 * 构建 directRouter 直接访问路由表 减少跳fail次数 直接跳 router 边
*/ */
protected void buildFail(){ protected void buildFail() {
LinkedList<Node> nodeQueue = new LinkedList<>(); final LinkedList<Node> nodeQueue = new LinkedList<>();
for (int i = 0; i < root.directRouter.length; i++) { for (int i = 0; i < root.directRouter.length; i++) {
Node nextNode = root.directRouter[i]; final Node nextNode = root.directRouter[i];
if(nextNode == null){ if (nextNode == null) {
root.directRouter[i] = root; root.directRouter[i] = root;
continue; continue;
} }
@@ -91,13 +95,13 @@ public class MultiStrFinder {
} }
// 进行广度优先遍历 // 进行广度优先遍历
while (!nodeQueue.isEmpty()){ while (!nodeQueue.isEmpty()) {
Node parent = nodeQueue.removeFirst(); final Node parent = nodeQueue.removeFirst();
// 因为 使用了 charIndex 进行字符到下标的映射 i 可以直接认为就是对应字符 char // 因为 使用了 charIndex 进行字符到下标的映射 i 可以直接认为就是对应字符 char
for (int i = 0; i < parent.directRouter.length; i++) { for (int i = 0; i < parent.directRouter.length; i++) {
Node child = parent.directRouter[i]; final Node child = parent.directRouter[i];
// child 为 null 表示没有子节点 // child 为 null 表示没有子节点
if(child == null){ if (child == null) {
parent.directRouter[i] = parent.fail.directRouter[i]; parent.directRouter[i] = parent.fail.directRouter[i];
continue; continue;
} }
@@ -110,27 +114,28 @@ public class MultiStrFinder {
/** /**
* 查询匹配的字符串 * 查询匹配的字符串
*
* @param text 返回每个匹配的 字符串 value是字符首字母地址 * @param text 返回每个匹配的 字符串 value是字符首字母地址
* @return * @return 匹配结果
*/ */
public Map<String,List<Integer>> findMatch(String text){ public Map<String, List<Integer>> findMatch(final String text) {
// 节点经过次数 放在方法内部声明变量 希望可以一个构建对象 进行多次匹配 // 节点经过次数 放在方法内部声明变量 希望可以一个构建对象 进行多次匹配
HashMap<String, List<Integer>> resultMap = new HashMap<>(); final HashMap<String, List<Integer>> resultMap = new HashMap<>();
char[] chars = text.toCharArray(); final char[] chars = text.toCharArray();
Node currentNode = root; Node currentNode = root;
for (int i = 0; i < chars.length; i++) { for (int i = 0; i < chars.length; i++) {
char c = chars[i]; final char c = chars[i];
Integer index = charIndex.get(c); final Integer index = charIndexMap.get(c);
// 找不到字符索引 认为一定不在匹配字符中存在 直接从根节点开始重新计算 // 找不到字符索引 认为一定不在匹配字符中存在 直接从根节点开始重新计算
if(index == null){ if (index == null) {
currentNode = root; currentNode = root;
continue; continue;
} }
// 进入下一跳 可能是正常下一跳 也可能是fail加上后的 下一跳 // 进入下一跳 可能是正常下一跳 也可能是fail加上后的 下一跳
currentNode = currentNode.directRouter[index]; currentNode = currentNode.directRouter[index];
// 判断是否尾部节点 是尾节点 说明已经匹配到了完整的字符串 将匹配结果写入返回对象 // 判断是否尾部节点 是尾节点 说明已经匹配到了完整的字符串 将匹配结果写入返回对象
if(currentNode.isEnd){ if (currentNode.isEnd) {
resultMap.computeIfAbsent(currentNode.tagetString, k -> new ArrayList<>()) resultMap.computeIfAbsent(currentNode.tagetString, k -> new ArrayList<>())
.add(i - currentNode.tagetString.length() + 1); .add(i - currentNode.tagetString.length() + 1);
} }
@@ -143,22 +148,19 @@ public class MultiStrFinder {
/** /**
* 获取字符 下标 * 获取字符 下标
* @param c *
* @return * @param c 字符
* @return 下标
*/ */
protected int getIndex(char c){ protected int getIndex(final char c) {
Integer i = charIndex.get(c); final Integer i = charIndexMap.get(c);
if(i == null){ if (i == null) {
return -1; return -1;
} }
return i; return i;
} }
public static MultiStrFinder create(Collection<String> source){
return new MultiStrFinder(source);
}
/** /**
* AC 自动机节点 * AC 自动机节点
*/ */
@@ -189,19 +191,21 @@ public class MultiStrFinder {
// fail指针来源 // fail指针来源
public List<Node> failPre = new ArrayList<>(); public List<Node> failPre = new ArrayList<>();
public Node(){} public Node() {
}
/** /**
* 新增子节点 * 新增子节点
*
* @param c 字符 * @param c 字符
* @param nodeIndex 节点编号 * @param nodeIndex 节点编号
* @param charIndex 字符索引 * @param charIndex 字符索引
* @return 如果已经存在子节点 false 新增 true * @return 如果已经存在子节点 false 新增 true
*/ */
public boolean addValue(char c, int nodeIndex ,Map<Character,Integer> charIndex){ public boolean addValue(final char c, final int nodeIndex, final Map<Character, Integer> charIndex) {
Integer index = charIndex.get(c); final Integer index = charIndex.get(c);
Node node = directRouter[index]; Node node = directRouter[index];
if(node != null){ if (node != null) {
return false; return false;
} }
node = new Node(); node = new Node();
@@ -214,22 +218,24 @@ public class MultiStrFinder {
/** /**
* 标记当前节点为 字符串尾节点 * 标记当前节点为 字符串尾节点
*
* @param string * @param string
*/ */
public void setEnd(String string){ public void setEnd(final String string) {
tagetString = string; tagetString = string;
isEnd = true; isEnd = true;
} }
/** /**
* 获取下一跳 * 获取下一跳
*
* @param c 字符 * @param c 字符
* @param charIndex 字符索引 * @param charIndex 字符索引
* @return * @return
*/ */
public Node getNext(char c,Map<Character,Integer> charIndex){ public Node getNext(final char c, final Map<Character, Integer> charIndex) {
Integer index = charIndex.get(c); final Integer index = charIndex.get(c);
if(index == null){ if (index == null) {
return null; return null;
} }
return directRouter[index]; return directRouter[index];
@@ -237,11 +243,12 @@ public class MultiStrFinder {
/** /**
* 构建根节点 * 构建根节点
*
* @param allCharSize 全部字符数量 * @param allCharSize 全部字符数量
* @return * @return
*/ */
public static Node createRoot(int allCharSize){ public static Node createRoot(final int allCharSize) {
Node node = new Node(); final Node node = new Node();
node.nodeIndex = 0; node.nodeIndex = 0;
node.fail = node; node.fail = node;
node.directRouter = new Node[allCharSize]; node.directRouter = new Node[allCharSize];

View File

@@ -14,9 +14,9 @@ import java.util.*;
* 3、"abc", "bc" 会优先替换"abc" * 3、"abc", "bc" 会优先替换"abc"
* *
* @author newshiJ * @author newshiJ
* @date 2024/8/2 下午3:41
*/ */
public class HighMultiReplacerV2 extends StrReplacer { public class HighMultiReplacerV2 extends StrReplacer {
private static final long serialVersionUID = 1L;
private final AhoCorasickAutomaton ahoCorasickAutomaton; private final AhoCorasickAutomaton ahoCorasickAutomaton;
@@ -49,19 +49,17 @@ public class HighMultiReplacerV2 extends StrReplacer {
protected static class AhoCorasickAutomaton extends MultiStrFinder{ protected static class AhoCorasickAutomaton extends MultiStrFinder{
protected final Map<String,String> replaceMap; protected final Map<String,String> replaceMap;
public AhoCorasickAutomaton(Map<String,String> replaceMap){ public AhoCorasickAutomaton(final Map<String,String> replaceMap){
super(replaceMap.keySet()); super(replaceMap.keySet());
this.replaceMap = replaceMap; this.replaceMap = replaceMap;
} }
public void replace(final CharSequence text, final StringBuilder stringBuilder){ public void replace(final CharSequence text, final StringBuilder stringBuilder){
Node currentNode = root; Node currentNode = root;
// 临时字符串存储空间 // 临时字符串存储空间
StringBuilder temp = new StringBuilder(); final StringBuilder temp = new StringBuilder();
for (int i = 0; i < text.length(); i++) { for (int i = 0; i < text.length(); i++) {
char ch = text.charAt(i); final char ch = text.charAt(i);
Integer index = charIndex.get(ch); final Integer index = charIndexMap.get(ch);
// 下一个字符在候选转换字符串中都不存在 ch字符一定不会被替换 // 下一个字符在候选转换字符串中都不存在 ch字符一定不会被替换
if(index < 0){ if(index < 0){
// 临时缓存空间中的数据写入到输出的 StringBuilder // 临时缓存空间中的数据写入到输出的 StringBuilder
@@ -94,7 +92,7 @@ public class HighMultiReplacerV2 extends StrReplacer {
// 表示匹配到 现在进行字符串替换工作 // 表示匹配到 现在进行字符串替换工作
if(currentNode.isEnd){ if(currentNode.isEnd){
int length = currentNode.tagetString.length(); final int length = currentNode.tagetString.length();
// 先清理匹配到的字符 最后一个字符未加入临时空间 // 先清理匹配到的字符 最后一个字符未加入临时空间
temp.delete(temp.length() - length + 1,length - 1); temp.delete(temp.length() - length + 1,length - 1);
if(temp.length() > 0){ if(temp.length() > 0){