/*
 * Copyright (c) 2023 looly(loolly@aliyun.com)
 * Hutool is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *          https://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

package org.dromara.hutool.core.text.replacer;

import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;

/**
 * Multi-keyword replacer: finds every configured keyword in a text and replaces it with
 * the value mapped to it.
 * <p>
 * The search is driven by an Aho-Corasick automaton, so the text is scanned once no matter
 * how many keywords are configured.
 * <p>
 * Overlapping keywords: the keyword whose match is completed first while scanning from left
 * to right is the one that gets replaced, and matching restarts after it.
 * <ol>
 *     <li>keys "abc" and "ab": "ab" is replaced (it completes first)</li>
 *     <li>keys "abed" and "be": "abed" is replaced when the text contains "abed"</li>
 *     <li>keys "abc" and "bc": "abc" is replaced</li>
 * </ol>
 * A keyword that sits inside a longer candidate (rule 2) is still replaced when the longer
 * candidate does not complete, e.g. keys "abed"/"be" on the text "abef" replace "be".
 *
 * @author kyao
 */
public class HighMultiReplacer extends StrReplacer {

	private final AhoCorasickAutomaton ahoCorasickAutomaton;

	/**
	 * Constructor.
	 *
	 * @param map keyword to replacement mapping; {@code null} and empty keys are ignored.
	 *            The map is read again on every replacement, it is not copied.
	 * @throws NullPointerException if {@code map} is {@code null}
	 */
	public HighMultiReplacer(final Map<String, ?> map) {
		ahoCorasickAutomaton = new AhoCorasickAutomaton(map);
	}

	/**
	 * Creates a {@code HighMultiReplacer}.
	 *
	 * @param map keyword to replacement mapping; {@code null} and empty keys are ignored
	 * @return a new replacer
	 * @throws NullPointerException if {@code map} is {@code null}
	 */
	public static HighMultiReplacer of(final Map<String, ?> map) {
		return new HighMultiReplacer(map);
	}

	/**
	 * Replaces all keywords of {@code str} in a single pass and appends the result to {@code out}.
	 * <p>
	 * Note: {@code pos} is ignored by this implementation; the whole of {@code str} is always
	 * processed and its full length is returned.
	 *
	 * @param str text to process
	 * @param pos ignored
	 * @param out receives the replaced text
	 * @return {@code str.length()}
	 */
	@Override
	protected int replace(final CharSequence str, final int pos, final StringBuilder out) {
		ahoCorasickAutomaton.replace(str, out);
		return str.length();
	}

	/**
	 * Returns {@code str} with every keyword replaced.
	 *
	 * @param str text to process
	 * @return the replaced text (a {@link StringBuilder})
	 */
	@Override
	public CharSequence apply(final CharSequence str) {
		final StringBuilder builder = new StringBuilder(str.length());
		replace(str, 0, builder);
		return builder;
	}

	/**
	 * Aho-Corasick automaton built from the keys of the replacement map.
	 */
	private static class AhoCorasickAutomaton {

		/** Root of the trie; it represents the empty prefix and carries no character. */
		private final Node root;

		/** Keyword to replacement mapping; values are looked up when a keyword is matched. */
		private final Map<String, ?> target;

		/**
		 * @param target keyword to replacement mapping
		 */
		public AhoCorasickAutomaton(final Map<String, ?> target) {
			root = new Node();
			this.target = target;
			buildTrieTree();
			buildAcFromTrie();
		}

		/**
		 * A state of the automaton. The character leading to a node is stored as the key in
		 * the parent's {@link #children} map, not in the node itself.
		 */
		private static class Node {

			/** The keyword spelled by the path from the root to this node, or {@code null} if that path is not a keyword. */
			String str;

			/** Child states keyed by the next character. */
			Map<Character, Node> children = new HashMap<>();

			/** State to continue from when no child matches the next character; {@code null} only for the root. */
			Node fail;

			public boolean isWord() {
				return str != null;
			}

		}

		/**
		 * Inserts every keyword into the trie.
		 */
		private void buildTrieTree() {
			for (final String targetStr : target.keySet()) {
				// An empty key would turn the root into a "word" that matches at every position.
				if (targetStr == null || targetStr.isEmpty()) {
					continue;
				}
				Node curr = root;
				for (int i = 0; i < targetStr.length(); i++) {
					final char ch = targetStr.charAt(i);
					Node node = curr.children.get(ch);
					if (node == null) {
						node = new Node();
						curr.children.put(ch, node);
					}
					curr = node;
				}
				// The node reached by the last character marks the end of the keyword.
				curr.str = targetStr;
			}
		}

		/**
		 * Computes the fail link of every node by a breadth-first walk over the trie
		 * (the counterpart of the "next" table in KMP).
		 */
		private void buildAcFromTrie() {
			final LinkedList<Node> queue = new LinkedList<>();

			// Depth-1 nodes always fall back to the root.
			for (final Node child : root.children.values()) {
				child.fail = root;
				queue.addLast(child);
			}

			while (!queue.isEmpty()) {
				final Node parent = queue.removeFirst();
				for (final Map.Entry<Character, Node> entry : parent.children.entrySet()) {
					final Character key = entry.getKey();
					final Node child = entry.getValue();
					queue.addLast(child);

					// Follow the parent's fail chain until a state with a transition on the same character is found.
					Node failTo = parent.fail;
					while (failTo != null && !failTo.children.containsKey(key)) {
						failTo = failTo.fail;
					}
					// Chain exhausted (walked past the root): no proper suffix is a prefix of any keyword.
					child.fail = (failTo == null) ? root : failTo.children.get(key);
				}
			}
		}

		/**
		 * Scans {@code text} once, appending it to {@code stringBuilder} with every matched
		 * keyword replaced by its mapped value.
		 *
		 * @param text          text to scan
		 * @param stringBuilder receives the result
		 */
		public void replace(final CharSequence text, final StringBuilder stringBuilder) {
			final int length = text.length();
			Node curr = root;
			int i = 0;
			while (i < length) {
				final char ch = text.charAt(i);
				final Node next = curr.children.get(ch);
				if (next != null) {
					// Transition exists: emit the character literally for now and advance.
					stringBuilder.append(ch);
					i++;
					curr = next.isWord() ? replaceMatched(next, stringBuilder) : next;
				} else if (curr == root) {
					// No keyword starts with this character.
					stringBuilder.append(ch);
					i++;
				} else {
					// Mismatch inside a candidate: fall back without consuming the character.
					curr = curr.fail;
					// The fallback state may itself be a complete keyword that was embedded
					// in the abandoned candidate; replace it now instead of losing it.
					if (curr.isWord()) {
						curr = replaceMatched(curr, stringBuilder);
					}
				}
			}

			// End of input is a mismatch for whatever candidate is still open:
			// replace the longest keyword that is a suffix of it, if any.
			for (Node node = curr; node != null && node != root; node = node.fail) {
				if (node.isWord()) {
					replaceMatched(node, stringBuilder);
					break;
				}
			}
		}

		/**
		 * Replaces the keyword of {@code wordNode}, whose characters are the tail of
		 * {@code out}, with its mapped value.
		 *
		 * @param wordNode node marking a complete keyword
		 * @param out      output buffer ending with that keyword
		 * @return the root, from which matching restarts
		 */
		private Node replaceMatched(final Node wordNode, final StringBuilder out) {
			final int end = out.length();
			out.delete(end - wordNode.str.length(), end);
			out.append(target.get(wordNode.str));
			return root;
		}

	}
}
package org.dromara.hutool.core.text.replacer;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import java.util.HashMap;
import java.util.Map;

/**
 * Tests for {@link HighMultiReplacer}.
 */
public class HighMultiReplacerTest {

	/**
	 * Runs one shared replacer over texts with plain, overlapping, partial and absent keywords.
	 */
	@Test
	public void replaceTest() {
		final Map<String, String> replacements = new HashMap<>();
		replacements.put("abc", "bar");
		replacements.put("123", "1234");
		replacements.put("12", "哈哈哈");
		replacements.put("bc", "67868");
		replacements.put("你好", "hello world");
		replacements.put("AAACC", "%^$%$");
		replacements.put("_啊", "qqqq");

		final HighMultiReplacer replacer = HighMultiReplacer.of(replacements);

		assertReplaced(replacer, "bar哈哈哈hello world4", "abc12你好4");
		assertReplaced(replacer, "qqqq啊qqqq-啊", "_啊啊_啊-啊");
		// "12" completes before "123", so only "12" is replaced
		assertReplaced(replacer, "哈哈哈3456789", "123456789");
		// "AAAC" is an unfinished "AAACC" and stays as is; "bc" is then matched
		assertReplaced(replacer, "AAAC67868", "AAACbc");
		assertReplaced(replacer, "哈哈哈哈哈1哈哈", "哈哈121哈哈");
		assertReplaced(replacer, "你hello world好%^$%$CACAC", "你你好好AAACCCACAC");
		assertReplaced(replacer, "哈哈哈3", "123");
		// no keyword present
		assertReplaced(replacer, "---11", "---11");
		// empty input
		assertReplaced(replacer, "", "");
	}

	/**
	 * Asserts that applying {@code replacer} to {@code input} yields {@code expected}.
	 */
	private static void assertReplaced(final HighMultiReplacer replacer, final String expected, final String input) {
		final String actual = replacer.apply(input).toString();
		Assertions.assertEquals(expected, actual);
	}
}