fix: 避免调用方显式调用 API 触发查找树优化;并通过内置锁,避免因并行树优化可能造成的不可预知结果和无效重复的树优化操作

This commit is contained in:
renyp 2023-03-16 20:40:36 +08:00
parent 6cd998f648
commit 00e9af4ffb
2 changed files with 164 additions and 128 deletions

View File

@ -4,141 +4,177 @@ import java.util.*;
/** /**
* <p> * <p>
*
* 基于非确定性有穷自动机NFA 实现的多模匹配工具 * 基于非确定性有穷自动机NFA 实现的多模匹配工具
* </p>
* *
* @author renyp * @author renyp
*/ */
public class Automaton { public class Automaton {
private final Node root; /**
* AC树的根节点
*/
private final Node root;
/**
* 标记是否需要构建AC自动机做树优化
*/
private volatile boolean needBuildAC;
/** /**
* 默认构造 * 内置锁防止并发场景并行建AC树造成不可预知结果
*/ */
public Automaton() { private final Object lock;
this.root = new Node();
}
/** /**
* 构造函数 初始化词库 * 默认构造
* */
* @param words 添加的新词 public Automaton() {
*/ this.root = new Node();
public Automaton(String... words) { this.needBuildAC = true;
this(); this.lock = new Object();
this.insert(words); }
}
/** /**
* 词库添加新词初始化查找树 * 构造函数 初始化词库
* *
* @param word 添加的新词 * @param words 添加的新词
*/ */
public void insert(String word) { public Automaton(String... words) {
Node p = root; this();
for (char curr : word.toCharArray()) { this.insert(words);
int ind = curr; }
if (p.next.get(ind) == null) {
p.next.put(ind, new Node());
}
p = p.next.get(ind);
}
p.flag = true;
p.str = word;
}
/** /**
* 词库批量添加新词初始化查找树 * 词库添加新词初始化查找树
* *
* @param words 添加的新词 * @param word 添加的新词
*/ */
public void insert(String... words) { public void insert(String word) {
for (String word : words) { needBuildAC = true;
this.insert(word); Node p = root;
} for (char curr : word.toCharArray()) {
} int ind = curr;
if (p.next.get(ind) == null) {
p.next.put(ind, new Node());
}
p = p.next.get(ind);
}
p.flag = true;
p.str = word;
}
/** /**
* 构建基于NFA模型的 AC自动机 * 词库批量添加新词初始化查找树
*/ *
public void buildAc() { * @param words 添加的新词
Queue<Node> queue = new LinkedList<>(); */
Node p = root; public void insert(String... words) {
for (Integer key : p.next.keySet()) { for (String word : words) {
p.next.get(key).fail = root; this.insert(word);
queue.offer(p.next.get(key)); }
} }
while (!queue.isEmpty()) {
Node curr = queue.poll();
for (Integer key : curr.next.keySet()) {
Node fail = curr.fail;
// 查找当前节点匹配失败他对应等效匹配的节点是哪个
while (fail != null && fail.next.get(key) == null) {
fail = fail.fail;
}
// 代码到这有两种可能fail不为null说明找到了failfail为null没有找到那么就把fail指向root节点当到该节点匹配失败那么从root节点开始重新匹配
if (fail != null) {
fail = fail.next.get(key);
} else {
fail = root;
}
curr.next.get(key).fail = fail;
queue.offer(curr.next.get(key));
}
}
}
/** /**
* @param text 查询的文本母串 * 构建基于NFA模型的 AC自动机
*/ */
public List<FoundWord> find(String text) { private void buildAc() {
return this.find(text, true); Queue<Node> queue = new LinkedList<>();
} Node p = root;
for (Integer key : p.next.keySet()) {
p.next.get(key).fail = root;
queue.offer(p.next.get(key));
}
while (!queue.isEmpty()) {
Node curr = queue.poll();
for (Integer key : curr.next.keySet()) {
Node fail = curr.fail;
// 查找当前节点匹配失败他对应等效匹配的节点是哪个
while (fail != null && fail.next.get(key) == null) {
fail = fail.fail;
}
// 代码到这有两种可能fail不为null说明找到了failfail为null没有找到那么就把fail指向root节点当到该节点匹配失败那么从root节点开始重新匹配
if (fail != null) {
fail = fail.next.get(key);
} else {
fail = root;
}
curr.next.get(key).fail = fail;
queue.offer(curr.next.get(key));
}
}
needBuildAC = false;
}
/** /**
* @param text 查找的文本母串 * @param text 查询的文本母串
* @param isDensityMatch 是否密集匹配 */
*/ public List<FoundWord> find(String text) {
public List<FoundWord> find(String text, boolean isDensityMatch) { return this.find(text, true);
List<FoundWord> ans = new ArrayList<>(); }
Node p = root, k = null;
for (int i = 0, len = text.length(); i < len; i++) {
int ind = text.charAt(i);
// 状态转移(沿着fail指针链接的链表此处区别于DFA模型)
while (p != null && p.next.get(ind) == null) {
p = p.fail;
}
if (p == null) {
p = root;
} else {
p = p.next.get(ind);
}
// 提取结果(沿着fail指针链接的链表此处区别于DFA模型)
k = p;
while (k != null) {
if (k.flag) {
ans.add(new FoundWord(k.str, k.str, i - k.str.length() + 1, i));
if (!isDensityMatch) {
p = root;
break;
}
}
k = k.fail;
}
}
return ans;
}
private static class Node { /**
* @param text 查找的文本母串
* @param isDensityMatch 是否密集匹配
*/
public List<FoundWord> find(String text, boolean isDensityMatch) {
// double check防止重复无用的 buildAC
if (needBuildAC) {
synchronized (lock) {
if (needBuildAC) {
this.buildAc();
}
}
}
List<FoundWord> ans = new ArrayList<>();
Node p = root, k = null;
for (int i = 0, len = text.length(); i < len; i++) {
int ind = text.charAt(i);
// 状态转移(沿着fail指针链接的链表此处区别于DFA模型)
while (p != null && p.next.get(ind) == null) {
p = p.fail;
}
if (p == null) {
p = root;
} else {
p = p.next.get(ind);
}
// 提取结果(沿着fail指针链接的链表此处区别于DFA模型)
k = p;
while (k != null) {
if (k.flag) {
ans.add(new FoundWord(k.str, k.str, i - k.str.length() + 1, i));
if (!isDensityMatch) {
p = root;
break;
}
}
k = k.fail;
}
}
return ans;
}
boolean flag; private static class Node {
Node fail;
String str;
Map<Integer, Node> next;
public Node() { /**
this.flag = false; * 当前节点是否是一个单词的结尾
next = new HashMap<>(); */
} boolean flag;
} /**
* 指向 当前节点匹配失败应该跳转的下个节点
*/
Node fail;
/**
* 以当前节点结尾的单词
*/
String str;
/**
* 当前节点的子节点
*/
Map<Integer, Node> next;
public Node() {
this.flag = false;
next = new HashMap<>();
}
}
} }

View File

@ -17,7 +17,7 @@ public class AutomatonTest extends TestCase {
Automaton automaton = new Automaton(); Automaton automaton = new Automaton();
WordTree wordTree = new WordTree(); WordTree wordTree = new WordTree();
automaton.insert("say", "her", "he", "she", "shr"); automaton.insert("say", "her", "he", "she", "shr");
automaton.buildAc(); // automaton.buildAc();
wordTree.addWords("say", "her", "he", "she", "shr"); wordTree.addWords("say", "her", "he", "she", "shr");
StopWatch stopWatch = new StopWatch(); StopWatch stopWatch = new StopWatch();
@ -51,7 +51,7 @@ public class AutomatonTest extends TestCase {
Automaton automaton = new Automaton(); Automaton automaton = new Automaton();
WordTree wordTree = new WordTree(); WordTree wordTree = new WordTree();
automaton.insert("say", "her", "he", "she", "shr"); automaton.insert("say", "her", "he", "she", "shr");
automaton.buildAc(); // automaton.buildAc();
wordTree.addWords("say", "her", "he", "she", "shr"); wordTree.addWords("say", "her", "he", "she", "shr");
StopWatch stopWatch = new StopWatch(); StopWatch stopWatch = new StopWatch();
@ -84,7 +84,7 @@ public class AutomatonTest extends TestCase {
stopWatch.start("automaton_char_buid_find"); stopWatch.start("automaton_char_buid_find");
Automaton automatonLocal = new Automaton(); Automaton automatonLocal = new Automaton();
automatonLocal.insert("say", "her", "he", "she", "shr"); automatonLocal.insert("say", "her", "he", "she", "shr");
automatonLocal.buildAc(); // automatonLocal.buildAc();
List<FoundWord> ans1 = automatonLocal.find(input); List<FoundWord> ans1 = automatonLocal.find(input);
stopWatch.stop(); stopWatch.stop();
assertEquals("she,he,her,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); assertEquals("she,he,her,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
@ -118,7 +118,7 @@ public class AutomatonTest extends TestCase {
stopWatch.start("automaton_cn_build_find"); stopWatch.start("automaton_cn_build_find");
Automaton automatonLocal = new Automaton(); Automaton automatonLocal = new Automaton();
automatonLocal.insert("", "赵啊", "赵啊三"); automatonLocal.insert("", "赵啊", "赵啊三");
automatonLocal.buildAc(); // automatonLocal.buildAc();
final List<FoundWord> result = automatonLocal.find(input); final List<FoundWord> result = automatonLocal.find(input);
stopWatch.stop(); stopWatch.stop();
@ -156,7 +156,7 @@ public class AutomatonTest extends TestCase {
Automaton automatonLocal = new Automaton(); Automaton automatonLocal = new Automaton();
automatonLocal.insert("", "赵啊", "赵啊三"); automatonLocal.insert("", "赵啊", "赵啊三");
automatonLocal.buildAc(); // automatonLocal.buildAc();
stopWatch.start("automaton_cn_find"); stopWatch.start("automaton_cn_find");
final List<FoundWord> result = automatonLocal.find(input); final List<FoundWord> result = automatonLocal.find(input);
@ -196,7 +196,7 @@ public class AutomatonTest extends TestCase {
Automaton automatonLocal = new Automaton(); Automaton automatonLocal = new Automaton();
automatonLocal.insert("", "赵啊", "赵啊三"); automatonLocal.insert("", "赵啊", "赵啊三");
automatonLocal.buildAc(); // automatonLocal.buildAc();
stopWatch.start("automaton_cn_find_not_density"); stopWatch.start("automaton_cn_find_not_density");
final List<FoundWord> result = automatonLocal.find(input, false); final List<FoundWord> result = automatonLocal.find(input, false);