mirror of
https://gitee.com/dromara/hutool.git
synced 2025-04-30 04:45:55 +08:00
add mynlp
This commit is contained in:
parent
1caf64f4ac
commit
bea37293ad
@ -11,6 +11,7 @@
|
||||
* 【extra】 Sftp的put方法增加进度支持(issue#518@Github)
|
||||
* 【core】 ArrayUtil增加distinct方法
|
||||
* 【http】 去除log模块依赖,Cookie中去除日志提示,body方法传入JSON对象废弃,未来移除json模块依赖
|
||||
* 【extra】 添加MyNLP支持(issue#519@Github)
|
||||
|
||||
### Bug修复
|
||||
|
||||
|
@ -200,5 +200,11 @@
|
||||
<version>1.2</version>
|
||||
<optional>true</optional>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.mayabot.mynlp</groupId>
|
||||
<artifactId>mynlp-segment</artifactId>
|
||||
<version>3.0.0</version>
|
||||
<optional>true</optional>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
|
@ -10,6 +10,7 @@ import cn.hutool.extra.tokenizer.engine.ikanalyzer.IKAnalyzerEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.jcseg.JcsegEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.mmseg.MmsegEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.mynlp.MynlpEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.word.WordEngine;
|
||||
import cn.hutool.log.StaticLog;
|
||||
|
||||
@ -77,6 +78,11 @@ public class TokenizerFactory {
|
||||
} catch (NoClassDefFoundError e) {
|
||||
// ignore
|
||||
}
|
||||
try {
|
||||
return new MynlpEngine();
|
||||
} catch (NoClassDefFoundError e) {
|
||||
// ignore
|
||||
}
|
||||
throw new TokenizerException("No tokenizer found ! Please add some tokenizer jar to your project !");
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,44 @@
|
||||
package cn.hutool.extra.tokenizer.engine.mynlp;
|
||||
|
||||
import com.mayabot.nlp.segment.Lexer;
|
||||
import com.mayabot.nlp.segment.Lexers;
|
||||
import com.mayabot.nlp.segment.Sentence;
|
||||
|
||||
import cn.hutool.core.util.StrUtil;
|
||||
import cn.hutool.extra.tokenizer.Result;
|
||||
import cn.hutool.extra.tokenizer.TokenizerEngine;
|
||||
|
||||
/**
|
||||
* Tokenizer engine implementation backed by MYNLP, a Chinese NLP toolkit.<br>
|
||||
* Project home: https://github.com/mayabot/mynlp/
|
||||
*
|
||||
* @author looly
|
||||
*
|
||||
*/
|
||||
public class MynlpEngine implements TokenizerEngine {
|
||||
|
||||
/** The mynlp lexer that every call to parse delegates to. */
private Lexer lexer;
|
||||
|
||||
/**
|
||||
* Constructor. Uses the lexer returned by {@code Lexers.core()}.
|
||||
*/
|
||||
public MynlpEngine() {
|
||||
this.lexer = Lexers.core();
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param lexer the tokenizer interface {@link Lexer} to use; stored as-is, no null check is performed
|
||||
*/
|
||||
public MynlpEngine(Lexer lexer) {
|
||||
this.lexer = lexer;
|
||||
}
|
||||
|
||||
/** Converts the text to a String via StrUtil.str, scans it with the lexer and wraps the resulting Sentence in a MynlpResult. */
@Override
|
||||
public Result parse(CharSequence text) {
|
||||
final Sentence sentence = this.lexer.scan(StrUtil.str(text));
|
||||
return new MynlpResult(sentence);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,50 @@
|
||||
package cn.hutool.extra.tokenizer.engine.mynlp;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
import com.mayabot.nlp.segment.Sentence;
|
||||
import com.mayabot.nlp.segment.WordTerm;
|
||||
|
||||
import cn.hutool.extra.tokenizer.Result;
|
||||
import cn.hutool.extra.tokenizer.Word;
|
||||
|
||||
/**
|
||||
* Tokenization result implementation for MYNLP, a Chinese NLP toolkit.<br>
|
||||
* Project home: https://github.com/mayabot/mynlp/
|
||||
*
|
||||
* @author looly
|
||||
*
|
||||
*/
|
||||
public class MynlpResult implements Result {
|
||||
|
||||
/** Iterator over the terms of the wrapped sentence; obtained once in the constructor. */
private Iterator<WordTerm> result;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param sentence the tokenization result (a Chinese sentence) whose iterator is wrapped
|
||||
*/
|
||||
public MynlpResult(Sentence sentence) {
|
||||
this.result = sentence.iterator();
|
||||
}
|
||||
|
||||
/** Delegates to the underlying term iterator. */
@Override
|
||||
public boolean hasNext() {
|
||||
return result.hasNext();
|
||||
}
|
||||
|
||||
/** Takes the next term from the underlying iterator and wraps it in a new MynlpWord. */
@Override
|
||||
public Word next() {
|
||||
return new MynlpWord(result.next());
|
||||
}
|
||||
|
||||
/** Delegates to the underlying iterator's remove(); whether removal is supported depends on that iterator. */
@Override
|
||||
public void remove() {
|
||||
result.remove();
|
||||
}
|
||||
|
||||
/** Returns this instance itself, so iteration state is shared and the result can be traversed only once. */
@Override
|
||||
public Iterator<Word> iterator() {
|
||||
return this;
|
||||
}
|
||||
}
|
@ -0,0 +1,45 @@
|
||||
package cn.hutool.extra.tokenizer.engine.mynlp;
|
||||
|
||||
import com.mayabot.nlp.segment.WordTerm;
|
||||
|
||||
import cn.hutool.extra.tokenizer.Word;
|
||||
|
||||
/**
|
||||
* Wrapper around a single word (a {@link WordTerm}) produced by mynlp tokenization.
|
||||
*
|
||||
* @author looly
|
||||
*
|
||||
*/
|
||||
public class MynlpWord implements Word {
|
||||
|
||||
/** The wrapped mynlp term. */
private WordTerm word;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param word the {@link WordTerm} to wrap
|
||||
*/
|
||||
public MynlpWord(WordTerm word) {
|
||||
this.word = word;
|
||||
}
|
||||
|
||||
/** Returns the term text as reported by WordTerm.getWord(). */
@Override
|
||||
public String getText() {
|
||||
return word.getWord();
|
||||
}
|
||||
|
||||
/** Returns the term's offset field. */
@Override
|
||||
public int getStartOffset() {
|
||||
return this.word.offset;
|
||||
}
|
||||
|
||||
/** Returns the start offset plus the length of the term's word field. */
@Override
|
||||
public int getEndOffset() {
|
||||
return getStartOffset() + word.word.length();
|
||||
}
|
||||
|
||||
/** Same as getText(). */
@Override
|
||||
public String toString() {
|
||||
return getText();
|
||||
}
|
||||
}
|
@ -0,0 +1,8 @@
|
||||
/**
|
||||
* MYNLP 中文NLP工具包分词实现<br>
|
||||
* 项目地址:https://github.com/mayabot/mynlp/
|
||||
*
|
||||
* @author Looly
|
||||
* @since 4.6.5
|
||||
*/
|
||||
package cn.hutool.extra.tokenizer.engine.mynlp;
|
@ -3,6 +3,7 @@ package cn.hutool.extra.tokenizer;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import cn.hutool.core.collection.CollUtil;
|
||||
@ -12,6 +13,7 @@ import cn.hutool.extra.tokenizer.engine.ikanalyzer.IKAnalyzerEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.jcseg.JcsegEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.mmseg.MmsegEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.mynlp.MynlpEngine;
|
||||
import cn.hutool.extra.tokenizer.engine.word.WordEngine;
|
||||
|
||||
/**
|
||||
@ -86,6 +88,16 @@ public class TokenizerUtilTest {
|
||||
Assert.assertEquals("这两个 方法 的 区别 在于 返回值", resultStr);
|
||||
}
|
||||
|
||||
/** Parses the shared sample text with a default MynlpEngine and checks the space-joined tokens. */
@Test
|
||||
@Ignore
|
||||
public void mynlpTest() {
|
||||
// This unit test requires JDK 8, so it is ignored by default
|
||||
TokenizerEngine engine = new MynlpEngine();
|
||||
Result result = engine.parse(text);
|
||||
String resultStr = CollUtil.join((Iterator<Word>)result, " ");
|
||||
Assert.assertEquals("这 两个 方法 的 区别 在于 返回 值", resultStr);
|
||||
}
|
||||
|
||||
private void checkResult(Result result) {
|
||||
String resultStr = CollUtil.join((Iterator<Word>)result, " ");
|
||||
Assert.assertEquals("这 两个 方法 的 区别 在于 返回 值", resultStr);
|
||||
|
Loading…
Reference in New Issue
Block a user