add mynlp

This commit is contained in:
Looly 2019-09-03 19:18:39 +08:00
parent 1caf64f4ac
commit bea37293ad
9 changed files with 173 additions and 0 deletions

View File

@ -11,6 +11,7 @@
* 【extra】 Sftp的put方法增加进度支持（issue#518@Github）
* 【core】 ArrayUtil增加distinct方法
* 【http】 去除log模块依赖，Cookie中去除日志提示，body方法传入JSON对象废弃，未来移除json模块依赖
* 【extra】 添加MyNLP支持（issue#519@Github）
### Bug修复

View File

@ -200,5 +200,11 @@
<version>1.2</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>com.mayabot.mynlp</groupId>
<artifactId>mynlp-segment</artifactId>
<version>3.0.0</version>
<optional>true</optional>
</dependency>
</dependencies>
</project>

View File

@ -10,6 +10,7 @@ import cn.hutool.extra.tokenizer.engine.ikanalyzer.IKAnalyzerEngine;
import cn.hutool.extra.tokenizer.engine.jcseg.JcsegEngine;
import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine;
import cn.hutool.extra.tokenizer.engine.mmseg.MmsegEngine;
import cn.hutool.extra.tokenizer.engine.mynlp.MynlpEngine;
import cn.hutool.extra.tokenizer.engine.word.WordEngine;
import cn.hutool.log.StaticLog;
@ -77,6 +78,11 @@ public class TokenizerFactory {
} catch (NoClassDefFoundError e) {
// ignore
}
try {
return new MynlpEngine();
} catch (NoClassDefFoundError e) {
// ignore
}
throw new TokenizerException("No tokenizer found ! Please add some tokenizer jar to your project !");
}
}

View File

@ -0,0 +1,44 @@
package cn.hutool.extra.tokenizer.engine.mynlp;
import com.mayabot.nlp.segment.Lexer;
import com.mayabot.nlp.segment.Lexers;
import com.mayabot.nlp.segment.Sentence;
import cn.hutool.core.util.StrUtil;
import cn.hutool.extra.tokenizer.Result;
import cn.hutool.extra.tokenizer.TokenizerEngine;
/**
 * Tokenizer engine backed by the MYNLP Chinese NLP toolkit.<br>
 * Project home: https://github.com/mayabot/mynlp/
 *
 * @author looly
 */
public class MynlpEngine implements TokenizerEngine {

    /** MYNLP lexer that every {@link #parse(CharSequence)} call delegates to. */
    private final Lexer lexer;

    /**
     * Creates an engine that uses the lexer returned by {@link Lexers#core()}.
     */
    public MynlpEngine() {
        this(Lexers.core());
    }

    /**
     * Creates an engine around the supplied lexer.
     *
     * @param lexer the {@link Lexer} used to scan text
     */
    public MynlpEngine(Lexer lexer) {
        this.lexer = lexer;
    }

    /**
     * Scans the given text with the configured lexer and wraps the outcome
     * in a {@link MynlpResult}.
     *
     * @param text the text to tokenize; converted with {@code StrUtil.str} before scanning
     * @return the tokenization result wrapping the scanned sentence
     */
    @Override
    public Result parse(CharSequence text) {
        final String content = StrUtil.str(text);
        final Sentence scanned = this.lexer.scan(content);
        return new MynlpResult(scanned);
    }
}

View File

@ -0,0 +1,50 @@
package cn.hutool.extra.tokenizer.engine.mynlp;
import java.util.Iterator;
import com.mayabot.nlp.segment.Sentence;
import com.mayabot.nlp.segment.WordTerm;
import cn.hutool.extra.tokenizer.Result;
import cn.hutool.extra.tokenizer.Word;
/**
 * Tokenization result for the MYNLP Chinese NLP toolkit.<br>
 * Project home: https://github.com/mayabot/mynlp/
 *
 * @author looly
 */
public class MynlpResult implements Result {

    /** Iterator over the terms of the wrapped sentence. */
    private final Iterator<WordTerm> terms;

    /**
     * Wraps the terms of a scanned sentence.
     *
     * @param sentence the {@link Sentence} produced by the lexer
     */
    public MynlpResult(Sentence sentence) {
        this.terms = sentence.iterator();
    }

    /**
     * @return {@code true} while the underlying term iterator has more elements
     */
    @Override
    public boolean hasNext() {
        return this.terms.hasNext();
    }

    /**
     * @return the next term, wrapped as a {@link MynlpWord}
     */
    @Override
    public Word next() {
        final WordTerm term = this.terms.next();
        return new MynlpWord(term);
    }

    /**
     * Delegates removal to the underlying term iterator.
     */
    @Override
    public void remove() {
        this.terms.remove();
    }

    /**
     * @return this same object; no new iterator is created
     */
    @Override
    public Iterator<Word> iterator() {
        return this;
    }
}

View File

@ -0,0 +1,45 @@
package cn.hutool.extra.tokenizer.engine.mynlp;
import com.mayabot.nlp.segment.WordTerm;
import cn.hutool.extra.tokenizer.Word;
/**
 * Wrapper around a single term produced by MYNLP segmentation.
 *
 * @author looly
 */
public class MynlpWord implements Word {

    /** The wrapped MYNLP term. */
    private final WordTerm term;

    /**
     * Wraps the given term.
     *
     * @param word the {@link WordTerm} to wrap
     */
    public MynlpWord(WordTerm word) {
        this.term = word;
    }

    /**
     * @return the text of the wrapped term
     */
    @Override
    public String getText() {
        return this.term.getWord();
    }

    /**
     * @return the {@code offset} value of the wrapped term
     */
    @Override
    public int getStartOffset() {
        return this.term.offset;
    }

    /**
     * @return the start offset plus the length of the term's text
     */
    @Override
    public int getEndOffset() {
        final int start = getStartOffset();
        final int length = this.term.word.length();
        return start + length;
    }

    /**
     * @return the same value as {@link #getText()}
     */
    @Override
    public String toString() {
        return getText();
    }
}

View File

@ -0,0 +1,8 @@
/**
 * Tokenizer implementation backed by the MYNLP Chinese NLP toolkit.<br>
 * Project home: https://github.com/mayabot/mynlp/
 *
 * @author Looly
 * @since 4.6.5
 */
package cn.hutool.extra.tokenizer.engine.mynlp;

View File

@ -3,6 +3,7 @@ package cn.hutool.extra.tokenizer;
import java.util.Iterator;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import cn.hutool.core.collection.CollUtil;
@ -12,6 +13,7 @@ import cn.hutool.extra.tokenizer.engine.ikanalyzer.IKAnalyzerEngine;
import cn.hutool.extra.tokenizer.engine.jcseg.JcsegEngine;
import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine;
import cn.hutool.extra.tokenizer.engine.mmseg.MmsegEngine;
import cn.hutool.extra.tokenizer.engine.mynlp.MynlpEngine;
import cn.hutool.extra.tokenizer.engine.word.WordEngine;
/**
@ -86,6 +88,16 @@ public class TokenizerUtilTest {
Assert.assertEquals("这两个 方法 的 区别 在于 返回值", resultStr);
}
@Test
@Ignore
public void mynlpTest() {
    // Ignored by default: this unit test requires JDK 8
    final Result tokens = new MynlpEngine().parse(text);
    final String joined = CollUtil.join((Iterator<Word>) tokens, " ");
    Assert.assertEquals("这 两个 方法 的 区别 在于 返回 值", joined);
}
private void checkResult(Result result) {
String resultStr = CollUtil.join((Iterator<Word>)result, " ");
Assert.assertEquals("这 两个 方法 的 区别 在于 返回 值", resultStr);

View File

@ -89,6 +89,7 @@
<configuration>
<source>${compile.version}</source>
<target>${compile.version}</target>
<verbose>true</verbose>
</configuration>
</plugin>
<!-- Javadoc -->