mirror of
https://github.com/mindoc-org/mindoc.git
synced 2026-01-24 13:52:26 +08:00
搭建框架
This commit is contained in:
97
vendor/github.com/huichen/wukong/engine/segmenter_worker.go
generated
vendored
Normal file
97
vendor/github.com/huichen/wukong/engine/segmenter_worker.go
generated
vendored
Normal file
@@ -0,0 +1,97 @@
|
||||
package engine
|
||||
|
||||
import (
|
||||
"github.com/huichen/wukong/types"
|
||||
)
|
||||
|
||||
// segmenterRequest carries one document through the segmentation stage.
// Requests are delivered to segmenterWorker via engine.segmenterChannel.
type segmenterRequest struct {
	docId       uint64                  // document identifier; 0 marks a control request with no document (see segmenterWorker)
	hash        uint32                  // hashed key used to select the indexer/ranker shard via engine.getShard
	data        types.DocumentIndexData // content, user-supplied tokens, labels, and ranking fields
	forceUpdate bool                    // when true, a force-update signal is propagated to the indexer shards
}
|
||||
|
||||
// segmenterWorker is a long-running loop (intended to be launched as a
// goroutine) that consumes segmenterRequests from engine.segmenterChannel,
// converts each document into a keyword index, and forwards the result to
// the indexer and ranker shards selected by the request hash.
func (engine *Engine) segmenterWorker() {
	for {
		request := <-engine.segmenterChannel

		// docId == 0 is a control message carrying no document. If it also
		// sets forceUpdate, fan the force-update signal out to every indexer
		// shard; either way, skip to the next request.
		if request.docId == 0 {
			if request.forceUpdate {
				for i := 0; i < engine.initOptions.NumShards; i++ {
					engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
				}
			}
			continue
		}

		shard := engine.getShard(request.hash)
		tokensMap := make(map[string][]int) // token text -> byte offsets of each occurrence
		numTokens := 0
		if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" {
			// When the document body is non-empty, prefer keywords obtained
			// by segmenting the content, recording each occurrence's start
			// offset and skipping stop tokens.
			segments := engine.segmenter.Segment([]byte(request.data.Content))
			for _, segment := range segments {
				token := segment.Token().Text()
				if !engine.stopTokens.IsStopToken(token) {
					tokensMap[token] = append(tokensMap[token], segment.Start())
				}
			}
			numTokens = len(segments)
		} else {
			// Otherwise load the user-supplied tokens from the request.
			for _, t := range request.data.Tokens {
				if !engine.stopTokens.IsStopToken(t.Text) {
					tokensMap[t.Text] = t.Locations
				}
			}
			numTokens = len(request.data.Tokens)
		}

		// Add the document labels, which are not run through the segmenter.
		// (Stop-token filtering is only applied to labels when the segmenter
		// is in use, mirroring the original branches.)
		for _, label := range request.data.Labels {
			if !engine.initOptions.NotUsingSegmenter {
				if !engine.stopTokens.IsStopToken(label) {
					// If the label already appeared in the content, keep the
					// existing entry; overwriting it would lose the position
					// information collected above.
					if _, ok := tokensMap[label]; !ok {
						tokensMap[label] = []int{}
					}
				}
			} else {
				// Same guard as above: do not clobber position data for a
				// label that is already a keyword.
				if _, ok := tokensMap[label]; !ok {
					tokensMap[label] = []int{}
				}
			}
		}

		// Build the indexer request from the accumulated keyword map.
		indexerRequest := indexerAddDocumentRequest{
			document: &types.DocumentIndex{
				DocId:       request.docId,
				TokenLength: float32(numTokens),
				Keywords:    make([]types.KeywordIndex, len(tokensMap)),
			},
			forceUpdate: request.forceUpdate,
		}
		iTokens := 0
		for k, v := range tokensMap {
			indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{
				Text: k,
				// Labels carry no positions, so their frequency comes out as
				// 0 and they do not contribute to tf-idf.
				Frequency: float32(len(v)),
				Starts:    v}
			iTokens++
		}

		engine.indexerAddDocChannels[shard] <- indexerRequest
		// On forceUpdate, notify the remaining shards; the target shard has
		// already received the flag together with the document above.
		if request.forceUpdate {
			for i := 0; i < engine.initOptions.NumShards; i++ {
				if i == shard {
					continue
				}
				engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
			}
		}
		rankerRequest := rankerAddDocRequest{
			docId: request.docId, fields: request.data.Fields}
		engine.rankerAddDocChannels[shard] <- rankerRequest
	}
}
|
||||
Reference in New Issue
Block a user