mirror of
https://github.com/mindoc-org/mindoc.git
synced 2026-01-22 21:02:12 +08:00
搭建框架
This commit is contained in:
8
vendor/github.com/huichen/murmur/README.md
generated
vendored
Normal file
8
vendor/github.com/huichen/murmur/README.md
generated
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
murmur
|
||||
======
|
||||
|
||||
Go Murmur3 hash implementation
|
||||
|
||||
Based on
|
||||
|
||||
http://en.wikipedia.org/wiki/MurmurHash
|
||||
13
vendor/github.com/huichen/murmur/license.txt
generated
vendored
Normal file
13
vendor/github.com/huichen/murmur/license.txt
generated
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
Copyright 2013 Hui Chen
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
58
vendor/github.com/huichen/murmur/murmur.go
generated
vendored
Normal file
58
vendor/github.com/huichen/murmur/murmur.go
generated
vendored
Normal file
@@ -0,0 +1,58 @@
|
||||
// Murmur3 32bit hash function based on
|
||||
// http://en.wikipedia.org/wiki/MurmurHash
|
||||
package murmur
|
||||
|
||||
const (
|
||||
c1 = 0xcc9e2d51
|
||||
c2 = 0x1b873593
|
||||
c3 = 0x85ebca6b
|
||||
c4 = 0xc2b2ae35
|
||||
r1 = 15
|
||||
r2 = 13
|
||||
m = 5
|
||||
n = 0xe6546b64
|
||||
)
|
||||
|
||||
var (
|
||||
Seed = uint32(1)
|
||||
)
|
||||
|
||||
func Murmur3(key []byte) (hash uint32) {
|
||||
hash = Seed
|
||||
iByte := 0
|
||||
for ; iByte+4 <= len(key); iByte += 4 {
|
||||
k := uint32(key[iByte]) | uint32(key[iByte+1])<<8 | uint32(key[iByte+2])<<16 | uint32(key[iByte+3])<<24
|
||||
k *= c1
|
||||
k = (k << r1) | (k >> (32 - r1))
|
||||
k *= c2
|
||||
hash ^= k
|
||||
hash = (hash << r2) | (hash >> (32 - r2))
|
||||
hash = hash*m + n
|
||||
}
|
||||
|
||||
var remainingBytes uint32
|
||||
switch len(key) - iByte {
|
||||
case 3:
|
||||
remainingBytes += uint32(key[iByte+2]) << 16
|
||||
fallthrough
|
||||
case 2:
|
||||
remainingBytes += uint32(key[iByte+1]) << 8
|
||||
fallthrough
|
||||
case 1:
|
||||
remainingBytes += uint32(key[iByte])
|
||||
remainingBytes *= c1
|
||||
remainingBytes = (remainingBytes << r1) | (remainingBytes >> (32 - r1))
|
||||
remainingBytes = remainingBytes * c2
|
||||
hash ^= remainingBytes
|
||||
}
|
||||
|
||||
hash ^= uint32(len(key))
|
||||
hash ^= hash >> 16
|
||||
hash *= c3
|
||||
hash ^= hash >> 13
|
||||
hash *= c4
|
||||
hash ^= hash >> 16
|
||||
|
||||
// 出发吧,狗嬷嬷!
|
||||
return
|
||||
}
|
||||
43
vendor/github.com/huichen/sego/README.md
generated
vendored
Normal file
43
vendor/github.com/huichen/sego/README.md
generated
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
sego
|
||||
====
|
||||
|
||||
Go中文分词
|
||||
|
||||
<a href="https://github.com/huichen/sego/blob/master/dictionary.go">词典</a>用双数组trie(Double-Array Trie)实现,
|
||||
<a href="https://github.com/huichen/sego/blob/master/segmenter.go">分词器</a>算法为基于词频的最短路径加动态规划。
|
||||
|
||||
支持普通和搜索引擎两种分词模式,支持用户词典、词性标注,可运行<a href="https://github.com/huichen/sego/blob/master/server/server.go">JSON RPC服务</a>。
|
||||
|
||||
分词速度<a href="https://github.com/huichen/sego/blob/master/tools/benchmark.go">单线程</a>9MB/s,<a href="https://github.com/huichen/sego/blob/master/tools/goroutines.go">goroutines并发</a>42MB/s(8核Macbook Pro)。
|
||||
|
||||
# 安装/更新
|
||||
|
||||
```
|
||||
go get -u github.com/huichen/sego
|
||||
```
|
||||
|
||||
# 使用
|
||||
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/huichen/sego"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// 载入词典
|
||||
var segmenter sego.Segmenter
|
||||
segmenter.LoadDictionary("github.com/huichen/sego/data/dictionary.txt")
|
||||
|
||||
// 分词
|
||||
text := []byte("中华人民共和国中央人民政府")
|
||||
segments := segmenter.Segment(text)
|
||||
|
||||
// 处理分词结果
|
||||
// 支持普通模式和搜索模式两种分词,见代码中SegmentsToString函数的注释。
|
||||
fmt.Println(sego.SegmentsToString(segments, false))
|
||||
}
|
||||
```
|
||||
65
vendor/github.com/huichen/sego/dictionary.go
generated
vendored
Normal file
65
vendor/github.com/huichen/sego/dictionary.go
generated
vendored
Normal file
@@ -0,0 +1,65 @@
|
||||
package sego
|
||||
|
||||
import "github.com/adamzy/cedar-go"
|
||||
|
||||
// Dictionary结构体实现了一个字串前缀树,一个分词可能出现在叶子节点也有可能出现在非叶节点
|
||||
type Dictionary struct {
|
||||
trie *cedar.Cedar // Cedar 前缀树
|
||||
maxTokenLength int // 词典中最长的分词
|
||||
tokens []Token // 词典中所有的分词,方便遍历
|
||||
totalFrequency int64 // 词典中所有分词的频率之和
|
||||
}
|
||||
|
||||
func NewDictionary() *Dictionary {
|
||||
return &Dictionary{trie: cedar.New()}
|
||||
}
|
||||
|
||||
// 词典中最长的分词
|
||||
func (dict *Dictionary) MaxTokenLength() int {
|
||||
return dict.maxTokenLength
|
||||
}
|
||||
|
||||
// 词典中分词数目
|
||||
func (dict *Dictionary) NumTokens() int {
|
||||
return len(dict.tokens)
|
||||
}
|
||||
|
||||
// 词典中所有分词的频率之和
|
||||
func (dict *Dictionary) TotalFrequency() int64 {
|
||||
return dict.totalFrequency
|
||||
}
|
||||
|
||||
// 向词典中加入一个分词
|
||||
func (dict *Dictionary) addToken(token Token) {
|
||||
bytes := textSliceToBytes(token.text)
|
||||
_, err := dict.trie.Get(bytes)
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
|
||||
dict.trie.Insert(bytes, dict.NumTokens())
|
||||
dict.tokens = append(dict.tokens, token)
|
||||
dict.totalFrequency += int64(token.frequency)
|
||||
if len(token.text) > dict.maxTokenLength {
|
||||
dict.maxTokenLength = len(token.text)
|
||||
}
|
||||
}
|
||||
|
||||
// 在词典中查找和字元组words可以前缀匹配的所有分词
|
||||
// 返回值为找到的分词数
|
||||
func (dict *Dictionary) lookupTokens(words []Text, tokens []*Token) (numOfTokens int) {
|
||||
var id, value int
|
||||
var err error
|
||||
for _, word := range words {
|
||||
id, err = dict.trie.Jump(word, id)
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
value, err = dict.trie.Value(id)
|
||||
if err == nil {
|
||||
tokens[numOfTokens] = &dict.tokens[value]
|
||||
numOfTokens++
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
13
vendor/github.com/huichen/sego/license.txt
generated
vendored
Normal file
13
vendor/github.com/huichen/sego/license.txt
generated
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
Copyright 2013 Hui Chen
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
28
vendor/github.com/huichen/sego/segment.go
generated
vendored
Normal file
28
vendor/github.com/huichen/sego/segment.go
generated
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
package sego
|
||||
|
||||
// 文本中的一个分词
|
||||
type Segment struct {
|
||||
// 分词在文本中的起始字节位置
|
||||
start int
|
||||
|
||||
// 分词在文本中的结束字节位置(不包括该位置)
|
||||
end int
|
||||
|
||||
// 分词信息
|
||||
token *Token
|
||||
}
|
||||
|
||||
// 返回分词在文本中的起始字节位置
|
||||
func (s *Segment) Start() int {
|
||||
return s.start
|
||||
}
|
||||
|
||||
// 返回分词在文本中的结束字节位置(不包括该位置)
|
||||
func (s *Segment) End() int {
|
||||
return s.end
|
||||
}
|
||||
|
||||
// 返回分词信息
|
||||
func (s *Segment) Token() *Token {
|
||||
return s.token
|
||||
}
|
||||
295
vendor/github.com/huichen/sego/segmenter.go
generated
vendored
Normal file
295
vendor/github.com/huichen/sego/segmenter.go
generated
vendored
Normal file
@@ -0,0 +1,295 @@
|
||||
//Go中文分词
|
||||
package sego
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log"
|
||||
"math"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
const (
|
||||
minTokenFrequency = 2 // 仅从字典文件中读取大于等于此频率的分词
|
||||
)
|
||||
|
||||
// 分词器结构体
|
||||
type Segmenter struct {
|
||||
dict *Dictionary
|
||||
}
|
||||
|
||||
// 该结构体用于记录Viterbi算法中某字元处的向前分词跳转信息
|
||||
type jumper struct {
|
||||
minDistance float32
|
||||
token *Token
|
||||
}
|
||||
|
||||
// 返回分词器使用的词典
|
||||
func (seg *Segmenter) Dictionary() *Dictionary {
|
||||
return seg.dict
|
||||
}
|
||||
|
||||
// 从文件中载入词典
|
||||
//
|
||||
// 可以载入多个词典文件,文件名用","分隔,排在前面的词典优先载入分词,比如
|
||||
// "用户词典.txt,通用词典.txt"
|
||||
// 当一个分词既出现在用户词典也出现在通用词典中,则优先使用用户词典。
|
||||
//
|
||||
// 词典的格式为(每个分词一行):
|
||||
// 分词文本 频率 词性
|
||||
func (seg *Segmenter) LoadDictionary(files string) {
|
||||
seg.dict = NewDictionary()
|
||||
for _, file := range strings.Split(files, ",") {
|
||||
log.Printf("载入sego词典 %s", file)
|
||||
dictFile, err := os.Open(file)
|
||||
defer dictFile.Close()
|
||||
if err != nil {
|
||||
log.Fatalf("无法载入字典文件 \"%s\" \n", file)
|
||||
}
|
||||
|
||||
reader := bufio.NewReader(dictFile)
|
||||
var text string
|
||||
var freqText string
|
||||
var frequency int
|
||||
var pos string
|
||||
|
||||
// 逐行读入分词
|
||||
for {
|
||||
size, _ := fmt.Fscanln(reader, &text, &freqText, &pos)
|
||||
|
||||
if size == 0 {
|
||||
// 文件结束
|
||||
break
|
||||
} else if size < 2 {
|
||||
// 无效行
|
||||
continue
|
||||
} else if size == 2 {
|
||||
// 没有词性标注时设为空字符串
|
||||
pos = ""
|
||||
}
|
||||
|
||||
// 解析词频
|
||||
var err error
|
||||
frequency, err = strconv.Atoi(freqText)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// 过滤频率太小的词
|
||||
if frequency < minTokenFrequency {
|
||||
continue
|
||||
}
|
||||
|
||||
// 将分词添加到字典中
|
||||
words := splitTextToWords([]byte(text))
|
||||
token := Token{text: words, frequency: frequency, pos: pos}
|
||||
seg.dict.addToken(token)
|
||||
}
|
||||
}
|
||||
|
||||
// 计算每个分词的路径值,路径值含义见Token结构体的注释
|
||||
logTotalFrequency := float32(math.Log2(float64(seg.dict.totalFrequency)))
|
||||
for i := range seg.dict.tokens {
|
||||
token := &seg.dict.tokens[i]
|
||||
token.distance = logTotalFrequency - float32(math.Log2(float64(token.frequency)))
|
||||
}
|
||||
|
||||
// 对每个分词进行细致划分,用于搜索引擎模式,该模式用法见Token结构体的注释。
|
||||
for i := range seg.dict.tokens {
|
||||
token := &seg.dict.tokens[i]
|
||||
segments := seg.segmentWords(token.text, true)
|
||||
|
||||
// 计算需要添加的子分词数目
|
||||
numTokensToAdd := 0
|
||||
for iToken := 0; iToken < len(segments); iToken++ {
|
||||
if len(segments[iToken].token.text) > 1 {
|
||||
// 略去字元长度为一的分词
|
||||
// TODO: 这值得进一步推敲,特别是当字典中有英文复合词的时候
|
||||
numTokensToAdd++
|
||||
}
|
||||
}
|
||||
token.segments = make([]*Segment, numTokensToAdd)
|
||||
|
||||
// 添加子分词
|
||||
iSegmentsToAdd := 0
|
||||
for iToken := 0; iToken < len(segments); iToken++ {
|
||||
if len(segments[iToken].token.text) > 1 {
|
||||
token.segments[iSegmentsToAdd] = &segments[iToken]
|
||||
iSegmentsToAdd++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.Println("sego词典载入完毕")
|
||||
}
|
||||
|
||||
// 对文本分词
|
||||
//
|
||||
// 输入参数:
|
||||
// bytes UTF8文本的字节数组
|
||||
//
|
||||
// 输出:
|
||||
// []Segment 划分的分词
|
||||
func (seg *Segmenter) Segment(bytes []byte) []Segment {
|
||||
return seg.internalSegment(bytes, false)
|
||||
}
|
||||
|
||||
func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
|
||||
// 处理特殊情况
|
||||
if len(bytes) == 0 {
|
||||
return []Segment{}
|
||||
}
|
||||
|
||||
// 划分字元
|
||||
text := splitTextToWords(bytes)
|
||||
|
||||
return seg.segmentWords(text, searchMode)
|
||||
}
|
||||
|
||||
func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
|
||||
// 搜索模式下该分词已无继续划分可能的情况
|
||||
if searchMode && len(text) == 1 {
|
||||
return []Segment{}
|
||||
}
|
||||
|
||||
// jumpers定义了每个字元处的向前跳转信息,包括这个跳转对应的分词,
|
||||
// 以及从文本段开始到该字元的最短路径值
|
||||
jumpers := make([]jumper, len(text))
|
||||
|
||||
tokens := make([]*Token, seg.dict.maxTokenLength)
|
||||
for current := 0; current < len(text); current++ {
|
||||
// 找到前一个字元处的最短路径,以便计算后续路径值
|
||||
var baseDistance float32
|
||||
if current == 0 {
|
||||
// 当本字元在文本首部时,基础距离应该是零
|
||||
baseDistance = 0
|
||||
} else {
|
||||
baseDistance = jumpers[current-1].minDistance
|
||||
}
|
||||
|
||||
// 寻找所有以当前字元开头的分词
|
||||
numTokens := seg.dict.lookupTokens(
|
||||
text[current:minInt(current+seg.dict.maxTokenLength, len(text))], tokens)
|
||||
|
||||
// 对所有可能的分词,更新分词结束字元处的跳转信息
|
||||
for iToken := 0; iToken < numTokens; iToken++ {
|
||||
location := current + len(tokens[iToken].text) - 1
|
||||
if !searchMode || current != 0 || location != len(text)-1 {
|
||||
updateJumper(&jumpers[location], baseDistance, tokens[iToken])
|
||||
}
|
||||
}
|
||||
|
||||
// 当前字元没有对应分词时补加一个伪分词
|
||||
if numTokens == 0 || len(tokens[0].text) > 1 {
|
||||
updateJumper(&jumpers[current], baseDistance,
|
||||
&Token{text: []Text{text[current]}, frequency: 1, distance: 32, pos: "x"})
|
||||
}
|
||||
}
|
||||
|
||||
// 从后向前扫描第一遍得到需要添加的分词数目
|
||||
numSeg := 0
|
||||
for index := len(text) - 1; index >= 0; {
|
||||
location := index - len(jumpers[index].token.text) + 1
|
||||
numSeg++
|
||||
index = location - 1
|
||||
}
|
||||
|
||||
// 从后向前扫描第二遍添加分词到最终结果
|
||||
outputSegments := make([]Segment, numSeg)
|
||||
for index := len(text) - 1; index >= 0; {
|
||||
location := index - len(jumpers[index].token.text) + 1
|
||||
numSeg--
|
||||
outputSegments[numSeg].token = jumpers[index].token
|
||||
index = location - 1
|
||||
}
|
||||
|
||||
// 计算各个分词的字节位置
|
||||
bytePosition := 0
|
||||
for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
|
||||
outputSegments[iSeg].start = bytePosition
|
||||
bytePosition += textSliceByteLength(outputSegments[iSeg].token.text)
|
||||
outputSegments[iSeg].end = bytePosition
|
||||
}
|
||||
return outputSegments
|
||||
}
|
||||
|
||||
// 更新跳转信息:
|
||||
// 1. 当该位置从未被访问过时(jumper.minDistance为零的情况),或者
|
||||
// 2. 当该位置的当前最短路径大于新的最短路径时
|
||||
// 将当前位置的最短路径值更新为baseDistance加上新分词的概率
|
||||
func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
|
||||
newDistance := baseDistance + token.distance
|
||||
if jumper.minDistance == 0 || jumper.minDistance > newDistance {
|
||||
jumper.minDistance = newDistance
|
||||
jumper.token = token
|
||||
}
|
||||
}
|
||||
|
||||
// 取两整数较小值
|
||||
func minInt(a, b int) int {
|
||||
if a > b {
|
||||
return b
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// 取两整数较大值
|
||||
func maxInt(a, b int) int {
|
||||
if a > b {
|
||||
return a
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// 将文本划分成字元
|
||||
func splitTextToWords(text Text) []Text {
|
||||
output := make([]Text, 0, len(text)/3)
|
||||
current := 0
|
||||
inAlphanumeric := true
|
||||
alphanumericStart := 0
|
||||
for current < len(text) {
|
||||
r, size := utf8.DecodeRune(text[current:])
|
||||
if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) {
|
||||
// 当前是拉丁字母或数字(非中日韩文字)
|
||||
if !inAlphanumeric {
|
||||
alphanumericStart = current
|
||||
inAlphanumeric = true
|
||||
}
|
||||
} else {
|
||||
if inAlphanumeric {
|
||||
inAlphanumeric = false
|
||||
if current != 0 {
|
||||
output = append(output, toLower(text[alphanumericStart:current]))
|
||||
}
|
||||
}
|
||||
output = append(output, text[current:current+size])
|
||||
}
|
||||
current += size
|
||||
}
|
||||
|
||||
// 处理最后一个字元是英文的情况
|
||||
if inAlphanumeric {
|
||||
if current != 0 {
|
||||
output = append(output, toLower(text[alphanumericStart:current]))
|
||||
}
|
||||
}
|
||||
|
||||
return output
|
||||
}
|
||||
|
||||
// 将英文词转化为小写
|
||||
func toLower(text []byte) []byte {
|
||||
output := make([]byte, len(text))
|
||||
for i, t := range text {
|
||||
if t >= 'A' && t <= 'Z' {
|
||||
output[i] = t - 'A' + 'a'
|
||||
} else {
|
||||
output[i] = t
|
||||
}
|
||||
}
|
||||
return output
|
||||
}
|
||||
38
vendor/github.com/huichen/sego/test_utils.go
generated
vendored
Normal file
38
vendor/github.com/huichen/sego/test_utils.go
generated
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
package sego
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func expect(t *testing.T, expect string, actual interface{}) {
|
||||
actualString := fmt.Sprint(actual)
|
||||
if expect != actualString {
|
||||
t.Errorf("期待值=\"%s\", 实际=\"%s\"", expect, actualString)
|
||||
}
|
||||
}
|
||||
|
||||
func printTokens(tokens []*Token, numTokens int) (output string) {
|
||||
for iToken := 0; iToken < numTokens; iToken++ {
|
||||
for _, word := range tokens[iToken].text {
|
||||
output += fmt.Sprint(string(word))
|
||||
}
|
||||
output += " "
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func toWords(strings ...string) []Text {
|
||||
words := []Text{}
|
||||
for _, s := range strings {
|
||||
words = append(words, []byte(s))
|
||||
}
|
||||
return words
|
||||
}
|
||||
|
||||
func bytesToString(bytes []Text) (output string) {
|
||||
for _, b := range bytes {
|
||||
output += (string(b) + "/")
|
||||
}
|
||||
return
|
||||
}
|
||||
50
vendor/github.com/huichen/sego/token.go
generated
vendored
Normal file
50
vendor/github.com/huichen/sego/token.go
generated
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
package sego
|
||||
|
||||
// 字串类型,可以用来表达
|
||||
// 1. 一个字元,比如"中"又如"国", 英文的一个字元是一个词
|
||||
// 2. 一个分词,比如"中国"又如"人口"
|
||||
// 3. 一段文字,比如"中国有十三亿人口"
|
||||
type Text []byte
|
||||
|
||||
// 一个分词
|
||||
type Token struct {
|
||||
// 分词的字串,这实际上是个字元数组
|
||||
text []Text
|
||||
|
||||
// 分词在语料库中的词频
|
||||
frequency int
|
||||
|
||||
// log2(总词频/该分词词频),这相当于log2(1/p(分词)),用作动态规划中
|
||||
// 该分词的路径长度。求解prod(p(分词))的最大值相当于求解
|
||||
// sum(distance(分词))的最小值,这就是“最短路径”的来历。
|
||||
distance float32
|
||||
|
||||
// 词性标注
|
||||
pos string
|
||||
|
||||
// 该分词文本的进一步分词划分,见Segments函数注释。
|
||||
segments []*Segment
|
||||
}
|
||||
|
||||
// 返回分词文本
|
||||
func (token *Token) Text() string {
|
||||
return textSliceToString(token.text)
|
||||
}
|
||||
|
||||
// 返回分词在语料库中的词频
|
||||
func (token *Token) Frequency() int {
|
||||
return token.frequency
|
||||
}
|
||||
|
||||
// 返回分词词性标注
|
||||
func (token *Token) Pos() string {
|
||||
return token.pos
|
||||
}
|
||||
|
||||
// 该分词文本的进一步分词划分,比如"中华人民共和国中央人民政府"这个分词
|
||||
// 有两个子分词"中华人民共和国"和"中央人民政府"。子分词也可以进一步有子分词
|
||||
// 形成一个树结构,遍历这个树就可以得到该分词的所有细致分词划分,这主要
|
||||
// 用于搜索引擎对一段文本进行全文搜索。
|
||||
func (token *Token) Segments() []*Segment {
|
||||
return token.segments
|
||||
}
|
||||
93
vendor/github.com/huichen/sego/utils.go
generated
vendored
Normal file
93
vendor/github.com/huichen/sego/utils.go
generated
vendored
Normal file
@@ -0,0 +1,93 @@
|
||||
package sego
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// 输出分词结果为字符串
|
||||
//
|
||||
// 有两种输出模式,以"中华人民共和国"为例
|
||||
//
|
||||
// 普通模式(searchMode=false)输出一个分词"中华人民共和国/ns "
|
||||
// 搜索模式(searchMode=true) 输出普通模式的再细致切分:
|
||||
// "中华/nz 人民/n 共和/nz 共和国/ns 人民共和国/nt 中华人民共和国/ns "
|
||||
//
|
||||
// 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见Token结构体的注释。
|
||||
func SegmentsToString(segs []Segment, searchMode bool) (output string) {
|
||||
if searchMode {
|
||||
for _, seg := range segs {
|
||||
output += tokenToString(seg.token)
|
||||
}
|
||||
} else {
|
||||
for _, seg := range segs {
|
||||
output += fmt.Sprintf(
|
||||
"%s/%s ", textSliceToString(seg.token.text), seg.token.pos)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func tokenToString(token *Token) (output string) {
|
||||
for _, s := range token.segments {
|
||||
output += tokenToString(s.token)
|
||||
}
|
||||
output += fmt.Sprintf("%s/%s ", textSliceToString(token.text), token.pos)
|
||||
return
|
||||
}
|
||||
|
||||
// 输出分词结果到一个字符串slice
|
||||
//
|
||||
// 有两种输出模式,以"中华人民共和国"为例
|
||||
//
|
||||
// 普通模式(searchMode=false)输出一个分词"[中华人民共和国]"
|
||||
// 搜索模式(searchMode=true) 输出普通模式的再细致切分:
|
||||
// "[中华 人民 共和 共和国 人民共和国 中华人民共和国]"
|
||||
//
|
||||
// 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见Token结构体的注释。
|
||||
|
||||
func SegmentsToSlice(segs []Segment, searchMode bool) (output []string) {
|
||||
if searchMode {
|
||||
for _, seg := range segs {
|
||||
output = append(output, tokenToSlice(seg.token)...)
|
||||
}
|
||||
} else {
|
||||
for _, seg := range segs {
|
||||
output = append(output, seg.token.Text())
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func tokenToSlice(token *Token) (output []string) {
|
||||
for _, s := range token.segments {
|
||||
output = append(output, tokenToSlice(s.token)...)
|
||||
}
|
||||
output = append(output, textSliceToString(token.text))
|
||||
return output
|
||||
}
|
||||
|
||||
// 将多个字元拼接一个字符串输出
|
||||
func textSliceToString(text []Text) string {
|
||||
var output string
|
||||
for _, word := range text {
|
||||
output += string(word)
|
||||
}
|
||||
return output
|
||||
}
|
||||
|
||||
// 返回多个字元的字节总长度
|
||||
func textSliceByteLength(text []Text) (length int) {
|
||||
for _, word := range text {
|
||||
length += len(word)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func textSliceToBytes(text []Text) []byte {
|
||||
var buf bytes.Buffer
|
||||
for _, word := range text {
|
||||
buf.Write(word)
|
||||
}
|
||||
return buf.Bytes()
|
||||
}
|
||||
574
vendor/github.com/huichen/wukong/core/indexer.go
generated
vendored
Normal file
574
vendor/github.com/huichen/wukong/core/indexer.go
generated
vendored
Normal file
@@ -0,0 +1,574 @@
|
||||
package core
|
||||
|
||||
import (
|
||||
"log"
|
||||
"math"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/huichen/wukong/types"
|
||||
"github.com/huichen/wukong/utils"
|
||||
)
|
||||
|
||||
// 索引器
|
||||
type Indexer struct {
|
||||
// 从搜索键到文档列表的反向索引
|
||||
// 加了读写锁以保证读写安全
|
||||
tableLock struct {
|
||||
sync.RWMutex
|
||||
table map[string]*KeywordIndices
|
||||
docsState map[uint64]int // nil: 表示无状态记录,0: 存在于索引中,1: 等待删除,2: 等待加入
|
||||
}
|
||||
addCacheLock struct {
|
||||
sync.RWMutex
|
||||
addCachePointer int
|
||||
addCache types.DocumentsIndex
|
||||
}
|
||||
removeCacheLock struct {
|
||||
sync.RWMutex
|
||||
removeCachePointer int
|
||||
removeCache types.DocumentsId
|
||||
}
|
||||
|
||||
initOptions types.IndexerInitOptions
|
||||
initialized bool
|
||||
|
||||
// 这实际上是总文档数的一个近似
|
||||
numDocuments uint64
|
||||
|
||||
// 所有被索引文本的总关键词数
|
||||
totalTokenLength float32
|
||||
|
||||
// 每个文档的关键词长度
|
||||
docTokenLengths map[uint64]float32
|
||||
}
|
||||
|
||||
// 反向索引表的一行,收集了一个搜索键出现的所有文档,按照DocId从小到大排序。
|
||||
type KeywordIndices struct {
|
||||
// 下面的切片是否为空,取决于初始化时IndexType的值
|
||||
docIds []uint64 // 全部类型都有
|
||||
frequencies []float32 // IndexType == FrequenciesIndex
|
||||
locations [][]int // IndexType == LocationsIndex
|
||||
}
|
||||
|
||||
// 初始化索引器
|
||||
func (indexer *Indexer) Init(options types.IndexerInitOptions) {
|
||||
if indexer.initialized == true {
|
||||
log.Fatal("索引器不能初始化两次")
|
||||
}
|
||||
options.Init()
|
||||
indexer.initOptions = options
|
||||
indexer.initialized = true
|
||||
|
||||
indexer.tableLock.table = make(map[string]*KeywordIndices)
|
||||
indexer.tableLock.docsState = make(map[uint64]int)
|
||||
indexer.addCacheLock.addCache = make([]*types.DocumentIndex, indexer.initOptions.DocCacheSize)
|
||||
indexer.removeCacheLock.removeCache = make([]uint64, indexer.initOptions.DocCacheSize*2)
|
||||
indexer.docTokenLengths = make(map[uint64]float32)
|
||||
}
|
||||
|
||||
// 从KeywordIndices中得到第i个文档的DocId
|
||||
func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
|
||||
return ti.docIds[i]
|
||||
}
|
||||
|
||||
// 得到KeywordIndices中文档总数
|
||||
func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {
|
||||
return len(ti.docIds)
|
||||
}
|
||||
|
||||
// 向 ADDCACHE 中加入一个文档
|
||||
func (indexer *Indexer) AddDocumentToCache(document *types.DocumentIndex, forceUpdate bool) {
|
||||
if indexer.initialized == false {
|
||||
log.Fatal("索引器尚未初始化")
|
||||
}
|
||||
|
||||
indexer.addCacheLock.Lock()
|
||||
if document != nil {
|
||||
indexer.addCacheLock.addCache[indexer.addCacheLock.addCachePointer] = document
|
||||
indexer.addCacheLock.addCachePointer++
|
||||
}
|
||||
if indexer.addCacheLock.addCachePointer >= indexer.initOptions.DocCacheSize || forceUpdate {
|
||||
indexer.tableLock.Lock()
|
||||
position := 0
|
||||
for i := 0; i < indexer.addCacheLock.addCachePointer; i++ {
|
||||
docIndex := indexer.addCacheLock.addCache[i]
|
||||
if docState, ok := indexer.tableLock.docsState[docIndex.DocId]; ok && docState <= 1 {
|
||||
// ok && docState == 0 表示存在于索引中,需先删除再添加
|
||||
// ok && docState == 1 表示不一定存在于索引中,等待删除,需先删除再添加
|
||||
if position != i {
|
||||
indexer.addCacheLock.addCache[position], indexer.addCacheLock.addCache[i] =
|
||||
indexer.addCacheLock.addCache[i], indexer.addCacheLock.addCache[position]
|
||||
}
|
||||
if docState == 0 {
|
||||
indexer.removeCacheLock.Lock()
|
||||
indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] =
|
||||
docIndex.DocId
|
||||
indexer.removeCacheLock.removeCachePointer++
|
||||
indexer.removeCacheLock.Unlock()
|
||||
indexer.tableLock.docsState[docIndex.DocId] = 1
|
||||
indexer.numDocuments--
|
||||
}
|
||||
position++
|
||||
} else if !ok {
|
||||
indexer.tableLock.docsState[docIndex.DocId] = 2
|
||||
}
|
||||
}
|
||||
|
||||
indexer.tableLock.Unlock()
|
||||
if indexer.RemoveDocumentToCache(0, forceUpdate) {
|
||||
// 只有当存在于索引表中的文档已被删除,其才可以重新加入到索引表中
|
||||
position = 0
|
||||
}
|
||||
|
||||
addCachedDocuments := indexer.addCacheLock.addCache[position:indexer.addCacheLock.addCachePointer]
|
||||
indexer.addCacheLock.addCachePointer = position
|
||||
indexer.addCacheLock.Unlock()
|
||||
sort.Sort(addCachedDocuments)
|
||||
indexer.AddDocuments(&addCachedDocuments)
|
||||
} else {
|
||||
indexer.addCacheLock.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
// 向反向索引表中加入 ADDCACHE 中所有文档
|
||||
func (indexer *Indexer) AddDocuments(documents *types.DocumentsIndex) {
|
||||
if indexer.initialized == false {
|
||||
log.Fatal("索引器尚未初始化")
|
||||
}
|
||||
|
||||
indexer.tableLock.Lock()
|
||||
defer indexer.tableLock.Unlock()
|
||||
indexPointers := make(map[string]int, len(indexer.tableLock.table))
|
||||
|
||||
// DocId 递增顺序遍历插入文档保证索引移动次数最少
|
||||
for i, document := range *documents {
|
||||
if i < len(*documents)-1 && (*documents)[i].DocId == (*documents)[i+1].DocId {
|
||||
// 如果有重复文档加入,因为稳定排序,只加入最后一个
|
||||
continue
|
||||
}
|
||||
if docState, ok := indexer.tableLock.docsState[document.DocId]; ok && docState == 1 {
|
||||
// 如果此时 docState 仍为 1,说明该文档需被删除
|
||||
// docState 合法状态为 nil & 2,保证一定不会插入已经在索引表中的文档
|
||||
continue
|
||||
}
|
||||
|
||||
// 更新文档关键词总长度
|
||||
if document.TokenLength != 0 {
|
||||
indexer.docTokenLengths[document.DocId] = float32(document.TokenLength)
|
||||
indexer.totalTokenLength += document.TokenLength
|
||||
}
|
||||
|
||||
docIdIsNew := true
|
||||
for _, keyword := range document.Keywords {
|
||||
indices, foundKeyword := indexer.tableLock.table[keyword.Text]
|
||||
if !foundKeyword {
|
||||
// 如果没找到该搜索键则加入
|
||||
ti := KeywordIndices{}
|
||||
switch indexer.initOptions.IndexType {
|
||||
case types.LocationsIndex:
|
||||
ti.locations = [][]int{keyword.Starts}
|
||||
case types.FrequenciesIndex:
|
||||
ti.frequencies = []float32{keyword.Frequency}
|
||||
}
|
||||
ti.docIds = []uint64{document.DocId}
|
||||
indexer.tableLock.table[keyword.Text] = &ti
|
||||
continue
|
||||
}
|
||||
|
||||
// 查找应该插入的位置,且索引一定不存在
|
||||
position, _ := indexer.searchIndex(
|
||||
indices, indexPointers[keyword.Text], indexer.getIndexLength(indices)-1, document.DocId)
|
||||
indexPointers[keyword.Text] = position
|
||||
switch indexer.initOptions.IndexType {
|
||||
case types.LocationsIndex:
|
||||
indices.locations = append(indices.locations, []int{})
|
||||
copy(indices.locations[position+1:], indices.locations[position:])
|
||||
indices.locations[position] = keyword.Starts
|
||||
case types.FrequenciesIndex:
|
||||
indices.frequencies = append(indices.frequencies, float32(0))
|
||||
copy(indices.frequencies[position+1:], indices.frequencies[position:])
|
||||
indices.frequencies[position] = keyword.Frequency
|
||||
}
|
||||
indices.docIds = append(indices.docIds, 0)
|
||||
copy(indices.docIds[position+1:], indices.docIds[position:])
|
||||
indices.docIds[position] = document.DocId
|
||||
}
|
||||
|
||||
// 更新文章状态和总数
|
||||
if docIdIsNew {
|
||||
indexer.tableLock.docsState[document.DocId] = 0
|
||||
indexer.numDocuments++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 向 REMOVECACHE 中加入一个待删除文档
|
||||
// 返回值表示文档是否在索引表中被删除
|
||||
func (indexer *Indexer) RemoveDocumentToCache(docId uint64, forceUpdate bool) bool {
|
||||
if indexer.initialized == false {
|
||||
log.Fatal("索引器尚未初始化")
|
||||
}
|
||||
|
||||
indexer.removeCacheLock.Lock()
|
||||
if docId != 0 {
|
||||
indexer.tableLock.Lock()
|
||||
if docState, ok := indexer.tableLock.docsState[docId]; ok && docState == 0 {
|
||||
indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] = docId
|
||||
indexer.removeCacheLock.removeCachePointer++
|
||||
indexer.tableLock.docsState[docId] = 1
|
||||
indexer.numDocuments--
|
||||
} else if ok && docState == 2 {
|
||||
// 删除一个等待加入的文档
|
||||
indexer.tableLock.docsState[docId] = 1
|
||||
} else if !ok {
|
||||
// 若文档不存在,则无法判断其是否在 addCache 中,需避免这样的操作
|
||||
}
|
||||
indexer.tableLock.Unlock()
|
||||
}
|
||||
|
||||
if indexer.removeCacheLock.removeCachePointer > 0 &&
|
||||
(indexer.removeCacheLock.removeCachePointer >= indexer.initOptions.DocCacheSize ||
|
||||
forceUpdate) {
|
||||
removeCachedDocuments := indexer.removeCacheLock.removeCache[:indexer.removeCacheLock.removeCachePointer]
|
||||
indexer.removeCacheLock.removeCachePointer = 0
|
||||
indexer.removeCacheLock.Unlock()
|
||||
sort.Sort(removeCachedDocuments)
|
||||
indexer.RemoveDocuments(&removeCachedDocuments)
|
||||
return true
|
||||
}
|
||||
indexer.removeCacheLock.Unlock()
|
||||
return false
|
||||
}
|
||||
|
||||
// 向反向索引表中删除 REMOVECACHE 中所有文档
|
||||
func (indexer *Indexer) RemoveDocuments(documents *types.DocumentsId) {
|
||||
if indexer.initialized == false {
|
||||
log.Fatal("索引器尚未初始化")
|
||||
}
|
||||
|
||||
indexer.tableLock.Lock()
|
||||
defer indexer.tableLock.Unlock()
|
||||
|
||||
// 更新文档关键词总长度,删除文档状态
|
||||
for _, docId := range *documents {
|
||||
indexer.totalTokenLength -= indexer.docTokenLengths[docId]
|
||||
delete(indexer.docTokenLengths, docId)
|
||||
delete(indexer.tableLock.docsState, docId)
|
||||
}
|
||||
|
||||
for keyword, indices := range indexer.tableLock.table {
|
||||
indicesTop, indicesPointer := 0, 0
|
||||
documentsPointer := sort.Search(
|
||||
len(*documents), func(i int) bool { return (*documents)[i] >= indices.docIds[0] })
|
||||
// 双指针扫描,进行批量删除操作
|
||||
for documentsPointer < len(*documents) && indicesPointer < indexer.getIndexLength(indices) {
|
||||
if indices.docIds[indicesPointer] < (*documents)[documentsPointer] {
|
||||
if indicesTop != indicesPointer {
|
||||
switch indexer.initOptions.IndexType {
|
||||
case types.LocationsIndex:
|
||||
indices.locations[indicesTop] = indices.locations[indicesPointer]
|
||||
case types.FrequenciesIndex:
|
||||
indices.frequencies[indicesTop] = indices.frequencies[indicesPointer]
|
||||
}
|
||||
indices.docIds[indicesTop] = indices.docIds[indicesPointer]
|
||||
}
|
||||
indicesTop++
|
||||
indicesPointer++
|
||||
} else if indices.docIds[indicesPointer] == (*documents)[documentsPointer] {
|
||||
indicesPointer++
|
||||
documentsPointer++
|
||||
} else {
|
||||
documentsPointer++
|
||||
}
|
||||
}
|
||||
if indicesTop != indicesPointer {
|
||||
switch indexer.initOptions.IndexType {
|
||||
case types.LocationsIndex:
|
||||
indices.locations = append(
|
||||
indices.locations[:indicesTop], indices.locations[indicesPointer:]...)
|
||||
case types.FrequenciesIndex:
|
||||
indices.frequencies = append(
|
||||
indices.frequencies[:indicesTop], indices.frequencies[indicesPointer:]...)
|
||||
}
|
||||
indices.docIds = append(
|
||||
indices.docIds[:indicesTop], indices.docIds[indicesPointer:]...)
|
||||
}
|
||||
if len(indices.docIds) == 0 {
|
||||
delete(indexer.tableLock.table, keyword)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 查找包含全部搜索键(AND操作)的文档
|
||||
// 当docIds不为nil时仅从docIds指定的文档中查找
|
||||
func (indexer *Indexer) Lookup(
|
||||
tokens []string, labels []string, docIds map[uint64]bool, countDocsOnly bool) (docs []types.IndexedDocument, numDocs int) {
|
||||
if indexer.initialized == false {
|
||||
log.Fatal("索引器尚未初始化")
|
||||
}
|
||||
|
||||
if indexer.numDocuments == 0 {
|
||||
return
|
||||
}
|
||||
numDocs = 0
|
||||
|
||||
// 合并关键词和标签为搜索键
|
||||
keywords := make([]string, len(tokens)+len(labels))
|
||||
copy(keywords, tokens)
|
||||
copy(keywords[len(tokens):], labels)
|
||||
|
||||
indexer.tableLock.RLock()
|
||||
defer indexer.tableLock.RUnlock()
|
||||
table := make([]*KeywordIndices, len(keywords))
|
||||
for i, keyword := range keywords {
|
||||
indices, found := indexer.tableLock.table[keyword]
|
||||
if !found {
|
||||
// 当反向索引表中无此搜索键时直接返回
|
||||
return
|
||||
} else {
|
||||
// 否则加入反向表中
|
||||
table[i] = indices
|
||||
}
|
||||
}
|
||||
|
||||
// 当没有找到时直接返回
|
||||
if len(table) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// 归并查找各个搜索键出现文档的交集
|
||||
// 从后向前查保证先输出DocId较大文档
|
||||
indexPointers := make([]int, len(table))
|
||||
for iTable := 0; iTable < len(table); iTable++ {
|
||||
indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1
|
||||
}
|
||||
// 平均文本关键词长度,用于计算BM25
|
||||
avgDocLength := indexer.totalTokenLength / float32(indexer.numDocuments)
|
||||
for ; indexPointers[0] >= 0; indexPointers[0]-- {
|
||||
// 以第一个搜索键出现的文档作为基准,并遍历其他搜索键搜索同一文档
|
||||
baseDocId := indexer.getDocId(table[0], indexPointers[0])
|
||||
if docIds != nil {
|
||||
if _, found := docIds[baseDocId]; !found {
|
||||
continue
|
||||
}
|
||||
}
|
||||
iTable := 1
|
||||
found := true
|
||||
for ; iTable < len(table); iTable++ {
|
||||
// 二分法比简单的顺序归并效率高,也有更高效率的算法,
|
||||
// 但顺序归并也许是更好的选择,考虑到将来需要用链表重新实现
|
||||
// 以避免反向表添加新文档时的写锁。
|
||||
// TODO: 进一步研究不同求交集算法的速度和可扩展性。
|
||||
position, foundBaseDocId := indexer.searchIndex(table[iTable],
|
||||
0, indexPointers[iTable], baseDocId)
|
||||
if foundBaseDocId {
|
||||
indexPointers[iTable] = position
|
||||
} else {
|
||||
if position == 0 {
|
||||
// 该搜索键中所有的文档ID都比baseDocId大,因此已经没有
|
||||
// 继续查找的必要。
|
||||
return
|
||||
} else {
|
||||
// 继续下一indexPointers[0]的查找
|
||||
indexPointers[iTable] = position - 1
|
||||
found = false
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if found {
|
||||
if docState, ok := indexer.tableLock.docsState[baseDocId]; !ok || docState != 0 {
|
||||
continue
|
||||
}
|
||||
indexedDoc := types.IndexedDocument{}
|
||||
|
||||
// 当为LocationsIndex时计算关键词紧邻距离
|
||||
if indexer.initOptions.IndexType == types.LocationsIndex {
|
||||
// 计算有多少关键词是带有距离信息的
|
||||
numTokensWithLocations := 0
|
||||
for i, t := range table[:len(tokens)] {
|
||||
if len(t.locations[indexPointers[i]]) > 0 {
|
||||
numTokensWithLocations++
|
||||
}
|
||||
}
|
||||
if numTokensWithLocations != len(tokens) {
|
||||
if !countDocsOnly {
|
||||
docs = append(docs, types.IndexedDocument{
|
||||
DocId: baseDocId,
|
||||
})
|
||||
}
|
||||
numDocs++
|
||||
//当某个关键字对应多个文档且有lable关键字存在时,若直接break,将会丢失相当一部分搜索结果
|
||||
continue
|
||||
}
|
||||
|
||||
// 计算搜索键在文档中的紧邻距离
|
||||
tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens)
|
||||
indexedDoc.TokenProximity = int32(tokenProximity)
|
||||
indexedDoc.TokenSnippetLocations = tokenLocations
|
||||
|
||||
// 添加TokenLocations
|
||||
indexedDoc.TokenLocations = make([][]int, len(tokens))
|
||||
for i, t := range table[:len(tokens)] {
|
||||
indexedDoc.TokenLocations[i] = t.locations[indexPointers[i]]
|
||||
}
|
||||
}
|
||||
|
||||
// 当为LocationsIndex或者FrequenciesIndex时计算BM25
|
||||
if indexer.initOptions.IndexType == types.LocationsIndex ||
|
||||
indexer.initOptions.IndexType == types.FrequenciesIndex {
|
||||
bm25 := float32(0)
|
||||
d := indexer.docTokenLengths[baseDocId]
|
||||
for i, t := range table[:len(tokens)] {
|
||||
var frequency float32
|
||||
if indexer.initOptions.IndexType == types.LocationsIndex {
|
||||
frequency = float32(len(t.locations[indexPointers[i]]))
|
||||
} else {
|
||||
frequency = t.frequencies[indexPointers[i]]
|
||||
}
|
||||
|
||||
// 计算BM25
|
||||
if len(t.docIds) > 0 && frequency > 0 && indexer.initOptions.BM25Parameters != nil && avgDocLength != 0 {
|
||||
// 带平滑的idf
|
||||
idf := float32(math.Log2(float64(indexer.numDocuments)/float64(len(t.docIds)) + 1))
|
||||
k1 := indexer.initOptions.BM25Parameters.K1
|
||||
b := indexer.initOptions.BM25Parameters.B
|
||||
bm25 += idf * frequency * (k1 + 1) / (frequency + k1*(1-b+b*d/avgDocLength))
|
||||
}
|
||||
}
|
||||
indexedDoc.BM25 = float32(bm25)
|
||||
}
|
||||
|
||||
indexedDoc.DocId = baseDocId
|
||||
if !countDocsOnly {
|
||||
docs = append(docs, indexedDoc)
|
||||
}
|
||||
numDocs++
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// 二分法查找indices中某文档的索引项
|
||||
// 第一个返回参数为找到的位置或需要插入的位置
|
||||
// 第二个返回参数标明是否找到
|
||||
func (indexer *Indexer) searchIndex(
|
||||
indices *KeywordIndices, start int, end int, docId uint64) (int, bool) {
|
||||
// 特殊情况
|
||||
if indexer.getIndexLength(indices) == start {
|
||||
return start, false
|
||||
}
|
||||
if docId < indexer.getDocId(indices, start) {
|
||||
return start, false
|
||||
} else if docId == indexer.getDocId(indices, start) {
|
||||
return start, true
|
||||
}
|
||||
if docId > indexer.getDocId(indices, end) {
|
||||
return end + 1, false
|
||||
} else if docId == indexer.getDocId(indices, end) {
|
||||
return end, true
|
||||
}
|
||||
|
||||
// 二分
|
||||
var middle int
|
||||
for end-start > 1 {
|
||||
middle = (start + end) / 2
|
||||
if docId == indexer.getDocId(indices, middle) {
|
||||
return middle, true
|
||||
} else if docId > indexer.getDocId(indices, middle) {
|
||||
start = middle
|
||||
} else {
|
||||
end = middle
|
||||
}
|
||||
}
|
||||
return end, false
|
||||
}
|
||||
|
||||
// 计算搜索键在文本中的紧邻距离
|
||||
//
|
||||
// 假定第 i 个搜索键首字节出现在文本中的位置为 P_i,长度 L_i
|
||||
// 紧邻距离计算公式为
|
||||
//
|
||||
// ArgMin(Sum(Abs(P_(i+1) - P_i - L_i)))
|
||||
//
|
||||
// 具体由动态规划实现,依次计算前 i 个 token 在每个出现位置的最优值。
|
||||
// 选定的 P_i 通过 tokenLocations 参数传回。
|
||||
func computeTokenProximity(table []*KeywordIndices, indexPointers []int, tokens []string) (
|
||||
minTokenProximity int, tokenLocations []int) {
|
||||
minTokenProximity = -1
|
||||
tokenLocations = make([]int, len(tokens))
|
||||
|
||||
var (
|
||||
currentLocations, nextLocations []int
|
||||
currentMinValues, nextMinValues []int
|
||||
path [][]int
|
||||
)
|
||||
|
||||
// 初始化路径数组
|
||||
path = make([][]int, len(tokens))
|
||||
for i := 1; i < len(path); i++ {
|
||||
path[i] = make([]int, len(table[i].locations[indexPointers[i]]))
|
||||
}
|
||||
|
||||
// 动态规划
|
||||
currentLocations = table[0].locations[indexPointers[0]]
|
||||
currentMinValues = make([]int, len(currentLocations))
|
||||
for i := 1; i < len(tokens); i++ {
|
||||
nextLocations = table[i].locations[indexPointers[i]]
|
||||
nextMinValues = make([]int, len(nextLocations))
|
||||
for j, _ := range nextMinValues {
|
||||
nextMinValues[j] = -1
|
||||
}
|
||||
|
||||
var iNext int
|
||||
for iCurrent, currentLocation := range currentLocations {
|
||||
if currentMinValues[iCurrent] == -1 {
|
||||
continue
|
||||
}
|
||||
for iNext+1 < len(nextLocations) && nextLocations[iNext+1] < currentLocation {
|
||||
iNext++
|
||||
}
|
||||
|
||||
update := func(from int, to int) {
|
||||
if to >= len(nextLocations) {
|
||||
return
|
||||
}
|
||||
value := currentMinValues[from] + utils.AbsInt(nextLocations[to]-currentLocations[from]-len(tokens[i-1]))
|
||||
if nextMinValues[to] == -1 || value < nextMinValues[to] {
|
||||
nextMinValues[to] = value
|
||||
path[i][to] = from
|
||||
}
|
||||
}
|
||||
|
||||
// 最优解的状态转移只发生在左右最接近的位置
|
||||
update(iCurrent, iNext)
|
||||
update(iCurrent, iNext+1)
|
||||
}
|
||||
|
||||
currentLocations = nextLocations
|
||||
currentMinValues = nextMinValues
|
||||
}
|
||||
|
||||
// 找出最优解
|
||||
var cursor int
|
||||
for i, value := range currentMinValues {
|
||||
if value == -1 {
|
||||
continue
|
||||
}
|
||||
if minTokenProximity == -1 || value < minTokenProximity {
|
||||
minTokenProximity = value
|
||||
cursor = i
|
||||
}
|
||||
}
|
||||
|
||||
// 从路径倒推出最优解的位置
|
||||
for i := len(tokens) - 1; i >= 0; i-- {
|
||||
if i != len(tokens)-1 {
|
||||
cursor = path[i+1][cursor]
|
||||
}
|
||||
tokenLocations[i] = table[i].locations[indexPointers[i]][cursor]
|
||||
}
|
||||
return
|
||||
}
|
||||
106
vendor/github.com/huichen/wukong/core/ranker.go
generated
vendored
Normal file
106
vendor/github.com/huichen/wukong/core/ranker.go
generated
vendored
Normal file
@@ -0,0 +1,106 @@
|
||||
package core
|
||||
|
||||
import (
|
||||
"github.com/huichen/wukong/types"
|
||||
"github.com/huichen/wukong/utils"
|
||||
"log"
|
||||
"sort"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type Ranker struct {
|
||||
lock struct {
|
||||
sync.RWMutex
|
||||
fields map[uint64]interface{}
|
||||
docs map[uint64]bool
|
||||
}
|
||||
initialized bool
|
||||
}
|
||||
|
||||
func (ranker *Ranker) Init() {
|
||||
if ranker.initialized == true {
|
||||
log.Fatal("排序器不能初始化两次")
|
||||
}
|
||||
ranker.initialized = true
|
||||
|
||||
ranker.lock.fields = make(map[uint64]interface{})
|
||||
ranker.lock.docs = make(map[uint64]bool)
|
||||
}
|
||||
|
||||
// 给某个文档添加评分字段
|
||||
func (ranker *Ranker) AddDoc(docId uint64, fields interface{}) {
|
||||
if ranker.initialized == false {
|
||||
log.Fatal("排序器尚未初始化")
|
||||
}
|
||||
|
||||
ranker.lock.Lock()
|
||||
ranker.lock.fields[docId] = fields
|
||||
ranker.lock.docs[docId] = true
|
||||
ranker.lock.Unlock()
|
||||
}
|
||||
|
||||
// 删除某个文档的评分字段
|
||||
func (ranker *Ranker) RemoveDoc(docId uint64) {
|
||||
if ranker.initialized == false {
|
||||
log.Fatal("排序器尚未初始化")
|
||||
}
|
||||
|
||||
ranker.lock.Lock()
|
||||
delete(ranker.lock.fields, docId)
|
||||
delete(ranker.lock.docs, docId)
|
||||
ranker.lock.Unlock()
|
||||
}
|
||||
|
||||
// 给文档评分并排序
|
||||
func (ranker *Ranker) Rank(
|
||||
docs []types.IndexedDocument, options types.RankOptions, countDocsOnly bool) (types.ScoredDocuments, int) {
|
||||
if ranker.initialized == false {
|
||||
log.Fatal("排序器尚未初始化")
|
||||
}
|
||||
|
||||
// 对每个文档评分
|
||||
var outputDocs types.ScoredDocuments
|
||||
numDocs := 0
|
||||
for _, d := range docs {
|
||||
ranker.lock.RLock()
|
||||
// 判断doc是否存在
|
||||
if _, ok := ranker.lock.docs[d.DocId]; ok {
|
||||
fs := ranker.lock.fields[d.DocId]
|
||||
ranker.lock.RUnlock()
|
||||
// 计算评分并剔除没有分值的文档
|
||||
scores := options.ScoringCriteria.Score(d, fs)
|
||||
if len(scores) > 0 {
|
||||
if !countDocsOnly {
|
||||
outputDocs = append(outputDocs, types.ScoredDocument{
|
||||
DocId: d.DocId,
|
||||
Scores: scores,
|
||||
TokenSnippetLocations: d.TokenSnippetLocations,
|
||||
TokenLocations: d.TokenLocations})
|
||||
}
|
||||
numDocs++
|
||||
}
|
||||
} else {
|
||||
ranker.lock.RUnlock()
|
||||
}
|
||||
}
|
||||
|
||||
// 排序
|
||||
if !countDocsOnly {
|
||||
if options.ReverseOrder {
|
||||
sort.Sort(sort.Reverse(outputDocs))
|
||||
} else {
|
||||
sort.Sort(outputDocs)
|
||||
}
|
||||
// 当用户要求只返回部分结果时返回部分结果
|
||||
var start, end int
|
||||
if options.MaxOutputs != 0 {
|
||||
start = utils.MinInt(options.OutputOffset, len(outputDocs))
|
||||
end = utils.MinInt(options.OutputOffset+options.MaxOutputs, len(outputDocs))
|
||||
} else {
|
||||
start = utils.MinInt(options.OutputOffset, len(outputDocs))
|
||||
end = len(outputDocs)
|
||||
}
|
||||
return outputDocs[start:end], numDocs
|
||||
}
|
||||
return outputDocs, numDocs
|
||||
}
|
||||
35
vendor/github.com/huichen/wukong/core/test_utils.go
generated
vendored
Normal file
35
vendor/github.com/huichen/wukong/core/test_utils.go
generated
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
package core
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/huichen/wukong/types"
|
||||
)
|
||||
|
||||
func indicesToString(indexer *Indexer, token string) (output string) {
|
||||
if indices, ok := indexer.tableLock.table[token]; ok {
|
||||
for i := 0; i < indexer.getIndexLength(indices); i++ {
|
||||
output += fmt.Sprintf("%d ",
|
||||
indexer.getDocId(indices, i))
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func indexedDocsToString(docs []types.IndexedDocument, numDocs int) (output string) {
|
||||
for _, doc := range docs {
|
||||
output += fmt.Sprintf("[%d %d %v] ",
|
||||
doc.DocId, doc.TokenProximity, doc.TokenSnippetLocations)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func scoredDocsToString(docs []types.ScoredDocument) (output string) {
|
||||
for _, doc := range docs {
|
||||
output += fmt.Sprintf("[%d [", doc.DocId)
|
||||
for _, score := range doc.Scores {
|
||||
output += fmt.Sprintf("%d ", int(score*1000))
|
||||
}
|
||||
output += "]] "
|
||||
}
|
||||
return
|
||||
}
|
||||
13
vendor/github.com/huichen/wukong/engine/counters.go
generated
vendored
Normal file
13
vendor/github.com/huichen/wukong/engine/counters.go
generated
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
package engine
|
||||
|
||||
func (engine *Engine) NumTokenIndexAdded() uint64 {
|
||||
return engine.numTokenIndexAdded
|
||||
}
|
||||
|
||||
func (engine *Engine) NumDocumentsIndexed() uint64 {
|
||||
return engine.numDocumentsIndexed
|
||||
}
|
||||
|
||||
func (engine *Engine) NumDocumentsRemoved() uint64 {
|
||||
return engine.numDocumentsRemoved
|
||||
}
|
||||
446
vendor/github.com/huichen/wukong/engine/engine.go
generated
vendored
Normal file
446
vendor/github.com/huichen/wukong/engine/engine.go
generated
vendored
Normal file
@@ -0,0 +1,446 @@
|
||||
package engine
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/huichen/murmur"
|
||||
"github.com/huichen/sego"
|
||||
"github.com/huichen/wukong/core"
|
||||
"github.com/huichen/wukong/storage"
|
||||
"github.com/huichen/wukong/types"
|
||||
"github.com/huichen/wukong/utils"
|
||||
"log"
|
||||
"os"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
|
||||
NumNanosecondsInAMillisecond = 1000000
|
||||
PersistentStorageFilePrefix = "wukong"
|
||||
)
|
||||
|
||||
type Engine struct {
|
||||
// 计数器,用来统计有多少文档被索引等信息
|
||||
numDocumentsIndexed uint64
|
||||
numDocumentsRemoved uint64
|
||||
numDocumentsForceUpdated uint64
|
||||
numIndexingRequests uint64
|
||||
numRemovingRequests uint64
|
||||
numForceUpdatingRequests uint64
|
||||
numTokenIndexAdded uint64
|
||||
numDocumentsStored uint64
|
||||
|
||||
// 记录初始化参数
|
||||
initOptions types.EngineInitOptions
|
||||
initialized bool
|
||||
|
||||
indexers []core.Indexer
|
||||
rankers []core.Ranker
|
||||
segmenter sego.Segmenter
|
||||
stopTokens StopTokens
|
||||
dbs []storage.Storage
|
||||
|
||||
// 建立索引器使用的通信通道
|
||||
segmenterChannel chan segmenterRequest
|
||||
indexerAddDocChannels []chan indexerAddDocumentRequest
|
||||
indexerRemoveDocChannels []chan indexerRemoveDocRequest
|
||||
rankerAddDocChannels []chan rankerAddDocRequest
|
||||
|
||||
// 建立排序器使用的通信通道
|
||||
indexerLookupChannels []chan indexerLookupRequest
|
||||
rankerRankChannels []chan rankerRankRequest
|
||||
rankerRemoveDocChannels []chan rankerRemoveDocRequest
|
||||
|
||||
// 建立持久存储使用的通信通道
|
||||
persistentStorageIndexDocumentChannels []chan persistentStorageIndexDocumentRequest
|
||||
persistentStorageInitChannel chan bool
|
||||
}
|
||||
|
||||
func (engine *Engine) Init(options types.EngineInitOptions) {
|
||||
// 将线程数设置为CPU数
|
||||
runtime.GOMAXPROCS(runtime.NumCPU())
|
||||
|
||||
// 初始化初始参数
|
||||
if engine.initialized {
|
||||
log.Fatal("请勿重复初始化引擎")
|
||||
}
|
||||
options.Init()
|
||||
engine.initOptions = options
|
||||
engine.initialized = true
|
||||
|
||||
if !options.NotUsingSegmenter {
|
||||
// 载入分词器词典
|
||||
engine.segmenter.LoadDictionary(options.SegmenterDictionaries)
|
||||
|
||||
// 初始化停用词
|
||||
engine.stopTokens.Init(options.StopTokenFile)
|
||||
}
|
||||
|
||||
// 初始化索引器和排序器
|
||||
for shard := 0; shard < options.NumShards; shard++ {
|
||||
engine.indexers = append(engine.indexers, core.Indexer{})
|
||||
engine.indexers[shard].Init(*options.IndexerInitOptions)
|
||||
|
||||
engine.rankers = append(engine.rankers, core.Ranker{})
|
||||
engine.rankers[shard].Init()
|
||||
}
|
||||
|
||||
// 初始化分词器通道
|
||||
engine.segmenterChannel = make(
|
||||
chan segmenterRequest, options.NumSegmenterThreads)
|
||||
|
||||
// 初始化索引器通道
|
||||
engine.indexerAddDocChannels = make(
|
||||
[]chan indexerAddDocumentRequest, options.NumShards)
|
||||
engine.indexerRemoveDocChannels = make(
|
||||
[]chan indexerRemoveDocRequest, options.NumShards)
|
||||
engine.indexerLookupChannels = make(
|
||||
[]chan indexerLookupRequest, options.NumShards)
|
||||
for shard := 0; shard < options.NumShards; shard++ {
|
||||
engine.indexerAddDocChannels[shard] = make(
|
||||
chan indexerAddDocumentRequest,
|
||||
options.IndexerBufferLength)
|
||||
engine.indexerRemoveDocChannels[shard] = make(
|
||||
chan indexerRemoveDocRequest,
|
||||
options.IndexerBufferLength)
|
||||
engine.indexerLookupChannels[shard] = make(
|
||||
chan indexerLookupRequest,
|
||||
options.IndexerBufferLength)
|
||||
}
|
||||
|
||||
// 初始化排序器通道
|
||||
engine.rankerAddDocChannels = make(
|
||||
[]chan rankerAddDocRequest, options.NumShards)
|
||||
engine.rankerRankChannels = make(
|
||||
[]chan rankerRankRequest, options.NumShards)
|
||||
engine.rankerRemoveDocChannels = make(
|
||||
[]chan rankerRemoveDocRequest, options.NumShards)
|
||||
for shard := 0; shard < options.NumShards; shard++ {
|
||||
engine.rankerAddDocChannels[shard] = make(
|
||||
chan rankerAddDocRequest,
|
||||
options.RankerBufferLength)
|
||||
engine.rankerRankChannels[shard] = make(
|
||||
chan rankerRankRequest,
|
||||
options.RankerBufferLength)
|
||||
engine.rankerRemoveDocChannels[shard] = make(
|
||||
chan rankerRemoveDocRequest,
|
||||
options.RankerBufferLength)
|
||||
}
|
||||
|
||||
// 初始化持久化存储通道
|
||||
if engine.initOptions.UsePersistentStorage {
|
||||
engine.persistentStorageIndexDocumentChannels =
|
||||
make([]chan persistentStorageIndexDocumentRequest,
|
||||
engine.initOptions.PersistentStorageShards)
|
||||
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
|
||||
engine.persistentStorageIndexDocumentChannels[shard] = make(
|
||||
chan persistentStorageIndexDocumentRequest)
|
||||
}
|
||||
engine.persistentStorageInitChannel = make(
|
||||
chan bool, engine.initOptions.PersistentStorageShards)
|
||||
}
|
||||
|
||||
// 启动分词器
|
||||
for iThread := 0; iThread < options.NumSegmenterThreads; iThread++ {
|
||||
go engine.segmenterWorker()
|
||||
}
|
||||
|
||||
// 启动索引器和排序器
|
||||
for shard := 0; shard < options.NumShards; shard++ {
|
||||
go engine.indexerAddDocumentWorker(shard)
|
||||
go engine.indexerRemoveDocWorker(shard)
|
||||
go engine.rankerAddDocWorker(shard)
|
||||
go engine.rankerRemoveDocWorker(shard)
|
||||
|
||||
for i := 0; i < options.NumIndexerThreadsPerShard; i++ {
|
||||
go engine.indexerLookupWorker(shard)
|
||||
}
|
||||
for i := 0; i < options.NumRankerThreadsPerShard; i++ {
|
||||
go engine.rankerRankWorker(shard)
|
||||
}
|
||||
}
|
||||
|
||||
// 启动持久化存储工作协程
|
||||
if engine.initOptions.UsePersistentStorage {
|
||||
err := os.MkdirAll(engine.initOptions.PersistentStorageFolder, 0700)
|
||||
if err != nil {
|
||||
log.Fatal("无法创建目录", engine.initOptions.PersistentStorageFolder)
|
||||
}
|
||||
|
||||
// 打开或者创建数据库
|
||||
engine.dbs = make([]storage.Storage, engine.initOptions.PersistentStorageShards)
|
||||
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
|
||||
dbPath := engine.initOptions.PersistentStorageFolder + "/" + PersistentStorageFilePrefix + "." + strconv.Itoa(shard)
|
||||
db, err := storage.OpenStorage(dbPath)
|
||||
if db == nil || err != nil {
|
||||
log.Fatal("无法打开数据库", dbPath, ": ", err)
|
||||
}
|
||||
engine.dbs[shard] = db
|
||||
}
|
||||
|
||||
// 从数据库中恢复
|
||||
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
|
||||
go engine.persistentStorageInitWorker(shard)
|
||||
}
|
||||
|
||||
// 等待恢复完成
|
||||
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
|
||||
<-engine.persistentStorageInitChannel
|
||||
}
|
||||
for {
|
||||
runtime.Gosched()
|
||||
if engine.numIndexingRequests == engine.numDocumentsIndexed {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// 关闭并重新打开数据库
|
||||
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
|
||||
engine.dbs[shard].Close()
|
||||
dbPath := engine.initOptions.PersistentStorageFolder + "/" + PersistentStorageFilePrefix + "." + strconv.Itoa(shard)
|
||||
db, err := storage.OpenStorage(dbPath)
|
||||
if db == nil || err != nil {
|
||||
log.Fatal("无法打开数据库", dbPath, ": ", err)
|
||||
}
|
||||
engine.dbs[shard] = db
|
||||
}
|
||||
|
||||
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
|
||||
go engine.persistentStorageIndexDocumentWorker(shard)
|
||||
}
|
||||
}
|
||||
|
||||
atomic.AddUint64(&engine.numDocumentsStored, engine.numIndexingRequests)
|
||||
}
|
||||
|
||||
// 将文档加入索引
|
||||
//
|
||||
// 输入参数:
|
||||
// docId 标识文档编号,必须唯一,docId == 0 表示非法文档(用于强制刷新索引),[1, +oo) 表示合法文档
|
||||
// data 见DocumentIndexData注释
|
||||
// forceUpdate 是否强制刷新 cache,如果设为 true,则尽快添加到索引,否则等待 cache 满之后一次全量添加
|
||||
//
|
||||
// 注意:
|
||||
// 1. 这个函数是线程安全的,请尽可能并发调用以提高索引速度
|
||||
// 2. This call is asynchronous, i.e. when it returns the document may not yet have been
//    added to the index, so calling Search immediately afterwards may not find it.
//    Call FlushIndex to force the index to be flushed.
func (engine *Engine) IndexDocument(docId uint64, data types.DocumentIndexData, forceUpdate bool) {
    engine.internalIndexDocument(docId, data, forceUpdate)

    hash := murmur.Murmur3([]byte(fmt.Sprintf("%d", docId))) % uint32(engine.initOptions.PersistentStorageShards)
    if engine.initOptions.UsePersistentStorage && docId != 0 {
        engine.persistentStorageIndexDocumentChannels[hash] <- persistentStorageIndexDocumentRequest{docId: docId, data: data}
    }
}

func (engine *Engine) internalIndexDocument(
    docId uint64, data types.DocumentIndexData, forceUpdate bool) {
    if !engine.initialized {
        log.Fatal("必须先初始化引擎")
    }

    if docId != 0 {
        atomic.AddUint64(&engine.numIndexingRequests, 1)
    }
    if forceUpdate {
        atomic.AddUint64(&engine.numForceUpdatingRequests, 1)
    }
    hash := murmur.Murmur3([]byte(fmt.Sprintf("%d%s", docId, data.Content)))
    engine.segmenterChannel <- segmenterRequest{
        docId: docId, hash: hash, data: data, forceUpdate: forceUpdate}
}

// Removes a document from the index.
//
// Arguments:
//  docId       document identifier, must be unique; docId == 0 marks an invalid document
//              (used to force-flush the index), [1, +oo) marks a valid document
//  forceUpdate whether to force-flush the cache; if true the document is removed from the
//              index as soon as possible, otherwise deletions are batched until the cache is full
//
// Notes:
// 1. This function is thread-safe; call it concurrently to speed up indexing.
// 2. This call is asynchronous, i.e. when it returns the document may not yet have been
//    removed from the index, so calling Search immediately afterwards may still find it.
//    Call FlushIndex to force the index to be flushed.
func (engine *Engine) RemoveDocument(docId uint64, forceUpdate bool) {
    if !engine.initialized {
        log.Fatal("必须先初始化引擎")
    }

    if docId != 0 {
        atomic.AddUint64(&engine.numRemovingRequests, 1)
    }
    if forceUpdate {
        atomic.AddUint64(&engine.numForceUpdatingRequests, 1)
    }
    for shard := 0; shard < engine.initOptions.NumShards; shard++ {
        engine.indexerRemoveDocChannels[shard] <- indexerRemoveDocRequest{docId: docId, forceUpdate: forceUpdate}
        if docId == 0 {
            continue
        }
        engine.rankerRemoveDocChannels[shard] <- rankerRemoveDocRequest{docId: docId}
    }

    if engine.initOptions.UsePersistentStorage && docId != 0 {
        // Remove the document from the database as well.
        hash := murmur.Murmur3([]byte(fmt.Sprintf("%d", docId))) % uint32(engine.initOptions.PersistentStorageShards)
        go engine.persistentStorageRemoveDocumentWorker(docId, hash)
    }
}

// Finds the documents matching the search request. This function is thread-safe.
func (engine *Engine) Search(request types.SearchRequest) (output types.SearchResponse) {
    if !engine.initialized {
        log.Fatal("必须先初始化引擎")
    }

    var rankOptions types.RankOptions
    if request.RankOptions == nil {
        rankOptions = *engine.initOptions.DefaultRankOptions
    } else {
        rankOptions = *request.RankOptions
    }
    if rankOptions.ScoringCriteria == nil {
        rankOptions.ScoringCriteria = engine.initOptions.DefaultRankOptions.ScoringCriteria
    }

    // Collect the query keywords.
    tokens := []string{}
    if request.Text != "" {
        querySegments := engine.segmenter.Segment([]byte(request.Text))
        for _, s := range querySegments {
            token := s.Token().Text()
            if !engine.stopTokens.IsStopToken(token) {
                tokens = append(tokens, s.Token().Text())
            }
        }
    } else {
        for _, t := range request.Tokens {
            tokens = append(tokens, t)
        }
    }

    // Create the channel on which the rankers return their results.
    rankerReturnChannel := make(
        chan rankerReturnRequest, engine.initOptions.NumShards)

    // Build the lookup request.
    lookupRequest := indexerLookupRequest{
        countDocsOnly:       request.CountDocsOnly,
        tokens:              tokens,
        labels:              request.Labels,
        docIds:              request.DocIds,
        options:             rankOptions,
        rankerReturnChannel: rankerReturnChannel,
        orderless:           request.Orderless,
    }

    // Send the lookup request to every indexer shard.
    for shard := 0; shard < engine.initOptions.NumShards; shard++ {
        engine.indexerLookupChannels[shard] <- lookupRequest
    }

    // Read the rankers' output from the channel.
    numDocs := 0
    rankOutput := types.ScoredDocuments{}
    timeout := request.Timeout
    isTimeout := false
    if timeout <= 0 {
        // No timeout.
        for shard := 0; shard < engine.initOptions.NumShards; shard++ {
            rankerOutput := <-rankerReturnChannel
            if !request.CountDocsOnly {
                for _, doc := range rankerOutput.docs {
                    rankOutput = append(rankOutput, doc)
                }
            }
            numDocs += rankerOutput.numDocs
        }
    } else {
        // With timeout.
        deadline := time.Now().Add(time.Nanosecond * time.Duration(NumNanosecondsInAMillisecond*request.Timeout))
        for shard := 0; shard < engine.initOptions.NumShards; shard++ {
            select {
            case rankerOutput := <-rankerReturnChannel:
                if !request.CountDocsOnly {
                    for _, doc := range rankerOutput.docs {
                        rankOutput = append(rankOutput, doc)
                    }
                }
                numDocs += rankerOutput.numDocs
            case <-time.After(deadline.Sub(time.Now())):
                isTimeout = true
                break
            }
        }
    }

    // Final sort.
    if !request.CountDocsOnly && !request.Orderless {
        if rankOptions.ReverseOrder {
            sort.Sort(sort.Reverse(rankOutput))
        } else {
            sort.Sort(rankOutput)
        }
    }

    // Prepare the output.
    output.Tokens = tokens
    // output.Docs is only filled when CountDocsOnly is false.
    if !request.CountDocsOnly {
        if request.Orderless {
            // In orderless mode there is no need to truncate at the offset.
            output.Docs = rankOutput
        } else {
            var start, end int
            if rankOptions.MaxOutputs == 0 {
                start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))
                end = len(rankOutput)
            } else {
                start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))
                end = utils.MinInt(start+rankOptions.MaxOutputs, len(rankOutput))
            }
            output.Docs = rankOutput[start:end]
        }
    }
    output.NumDocs = numDocs
    output.Timeout = isTimeout
    return
}

// Blocks until all pending index additions have been processed.
func (engine *Engine) FlushIndex() {
    for {
        runtime.Gosched()
        if engine.numIndexingRequests == engine.numDocumentsIndexed &&
            engine.numRemovingRequests*uint64(engine.initOptions.NumShards) == engine.numDocumentsRemoved &&
            (!engine.initOptions.UsePersistentStorage || engine.numIndexingRequests == engine.numDocumentsStored) {
            // Make sure all requests in the channels have been processed.
            break
        }
    }
    // Force an update, guaranteed to be the last request.
    engine.IndexDocument(0, types.DocumentIndexData{}, true)
    for {
        runtime.Gosched()
        if engine.numForceUpdatingRequests*uint64(engine.initOptions.NumShards) == engine.numDocumentsForceUpdated {
            return
        }
    }
}

// Shuts the engine down.
func (engine *Engine) Close() {
    engine.FlushIndex()
    if engine.initOptions.UsePersistentStorage {
        for _, db := range engine.dbs {
            db.Close()
        }
    }
}

// Maps a text hash to the shard it is assigned to.
func (engine *Engine) getShard(hash uint32) int {
    return int(hash - hash/uint32(engine.initOptions.NumShards)*uint32(engine.initOptions.NumShards))
}
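The comments on IndexDocument, RemoveDocument, Search and FlushIndex above describe an asynchronous pipeline: indexing and removal only enqueue requests, and FlushIndex blocks until they drain. A minimal usage sketch, assuming the Engine.Init method defined earlier in engine.go and placeholder dictionary/stop-word paths:

package main

import (
    "fmt"

    "github.com/huichen/wukong/engine"
    "github.com/huichen/wukong/types"
)

func main() {
    var searcher engine.Engine
    // dict.txt and stop.txt are placeholder paths, not files shipped with wukong.
    searcher.Init(types.EngineInitOptions{
        SegmenterDictionaries: "dict.txt",
        StopTokenFile:         "stop.txt",
    })
    defer searcher.Close()

    // Indexing is asynchronous; docIds start at 1 (0 is reserved for force-flush).
    searcher.IndexDocument(1, types.DocumentIndexData{Content: "此次发布包含全文搜索功能"}, false)
    searcher.IndexDocument(2, types.DocumentIndexData{Content: "悟空引擎支持中文分词"}, false)

    // Block until both documents are visible to Search.
    searcher.FlushIndex()

    response := searcher.Search(types.SearchRequest{Text: "全文搜索"})
    fmt.Println("matched", response.NumDocs, "documents, tokens:", response.Tokens)
}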
101
vendor/github.com/huichen/wukong/engine/indexer_worker.go
generated
vendored
Normal file
101
vendor/github.com/huichen/wukong/engine/indexer_worker.go
generated
vendored
Normal file
@@ -0,0 +1,101 @@
package engine

import (
    "github.com/huichen/wukong/types"
    "sync/atomic"
)

type indexerAddDocumentRequest struct {
    document    *types.DocumentIndex
    forceUpdate bool
}

type indexerLookupRequest struct {
    countDocsOnly       bool
    tokens              []string
    labels              []string
    docIds              map[uint64]bool
    options             types.RankOptions
    rankerReturnChannel chan rankerReturnRequest
    orderless           bool
}

type indexerRemoveDocRequest struct {
    docId       uint64
    forceUpdate bool
}

func (engine *Engine) indexerAddDocumentWorker(shard int) {
    for {
        request := <-engine.indexerAddDocChannels[shard]
        engine.indexers[shard].AddDocumentToCache(request.document, request.forceUpdate)
        if request.document != nil {
            atomic.AddUint64(&engine.numTokenIndexAdded,
                uint64(len(request.document.Keywords)))
            atomic.AddUint64(&engine.numDocumentsIndexed, 1)
        }
        if request.forceUpdate {
            atomic.AddUint64(&engine.numDocumentsForceUpdated, 1)
        }
    }
}

func (engine *Engine) indexerRemoveDocWorker(shard int) {
    for {
        request := <-engine.indexerRemoveDocChannels[shard]
        engine.indexers[shard].RemoveDocumentToCache(request.docId, request.forceUpdate)
        if request.docId != 0 {
            atomic.AddUint64(&engine.numDocumentsRemoved, 1)
        }
        if request.forceUpdate {
            atomic.AddUint64(&engine.numDocumentsForceUpdated, 1)
        }
    }
}

func (engine *Engine) indexerLookupWorker(shard int) {
    for {
        request := <-engine.indexerLookupChannels[shard]

        var docs []types.IndexedDocument
        var numDocs int
        if request.docIds == nil {
            docs, numDocs = engine.indexers[shard].Lookup(request.tokens, request.labels, nil, request.countDocsOnly)
        } else {
            docs, numDocs = engine.indexers[shard].Lookup(request.tokens, request.labels, request.docIds, request.countDocsOnly)
        }

        if request.countDocsOnly {
            request.rankerReturnChannel <- rankerReturnRequest{numDocs: numDocs}
            continue
        }

        if len(docs) == 0 {
            request.rankerReturnChannel <- rankerReturnRequest{}
            continue
        }

        if request.orderless {
            var outputDocs []types.ScoredDocument
            for _, d := range docs {
                outputDocs = append(outputDocs, types.ScoredDocument{
                    DocId:                 d.DocId,
                    TokenSnippetLocations: d.TokenSnippetLocations,
                    TokenLocations:        d.TokenLocations})
            }
            request.rankerReturnChannel <- rankerReturnRequest{
                docs:    outputDocs,
                numDocs: len(outputDocs),
            }
            continue
        }

        rankerRequest := rankerRankRequest{
            countDocsOnly:       request.countDocsOnly,
            docs:                docs,
            options:             request.options,
            rankerReturnChannel: request.rankerReturnChannel,
        }
        engine.rankerRankChannels[shard] <- rankerRequest
    }
}
66
vendor/github.com/huichen/wukong/engine/persistent_storage_worker.go
generated
vendored
Normal file
66
vendor/github.com/huichen/wukong/engine/persistent_storage_worker.go
generated
vendored
Normal file
@@ -0,0 +1,66 @@
package engine

import (
    "bytes"
    "encoding/binary"
    "encoding/gob"
    "github.com/huichen/wukong/types"
    "sync/atomic"
)

type persistentStorageIndexDocumentRequest struct {
    docId uint64
    data  types.DocumentIndexData
}

func (engine *Engine) persistentStorageIndexDocumentWorker(shard int) {
    for {
        request := <-engine.persistentStorageIndexDocumentChannels[shard]

        // Build the key.
        b := make([]byte, 10)
        length := binary.PutUvarint(b, request.docId)

        // Build the value.
        var buf bytes.Buffer
        enc := gob.NewEncoder(&buf)
        err := enc.Encode(request.data)
        if err != nil {
            atomic.AddUint64(&engine.numDocumentsStored, 1)
            continue
        }

        // Write the key-value pair to the database.
        engine.dbs[shard].Set(b[0:length], buf.Bytes())
        atomic.AddUint64(&engine.numDocumentsStored, 1)
    }
}

func (engine *Engine) persistentStorageRemoveDocumentWorker(docId uint64, shard uint32) {
    // Build the key.
    b := make([]byte, 10)
    length := binary.PutUvarint(b, docId)

    // Delete the key from the database.
    engine.dbs[shard].Delete(b[0:length])
}

func (engine *Engine) persistentStorageInitWorker(shard int) {
    engine.dbs[shard].ForEach(func(k, v []byte) error {
        key, value := k, v
        // Recover the docId.
        docId, _ := binary.Uvarint(key)

        // Recover the data.
        buf := bytes.NewReader(value)
        dec := gob.NewDecoder(buf)
        var data types.DocumentIndexData
        err := dec.Decode(&data)
        if err == nil {
            // Rebuild the index entry.
            engine.internalIndexDocument(docId, data, false)
        }
        return nil
    })
    engine.persistentStorageInitChannel <- true
}
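The worker above fixes the on-disk layout: the key is the docId encoded as an unsigned varint, the value is the gob encoding of DocumentIndexData, and persistentStorageInitWorker reverses both at startup. A standalone sketch of that round trip, using a stand-in struct instead of the real types package:

package main

import (
    "bytes"
    "encoding/binary"
    "encoding/gob"
    "fmt"
)

// docData stands in for types.DocumentIndexData in this sketch.
type docData struct {
    Content string
    Labels  []string
}

func main() {
    // Encode, as persistentStorageIndexDocumentWorker does.
    key := make([]byte, 10)
    n := binary.PutUvarint(key, 42)
    var value bytes.Buffer
    gob.NewEncoder(&value).Encode(docData{Content: "示例文档", Labels: []string{"示例"}})

    // Decode, as persistentStorageInitWorker does at startup.
    docId, _ := binary.Uvarint(key[:n])
    var decoded docData
    gob.NewDecoder(bytes.NewReader(value.Bytes())).Decode(&decoded)
    fmt.Println(docId, decoded.Content, decoded.Labels)
}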
52
vendor/github.com/huichen/wukong/engine/ranker_worker.go
generated
vendored
Normal file
52
vendor/github.com/huichen/wukong/engine/ranker_worker.go
generated
vendored
Normal file
@@ -0,0 +1,52 @@
package engine

import (
    "github.com/huichen/wukong/types"
)

type rankerAddDocRequest struct {
    docId  uint64
    fields interface{}
}

type rankerRankRequest struct {
    docs                []types.IndexedDocument
    options             types.RankOptions
    rankerReturnChannel chan rankerReturnRequest
    countDocsOnly       bool
}

type rankerReturnRequest struct {
    docs    types.ScoredDocuments
    numDocs int
}

type rankerRemoveDocRequest struct {
    docId uint64
}

func (engine *Engine) rankerAddDocWorker(shard int) {
    for {
        request := <-engine.rankerAddDocChannels[shard]
        engine.rankers[shard].AddDoc(request.docId, request.fields)
    }
}

func (engine *Engine) rankerRankWorker(shard int) {
    for {
        request := <-engine.rankerRankChannels[shard]
        if request.options.MaxOutputs != 0 {
            request.options.MaxOutputs += request.options.OutputOffset
        }
        request.options.OutputOffset = 0
        outputDocs, numDocs := engine.rankers[shard].Rank(request.docs, request.options, request.countDocsOnly)
        request.rankerReturnChannel <- rankerReturnRequest{docs: outputDocs, numDocs: numDocs}
    }
}

func (engine *Engine) rankerRemoveDocWorker(shard int) {
    for {
        request := <-engine.rankerRemoveDocChannels[shard]
        engine.rankers[shard].RemoveDoc(request.docId)
    }
}
97
vendor/github.com/huichen/wukong/engine/segmenter_worker.go
generated
vendored
Normal file
97
vendor/github.com/huichen/wukong/engine/segmenter_worker.go
generated
vendored
Normal file
@@ -0,0 +1,97 @@
package engine

import (
    "github.com/huichen/wukong/types"
)

type segmenterRequest struct {
    docId       uint64
    hash        uint32
    data        types.DocumentIndexData
    forceUpdate bool
}

func (engine *Engine) segmenterWorker() {
    for {
        request := <-engine.segmenterChannel
        if request.docId == 0 {
            if request.forceUpdate {
                for i := 0; i < engine.initOptions.NumShards; i++ {
                    engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
                }
            }
            continue
        }

        shard := engine.getShard(request.hash)
        tokensMap := make(map[string][]int)
        numTokens := 0
        if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" {
            // When the document body is not empty, keywords come from segmenting the content.
            segments := engine.segmenter.Segment([]byte(request.data.Content))
            for _, segment := range segments {
                token := segment.Token().Text()
                if !engine.stopTokens.IsStopToken(token) {
                    tokensMap[token] = append(tokensMap[token], segment.Start())
                }
            }
            numTokens = len(segments)
        } else {
            // Otherwise load the keywords supplied by the caller.
            for _, t := range request.data.Tokens {
                if !engine.stopTokens.IsStopToken(t.Text) {
                    tokensMap[t.Text] = t.Locations
                }
            }
            numTokens = len(request.data.Tokens)
        }

        // Add the document labels, which are not part of the segmented text.
        for _, label := range request.data.Labels {
            if !engine.initOptions.NotUsingSegmenter {
                if !engine.stopTokens.IsStopToken(label) {
                    // If the keyword already appears in the body, skipping this check
                    // would discard its location information.
                    if _, ok := tokensMap[label]; !ok {
                        tokensMap[label] = []int{}
                    }
                }
            } else {
                // If the keyword already appears in the body, skipping this check
                // would discard its location information.
                if _, ok := tokensMap[label]; !ok {
                    tokensMap[label] = []int{}
                }
            }
        }

        indexerRequest := indexerAddDocumentRequest{
            document: &types.DocumentIndex{
                DocId:       request.docId,
                TokenLength: float32(numTokens),
                Keywords:    make([]types.KeywordIndex, len(tokensMap)),
            },
            forceUpdate: request.forceUpdate,
        }
        iTokens := 0
        for k, v := range tokensMap {
            indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{
                Text: k,
                // Labels added outside segmentation have no locations, so their
                // frequency is 0 and they do not take part in the tf-idf computation.
                Frequency: float32(len(v)),
                Starts:    v}
            iTokens++
        }

        engine.indexerAddDocChannels[shard] <- indexerRequest
        if request.forceUpdate {
            for i := 0; i < engine.initOptions.NumShards; i++ {
                if i == shard {
                    continue
                }
                engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
            }
        }
        rankerRequest := rankerAddDocRequest{
            docId: request.docId, fields: request.data.Fields}
        engine.rankerAddDocChannels[shard] <- rankerRequest
    }
}
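The label loop above only inserts a label when the keyword is not already in tokensMap, so positions found in the document body are never overwritten, and label-only keywords keep an empty location list (hence a frequency of 0). A stripped-down sketch of that merge:

package main

import "fmt"

func main() {
    tokensMap := map[string][]int{
        "搜索": {0, 12}, // positions produced by the segmenter
    }
    labels := []string{"搜索", "引擎"} // document labels

    for _, label := range labels {
        if _, ok := tokensMap[label]; !ok {
            tokensMap[label] = []int{} // label-only keyword, frequency stays 0
        }
    }
    fmt.Println(tokensMap) // 搜索 keeps [0 12], 引擎 gets an empty location list
}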
40
vendor/github.com/huichen/wukong/engine/stop_tokens.go
generated
vendored
Normal file
40
vendor/github.com/huichen/wukong/engine/stop_tokens.go
generated
vendored
Normal file
@@ -0,0 +1,40 @@
package engine

import (
    "bufio"
    "log"
    "os"
)

type StopTokens struct {
    stopTokens map[string]bool
}

// Reads the stop words from stopTokenFile, one word per line.
// These stop words are skipped when the document index is built.
func (st *StopTokens) Init(stopTokenFile string) {
    st.stopTokens = make(map[string]bool)
    if stopTokenFile == "" {
        return
    }

    file, err := os.Open(stopTokenFile)
    if err != nil {
        log.Fatal(err)
    }
    defer file.Close()

    scanner := bufio.NewScanner(file)
    for scanner.Scan() {
        text := scanner.Text()
        if text != "" {
            st.stopTokens[text] = true
        }
    }
}

func (st *StopTokens) IsStopToken(token string) bool {
    _, found := st.stopTokens[token]
    return found
}
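A short usage sketch of StopTokens; stop_tokens.txt is a placeholder path for a file with one stop word per line (Init calls log.Fatal if the file cannot be opened):

package main

import (
    "fmt"

    "github.com/huichen/wukong/engine"
)

func main() {
    var st engine.StopTokens
    st.Init("stop_tokens.txt") // placeholder path
    fmt.Println(st.IsStopToken("的"), st.IsStopToken("搜索"))
}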
13
vendor/github.com/huichen/wukong/license.txt
generated
vendored
Normal file
13
vendor/github.com/huichen/wukong/license.txt
generated
vendored
Normal file
@@ -0,0 +1,13 @@
Copyright 2013 Hui Chen

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
69
vendor/github.com/huichen/wukong/storage/bolt_storage.go
generated
vendored
Normal file
69
vendor/github.com/huichen/wukong/storage/bolt_storage.go
generated
vendored
Normal file
@@ -0,0 +1,69 @@
package storage

import (
    "github.com/boltdb/bolt"
    "time"
)

var wukong_documents = []byte("wukong_documents")

type boltStorage struct {
    db *bolt.DB
}

func openBoltStorage(path string) (Storage, error) {
    db, err := bolt.Open(path, 0600, &bolt.Options{Timeout: 3600 * time.Second})
    if err != nil {
        return nil, err
    }
    err = db.Update(func(tx *bolt.Tx) error {
        _, err := tx.CreateBucketIfNotExists(wukong_documents)
        return err
    })
    if err != nil {
        db.Close()
        return nil, err
    }
    return &boltStorage{db}, nil
}

func (s *boltStorage) WALName() string {
    return s.db.Path()
}

func (s *boltStorage) Set(k []byte, v []byte) error {
    return s.db.Update(func(tx *bolt.Tx) error {
        return tx.Bucket(wukong_documents).Put(k, v)
    })
}

func (s *boltStorage) Get(k []byte) (b []byte, err error) {
    err = s.db.View(func(tx *bolt.Tx) error {
        b = tx.Bucket(wukong_documents).Get(k)
        return nil
    })
    return
}

func (s *boltStorage) Delete(k []byte) error {
    return s.db.Update(func(tx *bolt.Tx) error {
        return tx.Bucket(wukong_documents).Delete(k)
    })
}

func (s *boltStorage) ForEach(fn func(k, v []byte) error) error {
    return s.db.View(func(tx *bolt.Tx) error {
        b := tx.Bucket(wukong_documents)
        c := b.Cursor()
        for k, v := c.First(); k != nil; k, v = c.Next() {
            if err := fn(k, v); err != nil {
                return err
            }
        }
        return nil
    })
}

func (s *boltStorage) Close() error {
    return s.db.Close()
}
64
vendor/github.com/huichen/wukong/storage/kv_storage.go
generated
vendored
Normal file
64
vendor/github.com/huichen/wukong/storage/kv_storage.go
generated
vendored
Normal file
@@ -0,0 +1,64 @@
package storage

import (
    "github.com/cznic/kv"
    "io"
)

type kvStorage struct {
    db *kv.DB
}

func openKVStorage(path string) (Storage, error) {
    options := &kv.Options{}
    db, errOpen := kv.Open(path, options)
    if errOpen != nil {
        var errCreate error
        db, errCreate = kv.Create(path, options)
        if errCreate != nil {
            return &kvStorage{db}, errCreate
        }
    }
    return &kvStorage{db}, nil
}

func (s *kvStorage) WALName() string {
    return s.db.WALName()
}

func (s *kvStorage) Set(k []byte, v []byte) error {
    return s.db.Set(k, v)
}

func (s *kvStorage) Get(k []byte) ([]byte, error) {
    return s.db.Get(nil, k)
}

func (s *kvStorage) Delete(k []byte) error {
    return s.db.Delete(k)
}

func (s *kvStorage) ForEach(fn func(k, v []byte) error) error {
    iter, err := s.db.SeekFirst()
    if err == io.EOF {
        return nil
    } else if err != nil {
        return err
    }
    for {
        key, value, err := iter.Next()
        if err == io.EOF {
            break
        } else if err != nil {
            return err
        }
        if err := fn(key, value); err != nil {
            return err
        }
    }
    return nil
}

func (s *kvStorage) Close() error {
    return s.db.Close()
}
37
vendor/github.com/huichen/wukong/storage/storage.go
generated
vendored
Normal file
37
vendor/github.com/huichen/wukong/storage/storage.go
generated
vendored
Normal file
@@ -0,0 +1,37 @@
package storage

import (
    "fmt"
    "os"
)

const DEFAULT_STORAGE_ENGINE = "bolt"

var supportedStorage = map[string]func(path string) (Storage, error){
    "kv":   openKVStorage,
    "bolt": openBoltStorage,
}

func RegisterStorageEngine(name string, fn func(path string) (Storage, error)) {
    supportedStorage[name] = fn
}

type Storage interface {
    Set(k, v []byte) error
    Get(k []byte) ([]byte, error)
    Delete(k []byte) error
    ForEach(fn func(k, v []byte) error) error
    Close() error
    WALName() string
}

func OpenStorage(path string) (Storage, error) {
    wse := os.Getenv("WUKONG_STORAGE_ENGINE")
    if wse == "" {
        wse = DEFAULT_STORAGE_ENGINE
    }
    if fn, has := supportedStorage[wse]; has {
        return fn(path)
    }
    return nil, fmt.Errorf("unsupported storage engine %v", wse)
}
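OpenStorage picks the engine named by the WUKONG_STORAGE_ENGINE environment variable, falling back to "bolt", and every backend satisfies the same Storage interface. A hedged usage sketch with a placeholder database path:

package main

import (
    "fmt"
    "log"
    "os"

    "github.com/huichen/wukong/storage"
)

func main() {
    // Select the kv backend instead of the default "bolt"; the path is a placeholder.
    os.Setenv("WUKONG_STORAGE_ENGINE", "kv")
    db, err := storage.OpenStorage("/tmp/wukong.test")
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    db.Set([]byte("k"), []byte("v"))
    db.ForEach(func(k, v []byte) error {
        fmt.Printf("%s=%s\n", k, v)
        return nil
    })
}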
27
vendor/github.com/huichen/wukong/types/document_index_data.go
generated
vendored
Normal file
27
vendor/github.com/huichen/wukong/types/document_index_data.go
generated
vendored
Normal file
@@ -0,0 +1,27 @@
package types

type DocumentIndexData struct {
    // The full document text (must be UTF-8); the keywords to be indexed are generated from it.
    Content string

    // The document's keywords.
    // When Content is not empty, keywords are derived from Content by segmentation.
    // Tokens exists so that wukong's built-in segmenter can be bypassed and
    // segmentation and preprocessing can be done outside the engine.
    Tokens []TokenData

    // Document labels (must be UTF-8), e.g. the document's category. Labels do not
    // appear in the document text itself.
    Labels []string

    // The document's scoring fields; any struct type is accepted.
    Fields interface{}
}

// One keyword of a document.
type TokenData struct {
    // The keyword string.
    Text string

    // Byte positions in the document at which the keyword starts.
    Locations []int
}
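A document can be fed to the engine either as raw Content, which the engine segments itself, or as pre-segmented Tokens, with Labels as extra search keys in both cases. A small sketch of the two variants:

package main

import "github.com/huichen/wukong/types"

func main() {
    // Let the engine segment the content itself.
    byContent := types.DocumentIndexData{
        Content: "悟空是一个全文搜索引擎",
        Labels:  []string{"搜索"},
    }

    // Bypass the built-in segmenter with externally produced tokens.
    byTokens := types.DocumentIndexData{
        Tokens: []types.TokenData{
            {Text: "全文", Locations: []int{0}},
            {Text: "搜索", Locations: []int{6}},
        },
    }
    _, _ = byContent, byTokens
}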
126
vendor/github.com/huichen/wukong/types/engine_init_options.go
generated
vendored
Normal file
126
vendor/github.com/huichen/wukong/types/engine_init_options.go
generated
vendored
Normal file
@@ -0,0 +1,126 @@
package types

import (
    "log"
    "runtime"
)

var (
    // Default values for EngineInitOptions.
    defaultNumSegmenterThreads       = runtime.NumCPU()
    defaultNumShards                 = 2
    defaultIndexerBufferLength       = runtime.NumCPU()
    defaultNumIndexerThreadsPerShard = runtime.NumCPU()
    defaultRankerBufferLength        = runtime.NumCPU()
    defaultNumRankerThreadsPerShard  = runtime.NumCPU()
    defaultDefaultRankOptions        = RankOptions{
        ScoringCriteria: RankByBM25{},
    }
    defaultIndexerInitOptions = IndexerInitOptions{
        IndexType:      FrequenciesIndex,
        BM25Parameters: &defaultBM25Parameters,
    }
    defaultBM25Parameters = BM25Parameters{
        K1: 2.0,
        B:  0.75,
    }
    defaultPersistentStorageShards = 8
)

type EngineInitOptions struct {
    // Whether to use the built-in segmenter.
    // The segmenter is used by default; set this to true if you do not need
    // segmentation inside the engine, in which case SegmenterDictionaries and
    // StopTokenFile are skipped during startup.
    // Note: without the segmenter, the Content field of DocumentIndexData is
    // ignored when IndexDocument is called.
    NotUsingSegmenter bool

    // Comma-separated dictionary files; see the comment on
    // sego.Segmenter.LoadDictionary for details.
    SegmenterDictionaries string

    // Stop word file.
    StopTokenFile string

    // Number of segmenter threads.
    NumSegmenterThreads int

    // Number of indexer and ranker shards.
    // Documents are distributed evenly across the shards for lookup and ranking.
    NumShards int

    // Channel buffer length of the indexers.
    IndexerBufferLength int

    // Number of threads per indexer shard.
    NumIndexerThreadsPerShard int

    // Channel buffer length of the rankers.
    RankerBufferLength int

    // Number of threads per ranker shard.
    NumRankerThreadsPerShard int

    // Indexer initialization options.
    IndexerInitOptions *IndexerInitOptions

    // Default search options.
    DefaultRankOptions *RankOptions

    // Whether to use persistent storage, plus the directory holding the database
    // files and the number of storage shards.
    UsePersistentStorage    bool
    PersistentStorageFolder string
    PersistentStorageShards int
}

// Initializes EngineInitOptions, replacing unset options with their default values.
func (options *EngineInitOptions) Init() {
    if !options.NotUsingSegmenter {
        if options.SegmenterDictionaries == "" {
            log.Fatal("字典文件不能为空")
        }
    }

    if options.NumSegmenterThreads == 0 {
        options.NumSegmenterThreads = defaultNumSegmenterThreads
    }

    if options.NumShards == 0 {
        options.NumShards = defaultNumShards
    }

    if options.IndexerBufferLength == 0 {
        options.IndexerBufferLength = defaultIndexerBufferLength
    }

    if options.NumIndexerThreadsPerShard == 0 {
        options.NumIndexerThreadsPerShard = defaultNumIndexerThreadsPerShard
    }

    if options.RankerBufferLength == 0 {
        options.RankerBufferLength = defaultRankerBufferLength
    }

    if options.NumRankerThreadsPerShard == 0 {
        options.NumRankerThreadsPerShard = defaultNumRankerThreadsPerShard
    }

    if options.IndexerInitOptions == nil {
        options.IndexerInitOptions = &defaultIndexerInitOptions
    }

    if options.IndexerInitOptions.BM25Parameters == nil {
        options.IndexerInitOptions.BM25Parameters = &defaultBM25Parameters
    }

    if options.DefaultRankOptions == nil {
        options.DefaultRankOptions = &defaultDefaultRankOptions
    }

    if options.DefaultRankOptions.ScoringCriteria == nil {
        options.DefaultRankOptions.ScoringCriteria = defaultDefaultRankOptions.ScoringCriteria
    }

    if options.PersistentStorageShards == 0 {
        options.PersistentStorageShards = defaultPersistentStorageShards
    }
}
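Init only fills in fields that were left at their zero value, so a caller typically sets the dictionary (and whatever else it cares about) and lets the defaults cover the rest. A sketch, with a placeholder dictionary path:

package main

import (
    "fmt"

    "github.com/huichen/wukong/types"
)

func main() {
    // Only the dictionary is set; Init fills in the documented defaults
    // (NumShards=2, PersistentStorageShards=8, BM25 K1=2.0/B=0.75, ...).
    options := types.EngineInitOptions{SegmenterDictionaries: "dict.txt"} // placeholder path
    options.Init()
    fmt.Println(options.NumShards, options.PersistentStorageShards, options.IndexerInitOptions.BM25Parameters.K1)
}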
70
vendor/github.com/huichen/wukong/types/index.go
generated
vendored
Normal file
70
vendor/github.com/huichen/wukong/types/index.go
generated
vendored
Normal file
@@ -0,0 +1,70 @@
package types

type DocumentIndex struct {
    // The document's DocId.
    DocId uint64

    // The document's keyword length.
    TokenLength float32

    // The index keys to add.
    Keywords []KeywordIndex
}

// One entry of the inverted index; it effectively marks a (search key, document) pair.
type KeywordIndex struct {
    // UTF-8 text of the search key.
    Text string

    // Frequency of the search key.
    Frequency float32

    // Starting byte positions of the search key in the document, in ascending order.
    Starts []int
}

// Result returned by the indexer.
type IndexedDocument struct {
    DocId uint64

    // BM25 score; only valid when the index type is FrequenciesIndex or LocationsIndex.
    BM25 float32

    // Proximity of the keywords in the document; see the comment on
    // computeTokenProximity for its meaning.
    // Only valid when the index type is LocationsIndex.
    TokenProximity int32

    // Keyword positions derived from the proximity computation; the slice has the
    // same length as the tokens passed to Lookup and corresponds to them one by one.
    // Only valid when the index type is LocationsIndex.
    TokenSnippetLocations []int

    // Exact positions of the keywords in the text.
    // Only valid when the index type is LocationsIndex.
    TokenLocations [][]int
}

// Convenience type for adding document indices in batches.
type DocumentsIndex []*DocumentIndex

func (docs DocumentsIndex) Len() int {
    return len(docs)
}
func (docs DocumentsIndex) Swap(i, j int) {
    docs[i], docs[j] = docs[j], docs[i]
}
func (docs DocumentsIndex) Less(i, j int) bool {
    return docs[i].DocId < docs[j].DocId
}

// Convenience type for removing document indices in batches.
type DocumentsId []uint64

func (docs DocumentsId) Len() int {
    return len(docs)
}
func (docs DocumentsId) Swap(i, j int) {
    docs[i], docs[j] = docs[j], docs[i]
}
func (docs DocumentsId) Less(i, j int) bool {
    return docs[i] < docs[j]
}
42
vendor/github.com/huichen/wukong/types/indexer_init_options.go
generated
vendored
Normal file
42
vendor/github.com/huichen/wukong/types/indexer_init_options.go
generated
vendored
Normal file
@@ -0,0 +1,42 @@
package types

// These constants define what the inverted index table stores.
const (
    // Only the docIds of the documents are stored.
    DocIdsIndex = 0

    // Keyword frequencies are stored, which are needed to compute BM25.
    FrequenciesIndex = 1

    // The exact byte positions (possibly several) at which a keyword appears in
    // the document are stored. LocationsIndex is required if you want keyword
    // proximity data.
    LocationsIndex = 2

    // Default cache size for documents waiting to be inserted into the index table.
    defaultDocCacheSize = 300000
)

// Indexer initialization options.
type IndexerInitOptions struct {
    // The index table type, see the constants above.
    IndexType int

    // Cache size for documents waiting to be inserted into the index table.
    DocCacheSize int

    // BM25 parameters.
    BM25Parameters *BM25Parameters
}

// See http://en.wikipedia.org/wiki/Okapi_BM25
// Default values are in engine_init_options.go
type BM25Parameters struct {
    K1 float32
    B  float32
}

func (options *IndexerInitOptions) Init() {
    if options.DocCacheSize == 0 {
        options.DocCacheSize = defaultDocCacheSize
    }
}
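K1 and B are the two free parameters of Okapi BM25 referenced above: K1 controls how quickly repeated term occurrences saturate, B controls document-length normalization. A sketch of the standard single-term formula (the engine's own indexer may differ in details):

package main

import (
    "fmt"
    "math"
)

// bm25 computes IDF(q) * f*(K1+1) / (f + K1*(1 - B + B*docLen/avgDocLen)),
// where f is the term frequency in the document, docLen the document length
// and avgDocLen the average document length in the corpus.
func bm25(idf, tf, docLen, avgDocLen, k1, b float64) float64 {
    return idf * tf * (k1 + 1) / (tf + k1*(1-b+b*docLen/avgDocLen))
}

func main() {
    idf := math.Log(1000.0 / 42.0) // toy corpus: 1000 docs, 42 contain the term
    fmt.Printf("%.3f\n", bm25(idf, 3, 120, 100, 2.0, 0.75))
}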
17
vendor/github.com/huichen/wukong/types/scoring_criteria.go
generated
vendored
Normal file
17
vendor/github.com/huichen/wukong/types/scoring_criteria.go
generated
vendored
Normal file
@@ -0,0 +1,17 @@
package types

// Common interface for scoring rules.
type ScoringCriteria interface {
    // Scores a document. When documents are sorted, the first score is compared
    // first; if the scores are equal the second score is compared, and so on.
    // Returning an empty slice removes the document from the final result.
    Score(doc IndexedDocument, fields interface{}) []float32
}

// A simple scoring rule: the document score is its BM25 value.
type RankByBM25 struct {
}

func (rule RankByBM25) Score(doc IndexedDocument, fields interface{}) []float32 {
    return []float32{doc.BM25}
}
45
vendor/github.com/huichen/wukong/types/search_request.go
generated
vendored
Normal file
45
vendor/github.com/huichen/wukong/types/search_request.go
generated
vendored
Normal file
@@ -0,0 +1,45 @@
package types

type SearchRequest struct {
    // The search phrase (must be UTF-8); it will be segmented.
    // When it is the empty string, the keywords are read from Tokens below.
    Text string

    // The keywords (must be UTF-8); Text takes precedence when it is not empty.
    // Normally you do not need to supply keywords yourself unless you run your
    // own segmenter.
    Tokens []string

    // Document labels (must be UTF-8). Labels do not appear in the document text
    // but are still a kind of search key.
    Labels []string

    // When not nil, the search only considers the keys contained in DocIds
    // (the values are ignored).
    DocIds map[uint64]bool

    // Ranking options.
    RankOptions *RankOptions

    // Timeout in milliseconds. No timeout is set when the value is <= 0.
    // When the search times out, partial ranked results may still be returned.
    Timeout int

    // When true, only the number of matching documents is counted and no
    // documents are returned.
    CountDocsOnly bool

    // Do not sort; useful when sorting is done outside the engine (e.g. in the
    // client). Enabling it saves noticeable time when many documents are returned.
    Orderless bool
}

type RankOptions struct {
    // Scoring rule for documents; when nil, the rule set at Engine initialization is used.
    ScoringCriteria ScoringCriteria

    // By default (ReverseOrder == false) documents are sorted by descending score,
    // otherwise by ascending score.
    ReverseOrder bool

    // Index of the first result to output.
    OutputOffset int

    // Maximum number of results to output; 0 means no limit.
    MaxOutputs int
}
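OutputOffset and MaxOutputs give simple result paging, and Timeout bounds the search in milliseconds. A sketch of a request for the second page of ten results:

package main

import "github.com/huichen/wukong/types"

func main() {
    // Second page of ten results, with a 100 ms search timeout.
    request := types.SearchRequest{
        Text:    "全文搜索",
        Timeout: 100,
        RankOptions: &types.RankOptions{
            OutputOffset: 10,
            MaxOutputs:   10,
        },
    }
    _ = request
}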
57
vendor/github.com/huichen/wukong/types/search_response.go
generated
vendored
Normal file
57
vendor/github.com/huichen/wukong/types/search_response.go
generated
vendored
Normal file
@@ -0,0 +1,57 @@
package types

import (
    "github.com/huichen/wukong/utils"
)

type SearchResponse struct {
    // The keywords used in the search.
    Tokens []string

    // The documents found, already sorted.
    Docs []ScoredDocument

    // Whether the search timed out; partial results may still be returned on timeout.
    Timeout bool

    // The number of documents found. Note that this counts all matching documents,
    // which may be larger than the number of documents returned.
    NumDocs int
}

type ScoredDocument struct {
    DocId uint64

    // The document's scores.
    // Results are sorted by Scores: first by the first value, then by the second
    // value if the first values are equal, and so on.
    Scores []float32

    // Byte positions in the text of the keywords used for snippet generation; the
    // slice has the same length as SearchResponse.Tokens.
    // Only non-empty when IndexType == LocationsIndex.
    TokenSnippetLocations []int

    // Positions at which the keywords appear.
    // Only non-empty when IndexType == LocationsIndex.
    TokenLocations [][]int
}

// For convenient sorting.

type ScoredDocuments []ScoredDocument

func (docs ScoredDocuments) Len() int {
    return len(docs)
}
func (docs ScoredDocuments) Swap(i, j int) {
    docs[i], docs[j] = docs[j], docs[i]
}
func (docs ScoredDocuments) Less(i, j int) bool {
    // To sort from large to small, this actually implements "More".
    for iScore := 0; iScore < utils.MinInt(len(docs[i].Scores), len(docs[j].Scores)); iScore++ {
        if docs[i].Scores[iScore] > docs[j].Scores[iScore] {
            return true
        } else if docs[i].Scores[iScore] < docs[j].Scores[iScore] {
            return false
        }
    }
    return len(docs[i].Scores) > len(docs[j].Scores)
}
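Because Less is written as "more", sorting ScoredDocuments with the standard library yields descending scores, and a longer score vector wins ties. A small sketch:

package main

import (
    "fmt"
    "sort"

    "github.com/huichen/wukong/types"
)

func main() {
    docs := types.ScoredDocuments{
        {DocId: 1, Scores: []float32{0.4}},
        {DocId: 2, Scores: []float32{0.9}},
        {DocId: 3, Scores: []float32{0.9, 0.1}}, // ties with DocId 2 on the first score
    }
    // Less is "More", so a plain sort.Sort gives descending order: 3, 2, 1.
    sort.Sort(docs)
    for _, d := range docs {
        fmt.Println(d.DocId, d.Scores)
    }
}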
13
vendor/github.com/huichen/wukong/utils/test_utils.go
generated
vendored
Normal file
13
vendor/github.com/huichen/wukong/utils/test_utils.go
generated
vendored
Normal file
@@ -0,0 +1,13 @@
package utils

import (
    "fmt"
    "testing"
)

func Expect(t *testing.T, expect string, actual interface{}) {
    actualString := fmt.Sprint(actual)
    if expect != actualString {
        t.Errorf("期待值=\"%s\", 实际=\"%s\"", expect, actualString)
    }
}
15
vendor/github.com/huichen/wukong/utils/utils.go
generated
vendored
Normal file
15
vendor/github.com/huichen/wukong/utils/utils.go
generated
vendored
Normal file
@@ -0,0 +1,15 @@
package utils

func AbsInt(a int) int {
    if a < 0 {
        return -a
    }
    return a
}

func MinInt(a, b int) int {
    if a < b {
        return a
    }
    return b
}