Set up the framework

Author: Minho
Date: 2017-04-21 18:20:35 +08:00
parent d58087f723
commit 67486f0866
727 changed files with 831224 additions and 37 deletions

vendor/github.com/huichen/murmur/README.md generated vendored Normal file

@@ -0,0 +1,8 @@
murmur
======
Go Murmur3 hash implementation, based on http://en.wikipedia.org/wiki/MurmurHash.
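
A minimal usage sketch (the input string is arbitrary; the package-level Seed defaults to 1):

```go
package main

import (
	"fmt"

	"github.com/huichen/murmur"
)

func main() {
	// Murmur3 returns a 32-bit hash of the given byte slice.
	fmt.Println(murmur.Murmur3([]byte("hello world")))
}
```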

vendor/github.com/huichen/murmur/license.txt generated vendored Normal file

@@ -0,0 +1,13 @@
Copyright 2013 Hui Chen
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

vendor/github.com/huichen/murmur/murmur.go generated vendored Normal file

@@ -0,0 +1,58 @@
// Murmur3 32bit hash function based on
// http://en.wikipedia.org/wiki/MurmurHash
package murmur
const (
c1 = 0xcc9e2d51
c2 = 0x1b873593
c3 = 0x85ebca6b
c4 = 0xc2b2ae35
r1 = 15
r2 = 13
m = 5
n = 0xe6546b64
)
var (
Seed = uint32(1)
)
func Murmur3(key []byte) (hash uint32) {
hash = Seed
iByte := 0
for ; iByte+4 <= len(key); iByte += 4 {
k := uint32(key[iByte]) | uint32(key[iByte+1])<<8 | uint32(key[iByte+2])<<16 | uint32(key[iByte+3])<<24
k *= c1
k = (k << r1) | (k >> (32 - r1))
k *= c2
hash ^= k
hash = (hash << r2) | (hash >> (32 - r2))
hash = hash*m + n
}
var remainingBytes uint32
switch len(key) - iByte {
case 3:
remainingBytes += uint32(key[iByte+2]) << 16
fallthrough
case 2:
remainingBytes += uint32(key[iByte+1]) << 8
fallthrough
case 1:
remainingBytes += uint32(key[iByte])
remainingBytes *= c1
remainingBytes = (remainingBytes << r1) | (remainingBytes >> (32 - r1))
remainingBytes = remainingBytes * c2
hash ^= remainingBytes
}
hash ^= uint32(len(key))
hash ^= hash >> 16
hash *= c3
hash ^= hash >> 13
hash *= c4
hash ^= hash >> 16
// Off we go!
return
}

vendor/github.com/huichen/sego/README.md generated vendored Normal file

@@ -0,0 +1,43 @@
sego
====
Chinese word segmentation in Go.
The <a href="https://github.com/huichen/sego/blob/master/dictionary.go">dictionary</a> is implemented as a double-array trie; the <a href="https://github.com/huichen/sego/blob/master/segmenter.go">segmenter</a> uses a frequency-based shortest-path algorithm with dynamic programming.
It supports both normal and search-engine segmentation modes, user dictionaries, and part-of-speech tagging, and can run as a <a href="https://github.com/huichen/sego/blob/master/server/server.go">JSON RPC service</a>.
Segmentation speed is 9MB/s <a href="https://github.com/huichen/sego/blob/master/tools/benchmark.go">single-threaded</a> and 42MB/s with <a href="https://github.com/huichen/sego/blob/master/tools/goroutines.go">concurrent goroutines</a> (8-core MacBook Pro).
# Install / update
```
go get -u github.com/huichen/sego
```
# Usage
```go
package main
import (
"fmt"
"github.com/huichen/sego"
)
func main() {
// Load the dictionary
var segmenter sego.Segmenter
segmenter.LoadDictionary("github.com/huichen/sego/data/dictionary.txt")
// Segment the text
text := []byte("中华人民共和国中央人民政府")
segments := segmenter.Segment(text)
// Handle the segmentation result
// Both normal and search modes are supported; see the comments on the SegmentsToString function in the code.
fmt.Println(sego.SegmentsToString(segments, false))
}
```
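
For the finer-grained search-engine mode, the same segments can be printed with the second argument set to true (a small sketch reusing the `segments` variable from the example above):

```go
// Search-engine mode: also emits the sub-segments of each token.
fmt.Println(sego.SegmentsToString(segments, true))
```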

vendor/github.com/huichen/sego/dictionary.go generated vendored Normal file

@@ -0,0 +1,65 @@
package sego
import "github.com/adamzy/cedar-go"
// Dictionary结构体实现了一个字串前缀树，一个分词可能出现在叶子节点，也有可能出现在非叶节点
type Dictionary struct {
trie *cedar.Cedar // Cedar 前缀树
maxTokenLength int // 词典中最长的分词
tokens []Token // 词典中所有的分词,方便遍历
totalFrequency int64 // 词典中所有分词的频率之和
}
func NewDictionary() *Dictionary {
return &Dictionary{trie: cedar.New()}
}
// 词典中最长的分词
func (dict *Dictionary) MaxTokenLength() int {
return dict.maxTokenLength
}
// 词典中分词数目
func (dict *Dictionary) NumTokens() int {
return len(dict.tokens)
}
// 词典中所有分词的频率之和
func (dict *Dictionary) TotalFrequency() int64 {
return dict.totalFrequency
}
// 向词典中加入一个分词
func (dict *Dictionary) addToken(token Token) {
bytes := textSliceToBytes(token.text)
_, err := dict.trie.Get(bytes)
if err == nil {
return
}
dict.trie.Insert(bytes, dict.NumTokens())
dict.tokens = append(dict.tokens, token)
dict.totalFrequency += int64(token.frequency)
if len(token.text) > dict.maxTokenLength {
dict.maxTokenLength = len(token.text)
}
}
// 在词典中查找和字元组words可以前缀匹配的所有分词
// 返回值为找到的分词数
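//
// A hypothetical illustration: if the dictionary contains both "中华" and
// "中华人民共和国", then calling lookupTokens with words spelling out
// "中华人民共和国" fills tokens with both entries, since each one is a
// prefix match starting at the first word.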
func (dict *Dictionary) lookupTokens(words []Text, tokens []*Token) (numOfTokens int) {
var id, value int
var err error
for _, word := range words {
id, err = dict.trie.Jump(word, id)
if err != nil {
break
}
value, err = dict.trie.Value(id)
if err == nil {
tokens[numOfTokens] = &dict.tokens[value]
numOfTokens++
}
}
return
}

vendor/github.com/huichen/sego/license.txt generated vendored Normal file

@@ -0,0 +1,13 @@
Copyright 2013 Hui Chen
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

vendor/github.com/huichen/sego/segment.go generated vendored Normal file

@@ -0,0 +1,28 @@
package sego
// 文本中的一个分词
type Segment struct {
// 分词在文本中的起始字节位置
start int
// 分词在文本中的结束字节位置(不包括该位置)
end int
// 分词信息
token *Token
}
// 返回分词在文本中的起始字节位置
func (s *Segment) Start() int {
return s.start
}
// 返回分词在文本中的结束字节位置(不包括该位置)
func (s *Segment) End() int {
return s.end
}
// 返回分词信息
func (s *Segment) Token() *Token {
return s.token
}

vendor/github.com/huichen/sego/segmenter.go generated vendored Normal file

@@ -0,0 +1,295 @@
//Go中文分词
package sego
import (
"bufio"
"fmt"
"log"
"math"
"os"
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
const (
minTokenFrequency = 2 // 仅从字典文件中读取大于等于此频率的分词
)
// 分词器结构体
type Segmenter struct {
dict *Dictionary
}
// 该结构体用于记录Viterbi算法中某字元处的向前分词跳转信息
type jumper struct {
minDistance float32
token *Token
}
// 返回分词器使用的词典
func (seg *Segmenter) Dictionary() *Dictionary {
return seg.dict
}
// 从文件中载入词典
//
// 可以载入多个词典文件,文件名用","分隔,排在前面的词典优先载入分词,比如
// "用户词典.txt,通用词典.txt"
// 当一个分词既出现在用户词典也出现在通用词典中,则优先使用用户词典。
//
// 词典的格式为(每个分词一行):
// 分词文本 频率 词性
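//
// A hypothetical example line (fields separated by whitespace):
// 中华人民共和国 3000 ns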
func (seg *Segmenter) LoadDictionary(files string) {
seg.dict = NewDictionary()
for _, file := range strings.Split(files, ",") {
log.Printf("载入sego词典 %s", file)
dictFile, err := os.Open(file)
if err != nil {
log.Fatalf("无法载入字典文件 \"%s\" \n", file)
}
defer dictFile.Close()
reader := bufio.NewReader(dictFile)
var text string
var freqText string
var frequency int
var pos string
// 逐行读入分词
for {
size, _ := fmt.Fscanln(reader, &text, &freqText, &pos)
if size == 0 {
// 文件结束
break
} else if size < 2 {
// 无效行
continue
} else if size == 2 {
// 没有词性标注时设为空字符串
pos = ""
}
// 解析词频
var err error
frequency, err = strconv.Atoi(freqText)
if err != nil {
continue
}
// 过滤频率太小的词
if frequency < minTokenFrequency {
continue
}
// 将分词添加到字典中
words := splitTextToWords([]byte(text))
token := Token{text: words, frequency: frequency, pos: pos}
seg.dict.addToken(token)
}
}
// 计算每个分词的路径值路径值含义见Token结构体的注释
logTotalFrequency := float32(math.Log2(float64(seg.dict.totalFrequency)))
for i := range seg.dict.tokens {
token := &seg.dict.tokens[i]
token.distance = logTotalFrequency - float32(math.Log2(float64(token.frequency)))
}
// 对每个分词进行细致划分，用于搜索引擎模式，该模式用法见Token结构体的注释。
for i := range seg.dict.tokens {
token := &seg.dict.tokens[i]
segments := seg.segmentWords(token.text, true)
// 计算需要添加的子分词数目
numTokensToAdd := 0
for iToken := 0; iToken < len(segments); iToken++ {
if len(segments[iToken].token.text) > 1 {
// 略去字元长度为一的分词
// TODO: 这值得进一步推敲,特别是当字典中有英文复合词的时候
numTokensToAdd++
}
}
token.segments = make([]*Segment, numTokensToAdd)
// 添加子分词
iSegmentsToAdd := 0
for iToken := 0; iToken < len(segments); iToken++ {
if len(segments[iToken].token.text) > 1 {
token.segments[iSegmentsToAdd] = &segments[iToken]
iSegmentsToAdd++
}
}
}
log.Println("sego词典载入完毕")
}
// 对文本分词
//
// 输入参数:
// bytes UTF8文本的字节数组
//
// 输出:
// []Segment 划分的分词
func (seg *Segmenter) Segment(bytes []byte) []Segment {
return seg.internalSegment(bytes, false)
}
func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
// 处理特殊情况
if len(bytes) == 0 {
return []Segment{}
}
// 划分字元
text := splitTextToWords(bytes)
return seg.segmentWords(text, searchMode)
}
func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
// 搜索模式下该分词已无继续划分可能的情况
if searchMode && len(text) == 1 {
return []Segment{}
}
// jumpers定义了每个字元处的向前跳转信息，包括这个跳转对应的分词
// 以及从文本段开始到该字元的最短路径值
jumpers := make([]jumper, len(text))
tokens := make([]*Token, seg.dict.maxTokenLength)
for current := 0; current < len(text); current++ {
// 找到前一个字元处的最短路径,以便计算后续路径值
var baseDistance float32
if current == 0 {
// 当本字元在文本首部时,基础距离应该是零
baseDistance = 0
} else {
baseDistance = jumpers[current-1].minDistance
}
// 寻找所有以当前字元开头的分词
numTokens := seg.dict.lookupTokens(
text[current:minInt(current+seg.dict.maxTokenLength, len(text))], tokens)
// 对所有可能的分词,更新分词结束字元处的跳转信息
for iToken := 0; iToken < numTokens; iToken++ {
location := current + len(tokens[iToken].text) - 1
if !searchMode || current != 0 || location != len(text)-1 {
updateJumper(&jumpers[location], baseDistance, tokens[iToken])
}
}
// 当前字元没有对应分词时，补加一个伪分词
if numTokens == 0 || len(tokens[0].text) > 1 {
updateJumper(&jumpers[current], baseDistance,
&Token{text: []Text{text[current]}, frequency: 1, distance: 32, pos: "x"})
}
}
// 从后向前扫描第一遍得到需要添加的分词数目
numSeg := 0
for index := len(text) - 1; index >= 0; {
location := index - len(jumpers[index].token.text) + 1
numSeg++
index = location - 1
}
// 从后向前扫描第二遍添加分词到最终结果
outputSegments := make([]Segment, numSeg)
for index := len(text) - 1; index >= 0; {
location := index - len(jumpers[index].token.text) + 1
numSeg--
outputSegments[numSeg].token = jumpers[index].token
index = location - 1
}
// 计算各个分词的字节位置
bytePosition := 0
for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
outputSegments[iSeg].start = bytePosition
bytePosition += textSliceByteLength(outputSegments[iSeg].token.text)
outputSegments[iSeg].end = bytePosition
}
return outputSegments
}
// 更新跳转信息:
// 1. 当该位置从未被访问过时(jumper.minDistance为零的情况),或者
// 2. 当该位置的当前最短路径大于新的最短路径时
// 将当前位置的最短路径值更新为baseDistance加上新分词的概率
func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
newDistance := baseDistance + token.distance
if jumper.minDistance == 0 || jumper.minDistance > newDistance {
jumper.minDistance = newDistance
jumper.token = token
}
}
// 取两整数较小值
func minInt(a, b int) int {
if a > b {
return b
}
return a
}
// 取两整数较大值
func maxInt(a, b int) int {
if a > b {
return a
}
return b
}
// 将文本划分成字元
func splitTextToWords(text Text) []Text {
output := make([]Text, 0, len(text)/3)
current := 0
inAlphanumeric := true
alphanumericStart := 0
for current < len(text) {
r, size := utf8.DecodeRune(text[current:])
if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) {
// 当前是拉丁字母或数字(非中日韩文字)
if !inAlphanumeric {
alphanumericStart = current
inAlphanumeric = true
}
} else {
if inAlphanumeric {
inAlphanumeric = false
if current != 0 {
output = append(output, toLower(text[alphanumericStart:current]))
}
}
output = append(output, text[current:current+size])
}
current += size
}
// 处理最后一个字元是英文的情况
if inAlphanumeric {
if current != 0 {
output = append(output, toLower(text[alphanumericStart:current]))
}
}
return output
}
// 将英文词转化为小写
func toLower(text []byte) []byte {
output := make([]byte, len(text))
for i, t := range text {
if t >= 'A' && t <= 'Z' {
output[i] = t - 'A' + 'a'
} else {
output[i] = t
}
}
return output
}

vendor/github.com/huichen/sego/test_utils.go generated vendored Normal file

@@ -0,0 +1,38 @@
package sego
import (
"fmt"
"testing"
)
func expect(t *testing.T, expect string, actual interface{}) {
actualString := fmt.Sprint(actual)
if expect != actualString {
t.Errorf("期待值=\"%s\", 实际=\"%s\"", expect, actualString)
}
}
func printTokens(tokens []*Token, numTokens int) (output string) {
for iToken := 0; iToken < numTokens; iToken++ {
for _, word := range tokens[iToken].text {
output += fmt.Sprint(string(word))
}
output += " "
}
return
}
func toWords(strings ...string) []Text {
words := []Text{}
for _, s := range strings {
words = append(words, []byte(s))
}
return words
}
func bytesToString(bytes []Text) (output string) {
for _, b := range bytes {
output += (string(b) + "/")
}
return
}

vendor/github.com/huichen/sego/token.go generated vendored Normal file

@@ -0,0 +1,50 @@
package sego
// 字串类型,可以用来表达
// 1. 一个字元,比如"中"又如"国", 英文的一个字元是一个词
// 2. 一个分词,比如"中国"又如"人口"
// 3. 一段文字,比如"中国有十三亿人口"
type Text []byte
// 一个分词
type Token struct {
// 分词的字串,这实际上是个字元数组
text []Text
// 分词在语料库中的词频
frequency int
// log2(总词频/该分词词频)这相当于log2(1/p(分词)),用作动态规划中
// 该分词的路径长度。求解prod(p(分词))的最大值相当于求解
// sum(distance(分词))的最小值,这就是“最短路径”的来历。
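// As an illustrative example: if the corpus total frequency is 2^20 and this
// token's frequency is 2^10, then distance = 20 - 10 = 10 (numbers made up for clarity).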
distance float32
// 词性标注
pos string
// 该分词文本的进一步分词划分，见Segments函数注释。
segments []*Segment
}
// 返回分词文本
func (token *Token) Text() string {
return textSliceToString(token.text)
}
// 返回分词在语料库中的词频
func (token *Token) Frequency() int {
return token.frequency
}
// 返回分词词性标注
func (token *Token) Pos() string {
return token.pos
}
// 该分词文本的进一步分词划分,比如"中华人民共和国中央人民政府"这个分词
// 有两个子分词"中华人民共和国"和"中央人民政府"。子分词也可以进一步有子分词
// 形成一个树结构,遍历这个树就可以得到该分词的所有细致分词划分,这主要
// 用于搜索引擎对一段文本进行全文搜索。
func (token *Token) Segments() []*Segment {
return token.segments
}

vendor/github.com/huichen/sego/utils.go generated vendored Normal file

@@ -0,0 +1,93 @@
package sego
import (
"bytes"
"fmt"
)
// 输出分词结果为字符串
//
// 有两种输出模式,以"中华人民共和国"为例
//
// 普通模式（searchMode=false）输出一个分词"中华人民共和国/ns "
// 搜索模式（searchMode=true）输出普通模式的再细致切分：
// "中华/nz 人民/n 共和/nz 共和国/ns 人民共和国/nt 中华人民共和国/ns "
//
// 搜索模式主要用于给搜索引擎提供尽可能多的关键字，详情请见Token结构体的注释。
func SegmentsToString(segs []Segment, searchMode bool) (output string) {
if searchMode {
for _, seg := range segs {
output += tokenToString(seg.token)
}
} else {
for _, seg := range segs {
output += fmt.Sprintf(
"%s/%s ", textSliceToString(seg.token.text), seg.token.pos)
}
}
return
}
func tokenToString(token *Token) (output string) {
for _, s := range token.segments {
output += tokenToString(s.token)
}
output += fmt.Sprintf("%s/%s ", textSliceToString(token.text), token.pos)
return
}
// 输出分词结果到一个字符串slice
//
// 有两种输出模式,以"中华人民共和国"为例
//
// 普通模式（searchMode=false）输出一个分词"[中华人民共和国]"
// 搜索模式（searchMode=true）输出普通模式的再细致切分：
// "[中华 人民 共和 共和国 人民共和国 中华人民共和国]"
//
// 搜索模式主要用于给搜索引擎提供尽可能多的关键字，详情请见Token结构体的注释。
func SegmentsToSlice(segs []Segment, searchMode bool) (output []string) {
if searchMode {
for _, seg := range segs {
output = append(output, tokenToSlice(seg.token)...)
}
} else {
for _, seg := range segs {
output = append(output, seg.token.Text())
}
}
return
}
func tokenToSlice(token *Token) (output []string) {
for _, s := range token.segments {
output = append(output, tokenToSlice(s.token)...)
}
output = append(output, textSliceToString(token.text))
return output
}
// 将多个字元拼接一个字符串输出
func textSliceToString(text []Text) string {
var output string
for _, word := range text {
output += string(word)
}
return output
}
// 返回多个字元的字节总长度
func textSliceByteLength(text []Text) (length int) {
for _, word := range text {
length += len(word)
}
return
}
func textSliceToBytes(text []Text) []byte {
var buf bytes.Buffer
for _, word := range text {
buf.Write(word)
}
return buf.Bytes()
}

vendor/github.com/huichen/wukong/core/indexer.go generated vendored Normal file

@@ -0,0 +1,574 @@
package core
import (
"log"
"math"
"sort"
"sync"
"github.com/huichen/wukong/types"
"github.com/huichen/wukong/utils"
)
// 索引器
type Indexer struct {
// 从搜索键到文档列表的反向索引
// 加了读写锁以保证读写安全
tableLock struct {
sync.RWMutex
table map[string]*KeywordIndices
docsState map[uint64]int // nil: 表示无状态记录，0: 存在于索引中，1: 等待删除，2: 等待加入
}
addCacheLock struct {
sync.RWMutex
addCachePointer int
addCache types.DocumentsIndex
}
removeCacheLock struct {
sync.RWMutex
removeCachePointer int
removeCache types.DocumentsId
}
initOptions types.IndexerInitOptions
initialized bool
// 这实际上是总文档数的一个近似
numDocuments uint64
// 所有被索引文本的总关键词数
totalTokenLength float32
// 每个文档的关键词长度
docTokenLengths map[uint64]float32
}
// 反向索引表的一行，收集了一个搜索键出现的所有文档，按照DocId从小到大排序。
type KeywordIndices struct {
// 下面的切片是否为空取决于初始化时IndexType的值
docIds []uint64 // 全部类型都有
frequencies []float32 // IndexType == FrequenciesIndex
locations [][]int // IndexType == LocationsIndex
}
// 初始化索引器
func (indexer *Indexer) Init(options types.IndexerInitOptions) {
if indexer.initialized == true {
log.Fatal("索引器不能初始化两次")
}
options.Init()
indexer.initOptions = options
indexer.initialized = true
indexer.tableLock.table = make(map[string]*KeywordIndices)
indexer.tableLock.docsState = make(map[uint64]int)
indexer.addCacheLock.addCache = make([]*types.DocumentIndex, indexer.initOptions.DocCacheSize)
indexer.removeCacheLock.removeCache = make([]uint64, indexer.initOptions.DocCacheSize*2)
indexer.docTokenLengths = make(map[uint64]float32)
}
// 从KeywordIndices中得到第i个文档的DocId
func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
return ti.docIds[i]
}
// 得到KeywordIndices中文档总数
func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {
return len(ti.docIds)
}
// 向 ADDCACHE 中加入一个文档
func (indexer *Indexer) AddDocumentToCache(document *types.DocumentIndex, forceUpdate bool) {
if indexer.initialized == false {
log.Fatal("索引器尚未初始化")
}
indexer.addCacheLock.Lock()
if document != nil {
indexer.addCacheLock.addCache[indexer.addCacheLock.addCachePointer] = document
indexer.addCacheLock.addCachePointer++
}
if indexer.addCacheLock.addCachePointer >= indexer.initOptions.DocCacheSize || forceUpdate {
indexer.tableLock.Lock()
position := 0
for i := 0; i < indexer.addCacheLock.addCachePointer; i++ {
docIndex := indexer.addCacheLock.addCache[i]
if docState, ok := indexer.tableLock.docsState[docIndex.DocId]; ok && docState <= 1 {
// ok && docState == 0 表示存在于索引中,需先删除再添加
// ok && docState == 1 表示不一定存在于索引中,等待删除,需先删除再添加
if position != i {
indexer.addCacheLock.addCache[position], indexer.addCacheLock.addCache[i] =
indexer.addCacheLock.addCache[i], indexer.addCacheLock.addCache[position]
}
if docState == 0 {
indexer.removeCacheLock.Lock()
indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] =
docIndex.DocId
indexer.removeCacheLock.removeCachePointer++
indexer.removeCacheLock.Unlock()
indexer.tableLock.docsState[docIndex.DocId] = 1
indexer.numDocuments--
}
position++
} else if !ok {
indexer.tableLock.docsState[docIndex.DocId] = 2
}
}
indexer.tableLock.Unlock()
if indexer.RemoveDocumentToCache(0, forceUpdate) {
// 只有当存在于索引表中的文档已被删除,其才可以重新加入到索引表中
position = 0
}
addCachedDocuments := indexer.addCacheLock.addCache[position:indexer.addCacheLock.addCachePointer]
indexer.addCacheLock.addCachePointer = position
indexer.addCacheLock.Unlock()
sort.Sort(addCachedDocuments)
indexer.AddDocuments(&addCachedDocuments)
} else {
indexer.addCacheLock.Unlock()
}
}
// 向反向索引表中加入 ADDCACHE 中所有文档
func (indexer *Indexer) AddDocuments(documents *types.DocumentsIndex) {
if indexer.initialized == false {
log.Fatal("索引器尚未初始化")
}
indexer.tableLock.Lock()
defer indexer.tableLock.Unlock()
indexPointers := make(map[string]int, len(indexer.tableLock.table))
// 按 DocId 递增顺序遍历插入文档，保证索引移动次数最少
for i, document := range *documents {
if i < len(*documents)-1 && (*documents)[i].DocId == (*documents)[i+1].DocId {
// 如果有重复文档加入,因为稳定排序,只加入最后一个
continue
}
if docState, ok := indexer.tableLock.docsState[document.DocId]; ok && docState == 1 {
// 如果此时 docState 仍为 1，说明该文档需被删除
// docState 合法状态为 nil 和 2，保证一定不会插入已经在索引表中的文档
continue
}
// 更新文档关键词总长度
if document.TokenLength != 0 {
indexer.docTokenLengths[document.DocId] = float32(document.TokenLength)
indexer.totalTokenLength += document.TokenLength
}
docIdIsNew := true
for _, keyword := range document.Keywords {
indices, foundKeyword := indexer.tableLock.table[keyword.Text]
if !foundKeyword {
// 如果没找到该搜索键则加入
ti := KeywordIndices{}
switch indexer.initOptions.IndexType {
case types.LocationsIndex:
ti.locations = [][]int{keyword.Starts}
case types.FrequenciesIndex:
ti.frequencies = []float32{keyword.Frequency}
}
ti.docIds = []uint64{document.DocId}
indexer.tableLock.table[keyword.Text] = &ti
continue
}
// 查找应该插入的位置,且索引一定不存在
position, _ := indexer.searchIndex(
indices, indexPointers[keyword.Text], indexer.getIndexLength(indices)-1, document.DocId)
indexPointers[keyword.Text] = position
switch indexer.initOptions.IndexType {
case types.LocationsIndex:
indices.locations = append(indices.locations, []int{})
copy(indices.locations[position+1:], indices.locations[position:])
indices.locations[position] = keyword.Starts
case types.FrequenciesIndex:
indices.frequencies = append(indices.frequencies, float32(0))
copy(indices.frequencies[position+1:], indices.frequencies[position:])
indices.frequencies[position] = keyword.Frequency
}
indices.docIds = append(indices.docIds, 0)
copy(indices.docIds[position+1:], indices.docIds[position:])
indices.docIds[position] = document.DocId
}
// 更新文章状态和总数
if docIdIsNew {
indexer.tableLock.docsState[document.DocId] = 0
indexer.numDocuments++
}
}
}
// 向 REMOVECACHE 中加入一个待删除文档
// 返回值表示文档是否在索引表中被删除
func (indexer *Indexer) RemoveDocumentToCache(docId uint64, forceUpdate bool) bool {
if indexer.initialized == false {
log.Fatal("索引器尚未初始化")
}
indexer.removeCacheLock.Lock()
if docId != 0 {
indexer.tableLock.Lock()
if docState, ok := indexer.tableLock.docsState[docId]; ok && docState == 0 {
indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] = docId
indexer.removeCacheLock.removeCachePointer++
indexer.tableLock.docsState[docId] = 1
indexer.numDocuments--
} else if ok && docState == 2 {
// 删除一个等待加入的文档
indexer.tableLock.docsState[docId] = 1
} else if !ok {
// 若文档不存在,则无法判断其是否在 addCache 中,需避免这样的操作
}
indexer.tableLock.Unlock()
}
if indexer.removeCacheLock.removeCachePointer > 0 &&
(indexer.removeCacheLock.removeCachePointer >= indexer.initOptions.DocCacheSize ||
forceUpdate) {
removeCachedDocuments := indexer.removeCacheLock.removeCache[:indexer.removeCacheLock.removeCachePointer]
indexer.removeCacheLock.removeCachePointer = 0
indexer.removeCacheLock.Unlock()
sort.Sort(removeCachedDocuments)
indexer.RemoveDocuments(&removeCachedDocuments)
return true
}
indexer.removeCacheLock.Unlock()
return false
}
// 向反向索引表中删除 REMOVECACHE 中所有文档
func (indexer *Indexer) RemoveDocuments(documents *types.DocumentsId) {
if indexer.initialized == false {
log.Fatal("索引器尚未初始化")
}
indexer.tableLock.Lock()
defer indexer.tableLock.Unlock()
// 更新文档关键词总长度,删除文档状态
for _, docId := range *documents {
indexer.totalTokenLength -= indexer.docTokenLengths[docId]
delete(indexer.docTokenLengths, docId)
delete(indexer.tableLock.docsState, docId)
}
for keyword, indices := range indexer.tableLock.table {
indicesTop, indicesPointer := 0, 0
documentsPointer := sort.Search(
len(*documents), func(i int) bool { return (*documents)[i] >= indices.docIds[0] })
// 双指针扫描,进行批量删除操作
for documentsPointer < len(*documents) && indicesPointer < indexer.getIndexLength(indices) {
if indices.docIds[indicesPointer] < (*documents)[documentsPointer] {
if indicesTop != indicesPointer {
switch indexer.initOptions.IndexType {
case types.LocationsIndex:
indices.locations[indicesTop] = indices.locations[indicesPointer]
case types.FrequenciesIndex:
indices.frequencies[indicesTop] = indices.frequencies[indicesPointer]
}
indices.docIds[indicesTop] = indices.docIds[indicesPointer]
}
indicesTop++
indicesPointer++
} else if indices.docIds[indicesPointer] == (*documents)[documentsPointer] {
indicesPointer++
documentsPointer++
} else {
documentsPointer++
}
}
if indicesTop != indicesPointer {
switch indexer.initOptions.IndexType {
case types.LocationsIndex:
indices.locations = append(
indices.locations[:indicesTop], indices.locations[indicesPointer:]...)
case types.FrequenciesIndex:
indices.frequencies = append(
indices.frequencies[:indicesTop], indices.frequencies[indicesPointer:]...)
}
indices.docIds = append(
indices.docIds[:indicesTop], indices.docIds[indicesPointer:]...)
}
if len(indices.docIds) == 0 {
delete(indexer.tableLock.table, keyword)
}
}
}
// 查找包含全部搜索键(AND操作)的文档
// 当docIds不为nil时，仅从docIds指定的文档中查找
func (indexer *Indexer) Lookup(
tokens []string, labels []string, docIds map[uint64]bool, countDocsOnly bool) (docs []types.IndexedDocument, numDocs int) {
if indexer.initialized == false {
log.Fatal("索引器尚未初始化")
}
if indexer.numDocuments == 0 {
return
}
numDocs = 0
// 合并关键词和标签为搜索键
keywords := make([]string, len(tokens)+len(labels))
copy(keywords, tokens)
copy(keywords[len(tokens):], labels)
indexer.tableLock.RLock()
defer indexer.tableLock.RUnlock()
table := make([]*KeywordIndices, len(keywords))
for i, keyword := range keywords {
indices, found := indexer.tableLock.table[keyword]
if !found {
// 当反向索引表中无此搜索键时直接返回
return
} else {
// 否则加入反向表中
table[i] = indices
}
}
// 当没有找到时直接返回
if len(table) == 0 {
return
}
// 归并查找各个搜索键出现文档的交集
// 从后向前查，保证先输出DocId较大的文档
indexPointers := make([]int, len(table))
for iTable := 0; iTable < len(table); iTable++ {
indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1
}
// 平均文本关键词长度，用于计算BM25
avgDocLength := indexer.totalTokenLength / float32(indexer.numDocuments)
for ; indexPointers[0] >= 0; indexPointers[0]-- {
// 以第一个搜索键出现的文档作为基准,并遍历其他搜索键搜索同一文档
baseDocId := indexer.getDocId(table[0], indexPointers[0])
if docIds != nil {
if _, found := docIds[baseDocId]; !found {
continue
}
}
iTable := 1
found := true
for ; iTable < len(table); iTable++ {
// 二分法比简单的顺序归并效率高,也有更高效率的算法,
// 但顺序归并也许是更好的选择,考虑到将来需要用链表重新实现
// 以避免反向表添加新文档时的写锁。
// TODO: 进一步研究不同求交集算法的速度和可扩展性。
position, foundBaseDocId := indexer.searchIndex(table[iTable],
0, indexPointers[iTable], baseDocId)
if foundBaseDocId {
indexPointers[iTable] = position
} else {
if position == 0 {
// 该搜索键中所有的文档ID都比baseDocId大，因此已经没有
// 继续查找的必要。
return
} else {
// 继续下一indexPointers[0]的查找
indexPointers[iTable] = position - 1
found = false
break
}
}
}
if found {
if docState, ok := indexer.tableLock.docsState[baseDocId]; !ok || docState != 0 {
continue
}
indexedDoc := types.IndexedDocument{}
// 当为LocationsIndex时，计算关键词紧邻距离
if indexer.initOptions.IndexType == types.LocationsIndex {
// 计算有多少关键词是带有距离信息的
numTokensWithLocations := 0
for i, t := range table[:len(tokens)] {
if len(t.locations[indexPointers[i]]) > 0 {
numTokensWithLocations++
}
}
if numTokensWithLocations != len(tokens) {
if !countDocsOnly {
docs = append(docs, types.IndexedDocument{
DocId: baseDocId,
})
}
numDocs++
// 当某个关键字对应多个文档且有label关键字存在时，若直接break，将会丢失相当一部分搜索结果
continue
}
// 计算搜索键在文档中的紧邻距离
tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens)
indexedDoc.TokenProximity = int32(tokenProximity)
indexedDoc.TokenSnippetLocations = tokenLocations
// 添加TokenLocations
indexedDoc.TokenLocations = make([][]int, len(tokens))
for i, t := range table[:len(tokens)] {
indexedDoc.TokenLocations[i] = t.locations[indexPointers[i]]
}
}
// 当为LocationsIndex或者FrequenciesIndex时，计算BM25
if indexer.initOptions.IndexType == types.LocationsIndex ||
indexer.initOptions.IndexType == types.FrequenciesIndex {
bm25 := float32(0)
d := indexer.docTokenLengths[baseDocId]
for i, t := range table[:len(tokens)] {
var frequency float32
if indexer.initOptions.IndexType == types.LocationsIndex {
frequency = float32(len(t.locations[indexPointers[i]]))
} else {
frequency = t.frequencies[indexPointers[i]]
}
// 计算BM25
if len(t.docIds) > 0 && frequency > 0 && indexer.initOptions.BM25Parameters != nil && avgDocLength != 0 {
// 带平滑的idf
idf := float32(math.Log2(float64(indexer.numDocuments)/float64(len(t.docIds)) + 1))
k1 := indexer.initOptions.BM25Parameters.K1
b := indexer.initOptions.BM25Parameters.B
bm25 += idf * frequency * (k1 + 1) / (frequency + k1*(1-b+b*d/avgDocLength))
}
}
indexedDoc.BM25 = float32(bm25)
}
indexedDoc.DocId = baseDocId
if !countDocsOnly {
docs = append(docs, indexedDoc)
}
numDocs++
}
}
return
}
// 二分法查找indices中某文档的索引项
// 第一个返回参数为找到的位置或需要插入的位置
// 第二个返回参数标明是否找到
func (indexer *Indexer) searchIndex(
indices *KeywordIndices, start int, end int, docId uint64) (int, bool) {
// 特殊情况
if indexer.getIndexLength(indices) == start {
return start, false
}
if docId < indexer.getDocId(indices, start) {
return start, false
} else if docId == indexer.getDocId(indices, start) {
return start, true
}
if docId > indexer.getDocId(indices, end) {
return end + 1, false
} else if docId == indexer.getDocId(indices, end) {
return end, true
}
// 二分
var middle int
for end-start > 1 {
middle = (start + end) / 2
if docId == indexer.getDocId(indices, middle) {
return middle, true
} else if docId > indexer.getDocId(indices, middle) {
start = middle
} else {
end = middle
}
}
return end, false
}
// 计算搜索键在文本中的紧邻距离
//
// 假定第 i 个搜索键首字节出现在文本中的位置为 P_i，长度为 L_i，
// 紧邻距离计算公式为
//
// ArgMin(Sum(Abs(P_(i+1) - P_i - L_i)))
//
// 具体由动态规划实现,依次计算前 i 个 token 在每个出现位置的最优值。
// 选定的 P_i 通过 tokenLocations 参数传回。
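//
// Illustrative example (made-up positions): with two tokens, where token 0 has
// length 2 and occurs at positions {3, 10} and token 1 occurs at {5, 40}, the
// pair P_0=3, P_1=5 gives Abs(5 - 3 - 2) = 0, which is the minimum proximity.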
func computeTokenProximity(table []*KeywordIndices, indexPointers []int, tokens []string) (
minTokenProximity int, tokenLocations []int) {
minTokenProximity = -1
tokenLocations = make([]int, len(tokens))
var (
currentLocations, nextLocations []int
currentMinValues, nextMinValues []int
path [][]int
)
// 初始化路径数组
path = make([][]int, len(tokens))
for i := 1; i < len(path); i++ {
path[i] = make([]int, len(table[i].locations[indexPointers[i]]))
}
// 动态规划
currentLocations = table[0].locations[indexPointers[0]]
currentMinValues = make([]int, len(currentLocations))
for i := 1; i < len(tokens); i++ {
nextLocations = table[i].locations[indexPointers[i]]
nextMinValues = make([]int, len(nextLocations))
for j, _ := range nextMinValues {
nextMinValues[j] = -1
}
var iNext int
for iCurrent, currentLocation := range currentLocations {
if currentMinValues[iCurrent] == -1 {
continue
}
for iNext+1 < len(nextLocations) && nextLocations[iNext+1] < currentLocation {
iNext++
}
update := func(from int, to int) {
if to >= len(nextLocations) {
return
}
value := currentMinValues[from] + utils.AbsInt(nextLocations[to]-currentLocations[from]-len(tokens[i-1]))
if nextMinValues[to] == -1 || value < nextMinValues[to] {
nextMinValues[to] = value
path[i][to] = from
}
}
// 最优解的状态转移只发生在左右最接近的位置
update(iCurrent, iNext)
update(iCurrent, iNext+1)
}
currentLocations = nextLocations
currentMinValues = nextMinValues
}
// 找出最优解
var cursor int
for i, value := range currentMinValues {
if value == -1 {
continue
}
if minTokenProximity == -1 || value < minTokenProximity {
minTokenProximity = value
cursor = i
}
}
// 从路径倒推出最优解的位置
for i := len(tokens) - 1; i >= 0; i-- {
if i != len(tokens)-1 {
cursor = path[i+1][cursor]
}
tokenLocations[i] = table[i].locations[indexPointers[i]][cursor]
}
return
}

vendor/github.com/huichen/wukong/core/ranker.go generated vendored Normal file

@@ -0,0 +1,106 @@
package core
import (
"github.com/huichen/wukong/types"
"github.com/huichen/wukong/utils"
"log"
"sort"
"sync"
)
type Ranker struct {
lock struct {
sync.RWMutex
fields map[uint64]interface{}
docs map[uint64]bool
}
initialized bool
}
func (ranker *Ranker) Init() {
if ranker.initialized == true {
log.Fatal("排序器不能初始化两次")
}
ranker.initialized = true
ranker.lock.fields = make(map[uint64]interface{})
ranker.lock.docs = make(map[uint64]bool)
}
// 给某个文档添加评分字段
func (ranker *Ranker) AddDoc(docId uint64, fields interface{}) {
if ranker.initialized == false {
log.Fatal("排序器尚未初始化")
}
ranker.lock.Lock()
ranker.lock.fields[docId] = fields
ranker.lock.docs[docId] = true
ranker.lock.Unlock()
}
// 删除某个文档的评分字段
func (ranker *Ranker) RemoveDoc(docId uint64) {
if ranker.initialized == false {
log.Fatal("排序器尚未初始化")
}
ranker.lock.Lock()
delete(ranker.lock.fields, docId)
delete(ranker.lock.docs, docId)
ranker.lock.Unlock()
}
// 给文档评分并排序
func (ranker *Ranker) Rank(
docs []types.IndexedDocument, options types.RankOptions, countDocsOnly bool) (types.ScoredDocuments, int) {
if ranker.initialized == false {
log.Fatal("排序器尚未初始化")
}
// 对每个文档评分
var outputDocs types.ScoredDocuments
numDocs := 0
for _, d := range docs {
ranker.lock.RLock()
// 判断doc是否存在
if _, ok := ranker.lock.docs[d.DocId]; ok {
fs := ranker.lock.fields[d.DocId]
ranker.lock.RUnlock()
// 计算评分并剔除没有分值的文档
scores := options.ScoringCriteria.Score(d, fs)
if len(scores) > 0 {
if !countDocsOnly {
outputDocs = append(outputDocs, types.ScoredDocument{
DocId: d.DocId,
Scores: scores,
TokenSnippetLocations: d.TokenSnippetLocations,
TokenLocations: d.TokenLocations})
}
numDocs++
}
} else {
ranker.lock.RUnlock()
}
}
// 排序
if !countDocsOnly {
if options.ReverseOrder {
sort.Sort(sort.Reverse(outputDocs))
} else {
sort.Sort(outputDocs)
}
// 当用户要求只返回部分结果时返回部分结果
var start, end int
if options.MaxOutputs != 0 {
start = utils.MinInt(options.OutputOffset, len(outputDocs))
end = utils.MinInt(options.OutputOffset+options.MaxOutputs, len(outputDocs))
} else {
start = utils.MinInt(options.OutputOffset, len(outputDocs))
end = len(outputDocs)
}
return outputDocs[start:end], numDocs
}
return outputDocs, numDocs
}

vendor/github.com/huichen/wukong/core/test_utils.go generated vendored Normal file

@@ -0,0 +1,35 @@
package core
import (
"fmt"
"github.com/huichen/wukong/types"
)
func indicesToString(indexer *Indexer, token string) (output string) {
if indices, ok := indexer.tableLock.table[token]; ok {
for i := 0; i < indexer.getIndexLength(indices); i++ {
output += fmt.Sprintf("%d ",
indexer.getDocId(indices, i))
}
}
return
}
func indexedDocsToString(docs []types.IndexedDocument, numDocs int) (output string) {
for _, doc := range docs {
output += fmt.Sprintf("[%d %d %v] ",
doc.DocId, doc.TokenProximity, doc.TokenSnippetLocations)
}
return
}
func scoredDocsToString(docs []types.ScoredDocument) (output string) {
for _, doc := range docs {
output += fmt.Sprintf("[%d [", doc.DocId)
for _, score := range doc.Scores {
output += fmt.Sprintf("%d ", int(score*1000))
}
output += "]] "
}
return
}

vendor/github.com/huichen/wukong/engine/counters.go generated vendored Normal file

@@ -0,0 +1,13 @@
package engine
func (engine *Engine) NumTokenIndexAdded() uint64 {
return engine.numTokenIndexAdded
}
func (engine *Engine) NumDocumentsIndexed() uint64 {
return engine.numDocumentsIndexed
}
func (engine *Engine) NumDocumentsRemoved() uint64 {
return engine.numDocumentsRemoved
}

vendor/github.com/huichen/wukong/engine/engine.go generated vendored Normal file

@@ -0,0 +1,446 @@
package engine
import (
"fmt"
"github.com/huichen/murmur"
"github.com/huichen/sego"
"github.com/huichen/wukong/core"
"github.com/huichen/wukong/storage"
"github.com/huichen/wukong/types"
"github.com/huichen/wukong/utils"
"log"
"os"
"runtime"
"sort"
"strconv"
"sync/atomic"
"time"
)
const (
NumNanosecondsInAMillisecond = 1000000
PersistentStorageFilePrefix = "wukong"
)
type Engine struct {
// 计数器,用来统计有多少文档被索引等信息
numDocumentsIndexed uint64
numDocumentsRemoved uint64
numDocumentsForceUpdated uint64
numIndexingRequests uint64
numRemovingRequests uint64
numForceUpdatingRequests uint64
numTokenIndexAdded uint64
numDocumentsStored uint64
// 记录初始化参数
initOptions types.EngineInitOptions
initialized bool
indexers []core.Indexer
rankers []core.Ranker
segmenter sego.Segmenter
stopTokens StopTokens
dbs []storage.Storage
// 建立索引器使用的通信通道
segmenterChannel chan segmenterRequest
indexerAddDocChannels []chan indexerAddDocumentRequest
indexerRemoveDocChannels []chan indexerRemoveDocRequest
rankerAddDocChannels []chan rankerAddDocRequest
// 建立排序器使用的通信通道
indexerLookupChannels []chan indexerLookupRequest
rankerRankChannels []chan rankerRankRequest
rankerRemoveDocChannels []chan rankerRemoveDocRequest
// 建立持久存储使用的通信通道
persistentStorageIndexDocumentChannels []chan persistentStorageIndexDocumentRequest
persistentStorageInitChannel chan bool
}
func (engine *Engine) Init(options types.EngineInitOptions) {
// 将线程数设置为CPU数
runtime.GOMAXPROCS(runtime.NumCPU())
// 初始化初始参数
if engine.initialized {
log.Fatal("请勿重复初始化引擎")
}
options.Init()
engine.initOptions = options
engine.initialized = true
if !options.NotUsingSegmenter {
// 载入分词器词典
engine.segmenter.LoadDictionary(options.SegmenterDictionaries)
// 初始化停用词
engine.stopTokens.Init(options.StopTokenFile)
}
// 初始化索引器和排序器
for shard := 0; shard < options.NumShards; shard++ {
engine.indexers = append(engine.indexers, core.Indexer{})
engine.indexers[shard].Init(*options.IndexerInitOptions)
engine.rankers = append(engine.rankers, core.Ranker{})
engine.rankers[shard].Init()
}
// 初始化分词器通道
engine.segmenterChannel = make(
chan segmenterRequest, options.NumSegmenterThreads)
// 初始化索引器通道
engine.indexerAddDocChannels = make(
[]chan indexerAddDocumentRequest, options.NumShards)
engine.indexerRemoveDocChannels = make(
[]chan indexerRemoveDocRequest, options.NumShards)
engine.indexerLookupChannels = make(
[]chan indexerLookupRequest, options.NumShards)
for shard := 0; shard < options.NumShards; shard++ {
engine.indexerAddDocChannels[shard] = make(
chan indexerAddDocumentRequest,
options.IndexerBufferLength)
engine.indexerRemoveDocChannels[shard] = make(
chan indexerRemoveDocRequest,
options.IndexerBufferLength)
engine.indexerLookupChannels[shard] = make(
chan indexerLookupRequest,
options.IndexerBufferLength)
}
// 初始化排序器通道
engine.rankerAddDocChannels = make(
[]chan rankerAddDocRequest, options.NumShards)
engine.rankerRankChannels = make(
[]chan rankerRankRequest, options.NumShards)
engine.rankerRemoveDocChannels = make(
[]chan rankerRemoveDocRequest, options.NumShards)
for shard := 0; shard < options.NumShards; shard++ {
engine.rankerAddDocChannels[shard] = make(
chan rankerAddDocRequest,
options.RankerBufferLength)
engine.rankerRankChannels[shard] = make(
chan rankerRankRequest,
options.RankerBufferLength)
engine.rankerRemoveDocChannels[shard] = make(
chan rankerRemoveDocRequest,
options.RankerBufferLength)
}
// 初始化持久化存储通道
if engine.initOptions.UsePersistentStorage {
engine.persistentStorageIndexDocumentChannels =
make([]chan persistentStorageIndexDocumentRequest,
engine.initOptions.PersistentStorageShards)
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
engine.persistentStorageIndexDocumentChannels[shard] = make(
chan persistentStorageIndexDocumentRequest)
}
engine.persistentStorageInitChannel = make(
chan bool, engine.initOptions.PersistentStorageShards)
}
// 启动分词器
for iThread := 0; iThread < options.NumSegmenterThreads; iThread++ {
go engine.segmenterWorker()
}
// 启动索引器和排序器
for shard := 0; shard < options.NumShards; shard++ {
go engine.indexerAddDocumentWorker(shard)
go engine.indexerRemoveDocWorker(shard)
go engine.rankerAddDocWorker(shard)
go engine.rankerRemoveDocWorker(shard)
for i := 0; i < options.NumIndexerThreadsPerShard; i++ {
go engine.indexerLookupWorker(shard)
}
for i := 0; i < options.NumRankerThreadsPerShard; i++ {
go engine.rankerRankWorker(shard)
}
}
// 启动持久化存储工作协程
if engine.initOptions.UsePersistentStorage {
err := os.MkdirAll(engine.initOptions.PersistentStorageFolder, 0700)
if err != nil {
log.Fatal("无法创建目录", engine.initOptions.PersistentStorageFolder)
}
// 打开或者创建数据库
engine.dbs = make([]storage.Storage, engine.initOptions.PersistentStorageShards)
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
dbPath := engine.initOptions.PersistentStorageFolder + "/" + PersistentStorageFilePrefix + "." + strconv.Itoa(shard)
db, err := storage.OpenStorage(dbPath)
if db == nil || err != nil {
log.Fatal("无法打开数据库", dbPath, ": ", err)
}
engine.dbs[shard] = db
}
// 从数据库中恢复
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
go engine.persistentStorageInitWorker(shard)
}
// 等待恢复完成
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
<-engine.persistentStorageInitChannel
}
for {
runtime.Gosched()
if engine.numIndexingRequests == engine.numDocumentsIndexed {
break
}
}
// 关闭并重新打开数据库
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
engine.dbs[shard].Close()
dbPath := engine.initOptions.PersistentStorageFolder + "/" + PersistentStorageFilePrefix + "." + strconv.Itoa(shard)
db, err := storage.OpenStorage(dbPath)
if db == nil || err != nil {
log.Fatal("无法打开数据库", dbPath, ": ", err)
}
engine.dbs[shard] = db
}
for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
go engine.persistentStorageIndexDocumentWorker(shard)
}
}
atomic.AddUint64(&engine.numDocumentsStored, engine.numIndexingRequests)
}
// 将文档加入索引
//
// 输入参数:
// docId 标识文档编号，必须唯一。docId == 0 表示非法文档（用于强制刷新索引），[1, +oo) 表示合法文档
// data 见DocumentIndexData注释
// forceUpdate 是否强制刷新 cache，如果设为 true，则尽快添加到索引，否则等待 cache 满之后一次全量添加
//
// 注意:
// 1. 这个函数是线程安全的,请尽可能并发调用以提高索引速度
// 2. 这个函数调用是非同步的,也就是说在函数返回时有可能文档还没有加入索引中,因此
// 如果立刻调用Search可能无法查询到这个文档。强制刷新索引请调用FlushIndex函数。
func (engine *Engine) IndexDocument(docId uint64, data types.DocumentIndexData, forceUpdate bool) {
engine.internalIndexDocument(docId, data, forceUpdate)
hash := murmur.Murmur3([]byte(fmt.Sprintf("%d", docId))) % uint32(engine.initOptions.PersistentStorageShards)
if engine.initOptions.UsePersistentStorage && docId != 0 {
engine.persistentStorageIndexDocumentChannels[hash] <- persistentStorageIndexDocumentRequest{docId: docId, data: data}
}
}
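// A minimal indexing sketch (dictionary path and document content below are illustrative only):
//
//	var searcher Engine
//	searcher.Init(types.EngineInitOptions{SegmenterDictionaries: "dictionary.txt"})
//	searcher.IndexDocument(1, types.DocumentIndexData{Content: "中华人民共和国中央人民政府"}, false)
//	searcher.FlushIndex() // block until the document is searchable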
func (engine *Engine) internalIndexDocument(
docId uint64, data types.DocumentIndexData, forceUpdate bool) {
if !engine.initialized {
log.Fatal("必须先初始化引擎")
}
if docId != 0 {
atomic.AddUint64(&engine.numIndexingRequests, 1)
}
if forceUpdate {
atomic.AddUint64(&engine.numForceUpdatingRequests, 1)
}
hash := murmur.Murmur3([]byte(fmt.Sprintf("%d%s", docId, data.Content)))
engine.segmenterChannel <- segmenterRequest{
docId: docId, hash: hash, data: data, forceUpdate: forceUpdate}
}
// 将文档从索引中删除
//
// 输入参数:
// docId 标识文档编号，必须唯一。docId == 0 表示非法文档（用于强制刷新索引），[1, +oo) 表示合法文档
// forceUpdate 是否强制刷新 cache，如果设为 true，则尽快删除索引，否则等待 cache 满之后一次全量删除
//
// 注意:
// 1. 这个函数是线程安全的,请尽可能并发调用以提高索引速度
// 2. 这个函数调用是非同步的，也就是说在函数返回时有可能文档还没有从索引中删除，因此
// 如果立刻调用Search可能仍会查询到这个文档。强制刷新索引请调用FlushIndex函数。
func (engine *Engine) RemoveDocument(docId uint64, forceUpdate bool) {
if !engine.initialized {
log.Fatal("必须先初始化引擎")
}
if docId != 0 {
atomic.AddUint64(&engine.numRemovingRequests, 1)
}
if forceUpdate {
atomic.AddUint64(&engine.numForceUpdatingRequests, 1)
}
for shard := 0; shard < engine.initOptions.NumShards; shard++ {
engine.indexerRemoveDocChannels[shard] <- indexerRemoveDocRequest{docId: docId, forceUpdate: forceUpdate}
if docId == 0 {
continue
}
engine.rankerRemoveDocChannels[shard] <- rankerRemoveDocRequest{docId: docId}
}
if engine.initOptions.UsePersistentStorage && docId != 0 {
// 从数据库中删除
hash := murmur.Murmur3([]byte(fmt.Sprintf("%d", docId))) % uint32(engine.initOptions.PersistentStorageShards)
go engine.persistentStorageRemoveDocumentWorker(docId, hash)
}
}
// 查找满足搜索条件的文档,此函数线程安全
func (engine *Engine) Search(request types.SearchRequest) (output types.SearchResponse) {
if !engine.initialized {
log.Fatal("必须先初始化引擎")
}
var rankOptions types.RankOptions
if request.RankOptions == nil {
rankOptions = *engine.initOptions.DefaultRankOptions
} else {
rankOptions = *request.RankOptions
}
if rankOptions.ScoringCriteria == nil {
rankOptions.ScoringCriteria = engine.initOptions.DefaultRankOptions.ScoringCriteria
}
// 收集关键词
tokens := []string{}
if request.Text != "" {
querySegments := engine.segmenter.Segment([]byte(request.Text))
for _, s := range querySegments {
token := s.Token().Text()
if !engine.stopTokens.IsStopToken(token) {
tokens = append(tokens, s.Token().Text())
}
}
} else {
for _, t := range request.Tokens {
tokens = append(tokens, t)
}
}
// 建立排序器返回的通信通道
rankerReturnChannel := make(
chan rankerReturnRequest, engine.initOptions.NumShards)
// 生成查找请求
lookupRequest := indexerLookupRequest{
countDocsOnly: request.CountDocsOnly,
tokens: tokens,
labels: request.Labels,
docIds: request.DocIds,
options: rankOptions,
rankerReturnChannel: rankerReturnChannel,
orderless: request.Orderless,
}
// 向索引器发送查找请求
for shard := 0; shard < engine.initOptions.NumShards; shard++ {
engine.indexerLookupChannels[shard] <- lookupRequest
}
// 从通信通道读取排序器的输出
numDocs := 0
rankOutput := types.ScoredDocuments{}
timeout := request.Timeout
isTimeout := false
if timeout <= 0 {
// 不设置超时
for shard := 0; shard < engine.initOptions.NumShards; shard++ {
rankerOutput := <-rankerReturnChannel
if !request.CountDocsOnly {
for _, doc := range rankerOutput.docs {
rankOutput = append(rankOutput, doc)
}
}
numDocs += rankerOutput.numDocs
}
} else {
// 设置超时
deadline := time.Now().Add(time.Nanosecond * time.Duration(NumNanosecondsInAMillisecond*request.Timeout))
for shard := 0; shard < engine.initOptions.NumShards; shard++ {
select {
case rankerOutput := <-rankerReturnChannel:
if !request.CountDocsOnly {
for _, doc := range rankerOutput.docs {
rankOutput = append(rankOutput, doc)
}
}
numDocs += rankerOutput.numDocs
case <-time.After(deadline.Sub(time.Now())):
isTimeout = true
break
}
}
}
// 再排序
if !request.CountDocsOnly && !request.Orderless {
if rankOptions.ReverseOrder {
sort.Sort(sort.Reverse(rankOutput))
} else {
sort.Sort(rankOutput)
}
}
// 准备输出
output.Tokens = tokens
// 仅当CountDocsOnly为false时才充填output.Docs
if !request.CountDocsOnly {
if request.Orderless {
// 无序状态无需对Offset截断
output.Docs = rankOutput
} else {
var start, end int
if rankOptions.MaxOutputs == 0 {
start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))
end = len(rankOutput)
} else {
start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))
end = utils.MinInt(start+rankOptions.MaxOutputs, len(rankOutput))
}
output.Docs = rankOutput[start:end]
}
}
output.NumDocs = numDocs
output.Timeout = isTimeout
return
}
// 阻塞等待直到所有索引添加完毕
func (engine *Engine) FlushIndex() {
for {
runtime.Gosched()
if engine.numIndexingRequests == engine.numDocumentsIndexed &&
engine.numRemovingRequests*uint64(engine.initOptions.NumShards) == engine.numDocumentsRemoved &&
(!engine.initOptions.UsePersistentStorage || engine.numIndexingRequests == engine.numDocumentsStored) {
// 保证 CHANNEL 中 REQUESTS 全部被执行完
break
}
}
// 强制更新,保证其为最后的请求
engine.IndexDocument(0, types.DocumentIndexData{}, true)
for {
runtime.Gosched()
if engine.numForceUpdatingRequests*uint64(engine.initOptions.NumShards) == engine.numDocumentsForceUpdated {
return
}
}
}
// 关闭引擎
func (engine *Engine) Close() {
engine.FlushIndex()
if engine.initOptions.UsePersistentStorage {
for _, db := range engine.dbs {
db.Close()
}
}
}
// 从文本hash得到要分配到的shard
func (engine *Engine) getShard(hash uint32) int {
return int(hash - hash/uint32(engine.initOptions.NumShards)*uint32(engine.initOptions.NumShards))
}


@@ -0,0 +1,101 @@
package engine
import (
"github.com/huichen/wukong/types"
"sync/atomic"
)
type indexerAddDocumentRequest struct {
document *types.DocumentIndex
forceUpdate bool
}
type indexerLookupRequest struct {
countDocsOnly bool
tokens []string
labels []string
docIds map[uint64]bool
options types.RankOptions
rankerReturnChannel chan rankerReturnRequest
orderless bool
}
type indexerRemoveDocRequest struct {
docId uint64
forceUpdate bool
}
func (engine *Engine) indexerAddDocumentWorker(shard int) {
for {
request := <-engine.indexerAddDocChannels[shard]
engine.indexers[shard].AddDocumentToCache(request.document, request.forceUpdate)
if request.document != nil {
atomic.AddUint64(&engine.numTokenIndexAdded,
uint64(len(request.document.Keywords)))
atomic.AddUint64(&engine.numDocumentsIndexed, 1)
}
if request.forceUpdate {
atomic.AddUint64(&engine.numDocumentsForceUpdated, 1)
}
}
}
func (engine *Engine) indexerRemoveDocWorker(shard int) {
for {
request := <-engine.indexerRemoveDocChannels[shard]
engine.indexers[shard].RemoveDocumentToCache(request.docId, request.forceUpdate)
if request.docId != 0 {
atomic.AddUint64(&engine.numDocumentsRemoved, 1)
}
if request.forceUpdate {
atomic.AddUint64(&engine.numDocumentsForceUpdated, 1)
}
}
}
func (engine *Engine) indexerLookupWorker(shard int) {
for {
request := <-engine.indexerLookupChannels[shard]
var docs []types.IndexedDocument
var numDocs int
if request.docIds == nil {
docs, numDocs = engine.indexers[shard].Lookup(request.tokens, request.labels, nil, request.countDocsOnly)
} else {
docs, numDocs = engine.indexers[shard].Lookup(request.tokens, request.labels, request.docIds, request.countDocsOnly)
}
if request.countDocsOnly {
request.rankerReturnChannel <- rankerReturnRequest{numDocs: numDocs}
continue
}
if len(docs) == 0 {
request.rankerReturnChannel <- rankerReturnRequest{}
continue
}
if request.orderless {
var outputDocs []types.ScoredDocument
for _, d := range docs {
outputDocs = append(outputDocs, types.ScoredDocument{
DocId: d.DocId,
TokenSnippetLocations: d.TokenSnippetLocations,
TokenLocations: d.TokenLocations})
}
request.rankerReturnChannel <- rankerReturnRequest{
docs: outputDocs,
numDocs: len(outputDocs),
}
continue
}
rankerRequest := rankerRankRequest{
countDocsOnly: request.countDocsOnly,
docs: docs,
options: request.options,
rankerReturnChannel: request.rankerReturnChannel,
}
engine.rankerRankChannels[shard] <- rankerRequest
}
}


@@ -0,0 +1,66 @@
package engine
import (
"bytes"
"encoding/binary"
"encoding/gob"
"github.com/huichen/wukong/types"
"sync/atomic"
)
type persistentStorageIndexDocumentRequest struct {
docId uint64
data types.DocumentIndexData
}
func (engine *Engine) persistentStorageIndexDocumentWorker(shard int) {
for {
request := <-engine.persistentStorageIndexDocumentChannels[shard]
// 得到key
b := make([]byte, 10)
length := binary.PutUvarint(b, request.docId)
// 得到value
var buf bytes.Buffer
enc := gob.NewEncoder(&buf)
err := enc.Encode(request.data)
if err != nil {
atomic.AddUint64(&engine.numDocumentsStored, 1)
continue
}
// 将key-value写入数据库
engine.dbs[shard].Set(b[0:length], buf.Bytes())
atomic.AddUint64(&engine.numDocumentsStored, 1)
}
}
func (engine *Engine) persistentStorageRemoveDocumentWorker(docId uint64, shard uint32) {
// 得到key
b := make([]byte, 10)
length := binary.PutUvarint(b, docId)
// 从数据库删除该key
engine.dbs[shard].Delete(b[0:length])
}
func (engine *Engine) persistentStorageInitWorker(shard int) {
engine.dbs[shard].ForEach(func(k, v []byte) error {
key, value := k, v
// 得到docID
docId, _ := binary.Uvarint(key)
// 得到data
buf := bytes.NewReader(value)
dec := gob.NewDecoder(buf)
var data types.DocumentIndexData
err := dec.Decode(&data)
if err == nil {
// 添加索引
engine.internalIndexDocument(docId, data, false)
}
return nil
})
engine.persistentStorageInitChannel <- true
}


@@ -0,0 +1,52 @@
package engine
import (
"github.com/huichen/wukong/types"
)
type rankerAddDocRequest struct {
docId uint64
fields interface{}
}
type rankerRankRequest struct {
docs []types.IndexedDocument
options types.RankOptions
rankerReturnChannel chan rankerReturnRequest
countDocsOnly bool
}
type rankerReturnRequest struct {
docs types.ScoredDocuments
numDocs int
}
type rankerRemoveDocRequest struct {
docId uint64
}
func (engine *Engine) rankerAddDocWorker(shard int) {
for {
request := <-engine.rankerAddDocChannels[shard]
engine.rankers[shard].AddDoc(request.docId, request.fields)
}
}
func (engine *Engine) rankerRankWorker(shard int) {
for {
request := <-engine.rankerRankChannels[shard]
if request.options.MaxOutputs != 0 {
request.options.MaxOutputs += request.options.OutputOffset
}
request.options.OutputOffset = 0
outputDocs, numDocs := engine.rankers[shard].Rank(request.docs, request.options, request.countDocsOnly)
request.rankerReturnChannel <- rankerReturnRequest{docs: outputDocs, numDocs: numDocs}
}
}
func (engine *Engine) rankerRemoveDocWorker(shard int) {
for {
request := <-engine.rankerRemoveDocChannels[shard]
engine.rankers[shard].RemoveDoc(request.docId)
}
}


@@ -0,0 +1,97 @@
package engine
import (
"github.com/huichen/wukong/types"
)
type segmenterRequest struct {
docId uint64
hash uint32
data types.DocumentIndexData
forceUpdate bool
}
func (engine *Engine) segmenterWorker() {
for {
request := <-engine.segmenterChannel
if request.docId == 0 {
if request.forceUpdate {
for i := 0; i < engine.initOptions.NumShards; i++ {
engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
}
}
continue
}
shard := engine.getShard(request.hash)
tokensMap := make(map[string][]int)
numTokens := 0
if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" {
// 当文档正文不为空时,优先从内容分词中得到关键词
segments := engine.segmenter.Segment([]byte(request.data.Content))
for _, segment := range segments {
token := segment.Token().Text()
if !engine.stopTokens.IsStopToken(token) {
tokensMap[token] = append(tokensMap[token], segment.Start())
}
}
numTokens = len(segments)
} else {
// 否则载入用户输入的关键词
for _, t := range request.data.Tokens {
if !engine.stopTokens.IsStopToken(t.Text) {
tokensMap[t.Text] = t.Locations
}
}
numTokens = len(request.data.Tokens)
}
// 加入非分词的文档标签
for _, label := range request.data.Labels {
if !engine.initOptions.NotUsingSegmenter {
if !engine.stopTokens.IsStopToken(label) {
// 当正文中已存在关键字时，若不判断，位置信息将会丢失
if _, ok := tokensMap[label]; !ok {
tokensMap[label] = []int{}
}
}
} else {
// 当正文中已存在关键字时，若不判断，位置信息将会丢失
if _, ok := tokensMap[label]; !ok {
tokensMap[label] = []int{}
}
}
}
indexerRequest := indexerAddDocumentRequest{
document: &types.DocumentIndex{
DocId: request.docId,
TokenLength: float32(numTokens),
Keywords: make([]types.KeywordIndex, len(tokensMap)),
},
forceUpdate: request.forceUpdate,
}
iTokens := 0
for k, v := range tokensMap {
indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{
Text: k,
// 非分词标注的词频设置为0不参与tf-idf计算
Frequency: float32(len(v)),
Starts: v}
iTokens++
}
engine.indexerAddDocChannels[shard] <- indexerRequest
if request.forceUpdate {
for i := 0; i < engine.initOptions.NumShards; i++ {
if i == shard {
continue
}
engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
}
}
rankerRequest := rankerAddDocRequest{
docId: request.docId, fields: request.data.Fields}
engine.rankerAddDocChannels[shard] <- rankerRequest
}
}

vendor/github.com/huichen/wukong/engine/stop_tokens.go generated vendored Normal file

@@ -0,0 +1,40 @@
package engine
import (
"bufio"
"log"
"os"
)
type StopTokens struct {
stopTokens map[string]bool
}
// 从stopTokenFile中读入停用词，一个词一行
// 文档索引建立时会跳过这些停用词
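//
// A hypothetical stop token file might contain, one word per line:
// 的
// 了
// 和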
func (st *StopTokens) Init(stopTokenFile string) {
st.stopTokens = make(map[string]bool)
if stopTokenFile == "" {
return
}
file, err := os.Open(stopTokenFile)
if err != nil {
log.Fatal(err)
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
text := scanner.Text()
if text != "" {
st.stopTokens[text] = true
}
}
}
func (st *StopTokens) IsStopToken(token string) bool {
_, found := st.stopTokens[token]
return found
}

vendor/github.com/huichen/wukong/license.txt generated vendored Normal file

@@ -0,0 +1,13 @@
Copyright 2013 Hui Chen
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@@ -0,0 +1,69 @@
package storage
import (
"github.com/boltdb/bolt"
"time"
)
var wukong_documents = []byte("wukong_documents")
type boltStorage struct {
db *bolt.DB
}
func openBoltStorage(path string) (Storage, error) {
db, err := bolt.Open(path, 0600, &bolt.Options{Timeout: 3600 * time.Second})
if err != nil {
return nil, err
}
err = db.Update(func(tx *bolt.Tx) error {
_, err := tx.CreateBucketIfNotExists(wukong_documents)
return err
})
if err != nil {
db.Close()
return nil, err
}
return &boltStorage{db}, nil
}
func (s *boltStorage) WALName() string {
return s.db.Path()
}
func (s *boltStorage) Set(k []byte, v []byte) error {
return s.db.Update(func(tx *bolt.Tx) error {
return tx.Bucket(wukong_documents).Put(k, v)
})
}
func (s *boltStorage) Get(k []byte) (b []byte, err error) {
err = s.db.View(func(tx *bolt.Tx) error {
b = tx.Bucket(wukong_documents).Get(k)
return nil
})
return
}
func (s *boltStorage) Delete(k []byte) error {
return s.db.Update(func(tx *bolt.Tx) error {
return tx.Bucket(wukong_documents).Delete(k)
})
}
func (s *boltStorage) ForEach(fn func(k, v []byte) error) error {
return s.db.View(func(tx *bolt.Tx) error {
b := tx.Bucket(wukong_documents)
c := b.Cursor()
for k, v := c.First(); k != nil; k, v = c.Next() {
if err := fn(k, v); err != nil {
return err
}
}
return nil
})
}
func (s *boltStorage) Close() error {
return s.db.Close()
}

vendor/github.com/huichen/wukong/storage/kv_storage.go generated vendored Normal file

@@ -0,0 +1,64 @@
package storage
import (
"github.com/cznic/kv"
"io"
)
type kvStorage struct {
db *kv.DB
}
func openKVStorage(path string) (Storage, error) {
options := &kv.Options{}
db, errOpen := kv.Open(path, options)
if errOpen != nil {
var errCreate error
db, errCreate = kv.Create(path, options)
if errCreate != nil {
return &kvStorage{db}, errCreate
}
}
return &kvStorage{db}, nil
}
func (s *kvStorage) WALName() string {
return s.db.WALName()
}
func (s *kvStorage) Set(k []byte, v []byte) error {
return s.db.Set(k, v)
}
func (s *kvStorage) Get(k []byte) ([]byte, error) {
return s.db.Get(nil, k)
}
func (s *kvStorage) Delete(k []byte) error {
return s.db.Delete(k)
}
func (s *kvStorage) ForEach(fn func(k, v []byte) error) error {
iter, err := s.db.SeekFirst()
if err == io.EOF {
return nil
} else if err != nil {
return err
}
for {
key, value, err := iter.Next()
if err == io.EOF {
break
} else if err != nil {
return err
}
if err := fn(key, value); err != nil {
return err
}
}
return nil
}
func (s *kvStorage) Close() error {
return s.db.Close()
}

37
vendor/github.com/huichen/wukong/storage/storage.go generated vendored Normal file
View File

@@ -0,0 +1,37 @@
package storage
import (
"fmt"
"os"
)
const DEFAULT_STORAGE_ENGINE = "bolt"
var supportedStorage = map[string]func(path string) (Storage, error){
"kv": openKVStorage,
"bolt": openBoltStorage,
}
func RegisterStorageEngine(name string, fn func(path string) (Storage, error)) {
supportedStorage[name] = fn
}
type Storage interface {
Set(k, v []byte) error
Get(k []byte) ([]byte, error)
Delete(k []byte) error
ForEach(fn func(k, v []byte) error) error
Close() error
WALName() string
}
func OpenStorage(path string) (Storage, error) {
wse := os.Getenv("WUKONG_STORAGE_ENGINE")
if wse == "" {
wse = DEFAULT_STORAGE_ENGINE
}
if fn, has := supportedStorage[wse]; has {
return fn(path)
}
return nil, fmt.Errorf("unsupported storage engine %v", wse)
}
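
A minimal sketch of opening a store through this factory; the index path and payload are assumptions, and the engine can be switched to "kv" through the same environment variable.

```go
package main

import (
	"fmt"
	"log"
	"os"

	"github.com/huichen/wukong/storage"
)

func main() {
	// "bolt" is already the default; set here only to make the selection explicit.
	os.Setenv("WUKONG_STORAGE_ENGINE", "bolt")

	db, err := storage.OpenStorage("wukong.index") // hypothetical file path
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	if err := db.Set([]byte("doc1"), []byte("payload")); err != nil {
		log.Fatal(err)
	}
	v, _ := db.Get([]byte("doc1"))
	fmt.Println(string(v))
}
```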

View File

@@ -0,0 +1,27 @@
package types
type DocumentIndexData struct {
// Full text of the document; must be UTF-8. Used to produce the keywords to be indexed.
Content string
// Keywords of the document.
// When Content is non-empty, the keywords are obtained by segmenting Content first.
// Tokens exists so that wukong's built-in segmenter can be bypassed and the
// segmentation and preprocessing done outside the engine.
Tokens []TokenData
// Document labels; must be UTF-8. For example the document's category. These labels do not appear in the document text itself.
Labels []string
// Scoring fields of the document; may hold a struct of any type
Fields interface{}
}
// A single keyword of the document
type TokenData struct {
// Text of the keyword
Text string
// Byte offsets in the document at which the keyword's first byte appears
Locations []int
}
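
A hedged sketch of filling DocumentIndexData with externally produced tokens, bypassing the built-in segmenter; the text, byte offsets, and label are illustrative only.

```go
package main

import "github.com/huichen/wukong/types"

func main() {
	// Tokens are supplied directly, so Content can stay empty.
	data := types.DocumentIndexData{
		Tokens: []types.TokenData{
			{Text: "中华", Locations: []int{0}}, // byte offset of the keyword's first byte
			{Text: "人民", Locations: []int{6}},
		},
		Labels: []string{"新闻"},
	}
	_ = data
}
```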

View File

@@ -0,0 +1,126 @@
package types
import (
"log"
"runtime"
)
var (
// Default values for EngineInitOptions
defaultNumSegmenterThreads = runtime.NumCPU()
defaultNumShards = 2
defaultIndexerBufferLength = runtime.NumCPU()
defaultNumIndexerThreadsPerShard = runtime.NumCPU()
defaultRankerBufferLength = runtime.NumCPU()
defaultNumRankerThreadsPerShard = runtime.NumCPU()
defaultDefaultRankOptions = RankOptions{
ScoringCriteria: RankByBM25{},
}
defaultIndexerInitOptions = IndexerInitOptions{
IndexType: FrequenciesIndex,
BM25Parameters: &defaultBM25Parameters,
}
defaultBM25Parameters = BM25Parameters{
K1: 2.0,
B: 0.75,
}
defaultPersistentStorageShards = 8
)
type EngineInitOptions struct {
// Whether to use the segmenter.
// The segmenter is used by default; set this to true if you do not need
// segmentation inside the engine, in which case the SegmenterDictionaries and
// StopTokenFile settings are skipped at startup.
// Note: without the segmenter, the Content field of DocumentIndexData is
// ignored when IndexDocument is called.
NotUsingSegmenter bool
// Comma-separated dictionary files; see the comment of the
// sego.Segmenter.LoadDictionary function for details
SegmenterDictionaries string
// Stop-token file
StopTokenFile string
// Number of segmenter threads
NumSegmenterThreads int
// Number of indexer and ranker shards.
// Documents to be searched/ranked are distributed evenly across the shards.
NumShards int
// Channel buffer length of the indexer
IndexerBufferLength int
// Number of threads allocated to each indexer shard
NumIndexerThreadsPerShard int
// Channel buffer length of the ranker
RankerBufferLength int
// Number of threads allocated to each ranker shard
NumRankerThreadsPerShard int
// Indexer initialization options
IndexerInitOptions *IndexerInitOptions
// Default search options
DefaultRankOptions *RankOptions
// Whether to use persistent storage, plus the folder holding the database
// files and the number of storage shards
UsePersistentStorage bool
PersistentStorageFolder string
PersistentStorageShards int
}
// Init initializes EngineInitOptions, replacing every option the user did not set with its default value
func (options *EngineInitOptions) Init() {
if !options.NotUsingSegmenter {
if options.SegmenterDictionaries == "" {
log.Fatal("字典文件不能为空")
}
}
if options.NumSegmenterThreads == 0 {
options.NumSegmenterThreads = defaultNumSegmenterThreads
}
if options.NumShards == 0 {
options.NumShards = defaultNumShards
}
if options.IndexerBufferLength == 0 {
options.IndexerBufferLength = defaultIndexerBufferLength
}
if options.NumIndexerThreadsPerShard == 0 {
options.NumIndexerThreadsPerShard = defaultNumIndexerThreadsPerShard
}
if options.RankerBufferLength == 0 {
options.RankerBufferLength = defaultRankerBufferLength
}
if options.NumRankerThreadsPerShard == 0 {
options.NumRankerThreadsPerShard = defaultNumRankerThreadsPerShard
}
if options.IndexerInitOptions == nil {
options.IndexerInitOptions = &defaultIndexerInitOptions
}
if options.IndexerInitOptions.BM25Parameters == nil {
options.IndexerInitOptions.BM25Parameters = &defaultBM25Parameters
}
if options.DefaultRankOptions == nil {
options.DefaultRankOptions = &defaultDefaultRankOptions
}
if options.DefaultRankOptions.ScoringCriteria == nil {
options.DefaultRankOptions.ScoringCriteria = defaultDefaultRankOptions.ScoringCriteria
}
if options.PersistentStorageShards == 0 {
options.PersistentStorageShards = defaultPersistentStorageShards
}
}
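
A minimal sketch of how Init fills defaults; the dictionary, stop-token, and storage paths are assumptions.

```go
package main

import (
	"fmt"

	"github.com/huichen/wukong/types"
)

func main() {
	options := types.EngineInitOptions{
		SegmenterDictionaries:   "data/dictionary.txt",  // hypothetical path
		StopTokenFile:           "data/stop_tokens.txt", // hypothetical path
		UsePersistentStorage:    true,
		PersistentStorageFolder: "wukong.persistent",
	}
	// Every option left at its zero value is replaced by its default.
	options.Init()
	fmt.Println(options.NumShards, options.PersistentStorageShards) // 2 8
}
```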

70
vendor/github.com/huichen/wukong/types/index.go generated vendored Normal file
View File

@@ -0,0 +1,70 @@
package types
type DocumentIndex struct {
// DocId of the document
DocId uint64
// Keyword length of the document
TokenLength float32
// Index keys to add
Keywords []KeywordIndex
}
// A reverse-index entry; in effect it marks one (search key, document) pair.
type KeywordIndex struct {
// UTF-8 text of the search key
Text string
// Term frequency of the search key
Frequency float32
// Starting byte positions of the search key in the document, in ascending order
Starts []int
}
// Result returned by the indexer
type IndexedDocument struct {
DocId uint64
// BM25; a valid value is returned only when the index type is FrequenciesIndex or LocationsIndex
BM25 float32
// Proximity of the keywords in the document; see the comment of
// computeTokenProximity for what proximity means.
// A valid value is returned only when the index type is LocationsIndex.
TokenProximity int32
// Keyword positions derived from the proximity computation; the slice has the
// same length as the tokens passed to Lookup and corresponds to them one-to-one.
// A valid value is returned only when the index type is LocationsIndex.
TokenSnippetLocations []int
// Exact positions of the keywords in the text.
// A valid value is returned only when the index type is LocationsIndex.
TokenLocations [][]int
}
// Convenience type for adding document indices in batches
type DocumentsIndex []*DocumentIndex
func (docs DocumentsIndex) Len() int {
return len(docs)
}
func (docs DocumentsIndex) Swap(i, j int) {
docs[i], docs[j] = docs[j], docs[i]
}
func (docs DocumentsIndex) Less(i, j int) bool {
return docs[i].DocId < docs[j].DocId
}
// Convenience type for deleting document indices in batches
type DocumentsId []uint64
func (docs DocumentsId) Len() int {
return len(docs)
}
func (docs DocumentsId) Swap(i, j int) {
docs[i], docs[j] = docs[j], docs[i]
}
func (docs DocumentsId) Less(i, j int) bool {
return docs[i] < docs[j]
}
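
Since DocumentsIndex and DocumentsId implement sort.Interface, batches can be ordered by DocId with the standard library; a small sketch with made-up values:

```go
package main

import (
	"fmt"
	"sort"

	"github.com/huichen/wukong/types"
)

func main() {
	docs := types.DocumentsIndex{
		{DocId: 42, TokenLength: 2},
		{DocId: 7, TokenLength: 3},
	}
	sort.Sort(docs)                           // ascending DocId
	fmt.Println(docs[0].DocId, docs[1].DocId) // 7 42

	ids := types.DocumentsId{42, 7, 19}
	sort.Sort(ids)
	fmt.Println(ids) // [7 19 42]
}
```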

View File

@@ -0,0 +1,42 @@
package types
// These constants define what kind of data the reverse-index table stores
const (
// Store only the docId of each document
DocIdsIndex = 0
// Store keyword term frequencies, used to compute BM25
FrequenciesIndex = 1
// Store the exact byte positions (possibly several) at which a keyword appears in the document.
// If you want keyword proximity data, you must use a LocationsIndex-typed index.
LocationsIndex = 2
// Default cache size for documents waiting to be inserted into the index table
defaultDocCacheSize = 300000
)
// Indexer initialization options
type IndexerInitOptions struct {
// Type of the index table; see the constants above
IndexType int
// Cache size for documents waiting to be inserted into the index table
DocCacheSize int
// BM25 parameters
BM25Parameters *BM25Parameters
}
// See http://en.wikipedia.org/wiki/Okapi_BM25
// Default values are in engine_init_options.go
type BM25Parameters struct {
K1 float32
B float32
}
func (options *IndexerInitOptions) Init() {
if options.DocCacheSize == 0 {
options.DocCacheSize = defaultDocCacheSize
}
}
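
A short sketch of requesting positional indexing and letting Init fill the cache-size default:

```go
package main

import (
	"fmt"

	"github.com/huichen/wukong/types"
)

func main() {
	// LocationsIndex keeps byte positions, which token-proximity data requires.
	opts := types.IndexerInitOptions{IndexType: types.LocationsIndex}
	opts.Init()
	fmt.Println(opts.DocCacheSize) // 300000, the default
}
```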

View File

@@ -0,0 +1,17 @@
package types
// Generic interface for scoring rules
type ScoringCriteria interface {
// Scores a document. When documents are ranked, the first score is compared
// first; on a tie the comparison moves to the second score, and so on.
// Returning an empty slice means the document should be removed from the
// final ranking.
Score(doc IndexedDocument, fields interface{}) []float32
}
// A simple scoring rule: the document score is its BM25
type RankByBM25 struct {
}
func (rule RankByBM25) Score(doc IndexedDocument, fields interface{}) []float32 {
return []float32{doc.BM25}
}
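
A hedged sketch of a custom scoring rule; MyFields and RankByPriority are hypothetical names, and the rule assumes DocumentIndexData.Fields carries a MyFields value.

```go
package main

import "github.com/huichen/wukong/types"

// Hypothetical per-document payload stored in DocumentIndexData.Fields.
type MyFields struct {
	Priority float32
}

// Ranks by a custom priority first, falling back to BM25 on ties.
type RankByPriority struct{}

func (RankByPriority) Score(doc types.IndexedDocument, fields interface{}) []float32 {
	f, ok := fields.(MyFields)
	if !ok {
		return []float32{} // drop documents without the expected payload
	}
	return []float32{f.Priority, doc.BM25}
}

func main() {
	_ = types.RankOptions{ScoringCriteria: RankByPriority{}}
}
```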

View File

@@ -0,0 +1,45 @@
package types
type SearchRequest struct {
// Search phrase; must be UTF-8 and will be segmented.
// When it is the empty string, the keywords are read from Tokens below.
Text string
// Keywords; must be UTF-8. When Text is non-empty, Text takes precedence.
// Normally you do not need to supply keywords yourself unless you run your
// own segmentation program.
Tokens []string
// Document labels; must be UTF-8. Labels do not appear in the document text
// but also count as a kind of search key.
Labels []string
// When non-nil, only the keys contained in these DocIds are searched; the map
// values are ignored.
DocIds map[uint64]bool
// Ranking options
RankOptions *RankOptions
// Timeout in milliseconds. A value of zero or less disables the timeout.
// When a search times out, partial ranked results may still be returned.
Timeout int
// When true, only count the matching documents instead of returning them
CountDocsOnly bool
// Do not rank; useful when ranking can be done outside the engine (for
// example on the client). Enabling this saves noticeable time when many
// documents are returned.
Orderless bool
}
type RankOptions struct {
// Scoring rule for documents; when nil, the rule set at Engine
// initialization time is used
ScoringCriteria ScoringCriteria
// By default ReverseOrder=false, ranking from high score to low; otherwise
// from low to high
ReverseOrder bool
// Offset of the first result to output
OutputOffset int
// Maximum number of search results to output; 0 means no limit
MaxOutputs int
}
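
An illustrative SearchRequest; every value below is an assumption.

```go
package main

import "github.com/huichen/wukong/types"

func main() {
	request := types.SearchRequest{
		Text:    "人民共和国",
		Labels:  []string{"新闻"}, // restrict to documents carrying this label
		Timeout: 100,              // milliseconds; <= 0 disables the timeout
		RankOptions: &types.RankOptions{
			OutputOffset: 0,
			MaxOutputs:   10,
		},
	}
	_ = request
}
```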

View File

@@ -0,0 +1,57 @@
package types
import (
"github.com/huichen/wukong/utils"
)
type SearchResponse struct {
// Keywords used by the search
Tokens []string
// Matching documents, already ranked
Docs []ScoredDocument
// Whether the search timed out. Partial results may still be returned on timeout.
Timeout bool
// Number of matching documents. Note that this counts every document that
// satisfies the query, so it can be larger than the number of documents returned.
NumDocs int
}
type ScoredDocument struct {
DocId uint64
// Scores of the document.
// Search results are ordered by Scores: first by the first value, then by the
// second on a tie, and so on.
Scores []float32
// Byte positions in the text of the keywords used to build a snippet; the
// slice has the same length as SearchResponse.Tokens.
// Non-empty only when IndexType == LocationsIndex.
TokenSnippetLocations []int
// Positions at which the keywords appear.
// Non-empty only when IndexType == LocationsIndex.
TokenLocations [][]int
}
// For convenient sorting
type ScoredDocuments []ScoredDocument
func (docs ScoredDocuments) Len() int {
return len(docs)
}
func (docs ScoredDocuments) Swap(i, j int) {
docs[i], docs[j] = docs[j], docs[i]
}
func (docs ScoredDocuments) Less(i, j int) bool {
// To sort from high to low, this actually implements "more" rather than "less"
for iScore := 0; iScore < utils.MinInt(len(docs[i].Scores), len(docs[j].Scores)); iScore++ {
if docs[i].Scores[iScore] > docs[j].Scores[iScore] {
return true
} else if docs[i].Scores[iScore] < docs[j].Scores[iScore] {
return false
}
}
return len(docs[i].Scores) > len(docs[j].Scores)
}
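
Because Less is written as "more", sort.Sort puts the highest-scoring document first; a small sketch with made-up scores:

```go
package main

import (
	"fmt"
	"sort"

	"github.com/huichen/wukong/types"
)

func main() {
	docs := types.ScoredDocuments{
		{DocId: 1, Scores: []float32{1.2}},
		{DocId: 2, Scores: []float32{3.4}},
		{DocId: 3, Scores: []float32{3.4, 0.5}}, // longer score vector wins the tie
	}
	sort.Sort(docs)
	for _, d := range docs {
		fmt.Println(d.DocId, d.Scores) // prints DocIds in the order 3, 2, 1
	}
}
```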

13
vendor/github.com/huichen/wukong/utils/test_utils.go generated vendored Normal file
View File

@@ -0,0 +1,13 @@
package utils
import (
"fmt"
"testing"
)
func Expect(t *testing.T, expect string, actual interface{}) {
actualString := fmt.Sprint(actual)
if expect != actualString {
t.Errorf("期待值=\"%s\", 实际=\"%s\"", expect, actualString)
}
}
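
A minimal sketch of using Expect from a test inside this package; the cases are illustrative.

```go
package utils

import "testing"

func TestHelpers(t *testing.T) {
	// Expect stringifies the actual value with fmt.Sprint before comparing.
	Expect(t, "3", MinInt(3, 5))
	Expect(t, "2", AbsInt(-2))
}
```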

15
vendor/github.com/huichen/wukong/utils/utils.go generated vendored Normal file
View File

@@ -0,0 +1,15 @@
package utils
func AbsInt(a int) int {
if a < 0 {
return -a
}
return a
}
func MinInt(a, b int) int {
if a < b {
return a
}
return b
}