搭建框架

2026-01-22 21:02:12 +08:00 · 2017-04-21 18:20:35 +08:00
parent d58087f723
commit 67486f0866
727 changed files with 831224 additions and 37 deletions
--- a/vendor/github.com/huichen/sego/README.md
+++ b/vendor/github.com/huichen/sego/README.md
@@ -0,0 +1,43 @@
+sego
+====
+
+Go中文分词
+
+<a href="https://github.com/huichen/sego/blob/master/dictionary.go">词典</a>用双数组trie（Double-Array Trie）实现，
+<a href="https://github.com/huichen/sego/blob/master/segmenter.go">分词器</a>算法为基于词频的最短路径加动态规划。
+
+支持普通和搜索引擎两种分词模式，支持用户词典、词性标注，可运行<a href="https://github.com/huichen/sego/blob/master/server/server.go">JSON RPC服务</a>。
+
+分词速度<a href="https://github.com/huichen/sego/blob/master/tools/benchmark.go">单线程</a>9MB/s，<a href="https://github.com/huichen/sego/blob/master/tools/goroutines.go">goroutines并发</a>42MB/s（8核Macbook Pro）。
+
+# 安装/更新
+
+```
+go get -u github.com/huichen/sego
+```
+
+# 使用
+
+
+```go
+package main
+
+import (
+	"fmt"
+	"github.com/huichen/sego"
+)
+
+func main() {
+	// 载入词典
+	var segmenter sego.Segmenter
+	segmenter.LoadDictionary("github.com/huichen/sego/data/dictionary.txt")
+
+	// 分词
+	text := []byte("中华人民共和国中央人民政府")
+	segments := segmenter.Segment(text)
+  
+	// 处理分词结果
+	// 支持普通模式和搜索模式两种分词，见代码中SegmentsToString函数的注释。
+	fmt.Println(sego.SegmentsToString(segments, false)) 
+}
+```
--- a/vendor/github.com/huichen/sego/dictionary.go
+++ b/vendor/github.com/huichen/sego/dictionary.go
@@ -0,0 +1,65 @@
+package sego
+
+import "github.com/adamzy/cedar-go"
+
+// Dictionary implements a prefix tree over token strings; a token may sit on
+// a leaf node or on a non-leaf node of the tree.
+type Dictionary struct {
+	trie           *cedar.Cedar // Cedar prefix tree
+	maxTokenLength int          // largest len(Token.text) among the tokens in the dictionary
+	tokens         []Token      // all tokens in the dictionary, kept for easy iteration
+	totalFrequency int64        // sum of the frequencies of all tokens in the dictionary
+}
+
+// NewDictionary returns an empty Dictionary backed by a newly created Cedar trie.
+func NewDictionary() *Dictionary {
+	dict := new(Dictionary)
+	dict.trie = cedar.New()
+	return dict
+}
+
+// MaxTokenLength returns the length of the longest token in the dictionary.
+func (dict *Dictionary) MaxTokenLength() int {
+	return dict.maxTokenLength
+}
+
+// NumTokens returns the number of tokens held in the dictionary.
+func (dict *Dictionary) NumTokens() int {
+	return len(dict.tokens)
+}
+
+// TotalFrequency returns the sum of the frequencies of all tokens in the dictionary.
+func (dict *Dictionary) TotalFrequency() int64 {
+	return dict.totalFrequency
+}
+
+// addToken adds a token to the dictionary. A token whose text is already
+// present in the trie is ignored, so the first insertion wins.
+func (dict *Dictionary) addToken(token Token) {
+	key := textSliceToBytes(token.text)
+	if _, err := dict.trie.Get(key); err == nil {
+		// The key is already stored; keep the existing entry.
+		return
+	}
+
+	// The trie value is the index the token is about to occupy in dict.tokens.
+	index := len(dict.tokens)
+	dict.trie.Insert(key, index)
+	dict.tokens = append(dict.tokens, token)
+	dict.totalFrequency += int64(token.frequency)
+	if n := len(token.text); n > dict.maxTokenLength {
+		dict.maxTokenLength = n
+	}
+}
+
+// 在词典中查找和字元组words可以前缀匹配的所有分词
+// 返回值为找到的分词数
+func (dict *Dictionary) lookupTokens(words []Text, tokens []*Token) (numOfTokens int) {
+	var id, value int
+	var err error
+	for _, word := range words {
+		id, err = dict.trie.Jump(word, id)
+		if err != nil {
+			break
+		}
+		value, err = dict.trie.Value(id)
+		if err == nil {
+			tokens[numOfTokens] = &dict.tokens[value]
+			numOfTokens++
+		}
+	}
+	return
+}
--- a/vendor/github.com/huichen/sego/license.txt
+++ b/vendor/github.com/huichen/sego/license.txt
@@ -0,0 +1,13 @@
+Copyright 2013 Hui Chen
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
--- a/vendor/github.com/huichen/sego/segment.go
+++ b/vendor/github.com/huichen/sego/segment.go
@@ -0,0 +1,28 @@
+package sego
+
+// Segment is one token occurrence within a text.
+type Segment struct {
+	// Byte position in the text at which the token starts.
+	start int
+
+	// Byte position in the text at which the token ends (exclusive).
+	end int
+
+	// Token information.
+	token *Token
+}
+
+// Start returns the byte position in the text at which the token starts.
+func (s *Segment) Start() int {
+	return s.start
+}
+
+// End returns the byte position in the text at which the token ends (exclusive).
+func (s *Segment) End() int {
+	return s.end
+}
+
+// Token returns the token information of the segment.
+func (s *Segment) Token() *Token {
+	return s.token
+}
--- a/vendor/github.com/huichen/sego/segmenter.go
+++ b/vendor/github.com/huichen/sego/segmenter.go
@@ -0,0 +1,295 @@
+//Go中文分词
+package sego
+
+import (
+	"bufio"
+	"fmt"
+	"log"
+	"math"
+	"os"
+	"strconv"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+const (
+	minTokenFrequency = 2 // only tokens with at least this frequency are read from dictionary files
+)
+
+// Segmenter is the word segmenter.
+type Segmenter struct {
+	dict *Dictionary
+}
+
+// jumper records, for the Viterbi algorithm, the forward token-jump information at one text unit.
+type jumper struct {
+	minDistance float32
+	token       *Token
+}
+
+// Dictionary returns the dictionary used by the segmenter.
+func (seg *Segmenter) Dictionary() *Dictionary {
+	return seg.dict
+}
+
+// LoadDictionary loads the dictionary from one or more files, replacing any
+// dictionary the segmenter held before.
+//
+// Several dictionary files may be given, separated by ","; files listed
+// earlier are loaded first, for example
+// 	"用户词典.txt,通用词典.txt"
+// When a token appears in both the user dictionary and the general
+// dictionary, the entry from the user dictionary is the one that is used.
+//
+// Dictionary format (one token per line):
+//	token-text frequency part-of-speech
+//
+// The part-of-speech column may be omitted. Lines whose frequency is not an
+// integer or is below minTokenFrequency are skipped. If a file cannot be
+// opened, the process is terminated through log.Fatalf.
+func (seg *Segmenter) LoadDictionary(files string) {
+	seg.dict = NewDictionary()
+	for _, file := range strings.Split(files, ",") {
+		log.Printf("载入sego词典 %s", file)
+		dictFile, err := os.Open(file)
+		if err != nil {
+			log.Fatalf("无法载入字典文件 \"%s\" \n", file)
+		}
+
+		reader := bufio.NewReader(dictFile)
+		var text string
+		var freqText string
+		var frequency int
+		var pos string
+
+		// Read the tokens line by line.
+		for {
+			size, _ := fmt.Fscanln(reader, &text, &freqText, &pos)
+
+			if size == 0 {
+				// Nothing could be scanned: treated as end of file.
+				// NOTE(review): a line that yields no fields (e.g. a blank
+				// line) presumably ends the file early as well — confirm.
+				break
+			} else if size < 2 {
+				// Invalid line.
+				continue
+			} else if size == 2 {
+				// No part-of-speech tag: use the empty string.
+				pos = ""
+			}
+
+			// Parse the frequency.
+			var err error
+			frequency, err = strconv.Atoi(freqText)
+			if err != nil {
+				continue
+			}
+
+			// Drop tokens whose frequency is too small.
+			if frequency < minTokenFrequency {
+				continue
+			}
+
+			// Add the token to the dictionary.
+			words := splitTextToWords([]byte(text))
+			token := Token{text: words, frequency: frequency, pos: pos}
+			seg.dict.addToken(token)
+		}
+
+		// Release the handle before moving on to the next file; a defer inside
+		// the loop would keep every file open until LoadDictionary returns.
+		dictFile.Close()
+	}
+
+	// Compute the path value (distance) of every token; its meaning is
+	// described in the comments of the Token struct.
+	logTotalFrequency := float32(math.Log2(float64(seg.dict.totalFrequency)))
+	for i := range seg.dict.tokens {
+		token := &seg.dict.tokens[i]
+		token.distance = logTotalFrequency - float32(math.Log2(float64(token.frequency)))
+	}
+
+	// Split every token into finer sub-tokens for search-engine mode; see the
+	// comments of the Token struct for how that mode is used.
+	for i := range seg.dict.tokens {
+		token := &seg.dict.tokens[i]
+		segments := seg.segmentWords(token.text, true)
+
+		// Count the sub-tokens that need to be added.
+		numTokensToAdd := 0
+		for iToken := 0; iToken < len(segments); iToken++ {
+			if len(segments[iToken].token.text) > 1 {
+				// Sub-tokens that are a single text unit long are left out.
+				// TODO: this deserves more thought, especially when the
+				// dictionary contains English compound words.
+				numTokensToAdd++
+			}
+		}
+		token.segments = make([]*Segment, numTokensToAdd)
+
+		// Add the sub-tokens.
+		iSegmentsToAdd := 0
+		for iToken := 0; iToken < len(segments); iToken++ {
+			if len(segments[iToken].token.text) > 1 {
+				token.segments[iSegmentsToAdd] = &segments[iToken]
+				iSegmentsToAdd++
+			}
+		}
+	}
+
+	log.Println("sego词典载入完毕")
+}
+
+// Segment splits a text into tokens.
+//
+// Input:
+//	bytes	the UTF-8 text as a byte slice
+//
+// Output:
+//	[]Segment	the resulting segments
+func (seg *Segmenter) Segment(bytes []byte) []Segment {
+	return seg.internalSegment(bytes, false)
+}
+
+// internalSegment divides bytes into text units and segments them in the
+// requested mode. Empty input yields an empty, non-nil slice.
+func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
+	if len(bytes) > 0 {
+		return seg.segmentWords(splitTextToWords(bytes), searchMode)
+	}
+	return []Segment{}
+}
+
+// segmentWords segments text, which is already divided into text units, by
+// choosing the chain of tokens with the smallest total distance.
+// In search mode a token that spans the whole of text is not considered, and
+// a text of a single unit yields no segments.
+func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
+	// In search mode a single text unit cannot be divided any further.
+	if searchMode && len(text) == 1 {
+		return []Segment{}
+	}
+
+	// jumpers holds the forward-jump information at every text unit: the token
+	// of the jump that ends there, and the shortest path value from the start
+	// of the text to that unit.
+	jumpers := make([]jumper, len(text))
+
+	tokens := make([]*Token, seg.dict.maxTokenLength)
+	for current := 0; current < len(text); current++ {
+		// Take the shortest path at the previous unit as the base for the path values below.
+		var baseDistance float32
+		if current == 0 {
+			// At the very first text unit the base distance is zero.
+			baseDistance = 0
+		} else {
+			baseDistance = jumpers[current-1].minDistance
+		}
+
+		// Find every token that starts at the current text unit.
+		numTokens := seg.dict.lookupTokens(
+			text[current:minInt(current+seg.dict.maxTokenLength, len(text))], tokens)
+
+		// For each candidate token, update the jump information at the unit where it ends.
+		for iToken := 0; iToken < numTokens; iToken++ {
+			location := current + len(tokens[iToken].text) - 1
+			if !searchMode || current != 0 || location != len(text)-1 {
+				updateJumper(&jumpers[location], baseDistance, tokens[iToken])
+			}
+		}
+
+		// Add a pseudo-token when no token covers exactly the current text unit.
+		if numTokens == 0 || len(tokens[0].text) > 1 {
+			updateJumper(&jumpers[current], baseDistance,
+				&Token{text: []Text{text[current]}, frequency: 1, distance: 32, pos: "x"})
+		}
+	}
+
+	// First backward scan: count the segments that will be produced.
+	numSeg := 0
+	for index := len(text) - 1; index >= 0; {
+		location := index - len(jumpers[index].token.text) + 1
+		numSeg++
+		index = location - 1
+	}
+
+	// Second backward scan: store the tokens into the result, filling it from the end.
+	outputSegments := make([]Segment, numSeg)
+	for index := len(text) - 1; index >= 0; {
+		location := index - len(jumpers[index].token.text) + 1
+		numSeg--
+		outputSegments[numSeg].token = jumpers[index].token
+		index = location - 1
+	}
+
+	// Compute the byte positions of every segment.
+	bytePosition := 0
+	for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
+		outputSegments[iSeg].start = bytePosition
+		bytePosition += textSliceByteLength(outputSegments[iSeg].token.text)
+		outputSegments[iSeg].end = bytePosition
+	}
+	return outputSegments
+}
+
+// updateJumper stores token and the path value baseDistance+token.distance
+// into jumper when either
+// 	1. the position has never been visited (jumper.minDistance is zero), or
+//	2. the shortest path currently stored there is longer than the new one.
+// Otherwise jumper is left untouched.
+func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
+	candidate := baseDistance + token.distance
+	unvisited := jumper.minDistance == 0
+	shorter := jumper.minDistance > candidate
+	if unvisited || shorter {
+		jumper.token = token
+		jumper.minDistance = candidate
+	}
+}
+
+// minInt returns the smaller of two integers.
+func minInt(a, b int) int {
+	smaller := a
+	if b < a {
+		smaller = b
+	}
+	return smaller
+}
+
+// maxInt returns the larger of two integers.
+func maxInt(a, b int) int {
+	larger := b
+	if a > b {
+		larger = a
+	}
+	return larger
+}
+
+// splitTextToWords divides text into text units. A run of letters or digits
+// encoded in at most two bytes each becomes a single lower-cased unit; every
+// other rune becomes a unit of its own.
+func splitTextToWords(text Text) []Text {
+	words := make([]Text, 0, len(text)/3)
+	pos := 0
+	inRun := true
+	runStart := 0
+	for pos < len(text) {
+		r, width := utf8.DecodeRune(text[pos:])
+		isAlnum := width <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r))
+		switch {
+		case isAlnum && !inRun:
+			// A new alphanumeric run starts at this rune.
+			runStart = pos
+			inRun = true
+		case !isAlnum:
+			if inRun {
+				// Flush the run that ends just before this rune.
+				inRun = false
+				if pos != 0 {
+					words = append(words, toLower(text[runStart:pos]))
+				}
+			}
+			words = append(words, text[pos:pos+width])
+		}
+		pos += width
+	}
+
+	// Flush a run that reaches the end of the text.
+	if inRun && pos != 0 {
+		words = append(words, toLower(text[runStart:pos]))
+	}
+
+	return words
+}
+
+// toLower returns a copy of text in which the ASCII letters 'A'-'Z' are
+// replaced by their lower-case form; all other bytes are kept as they are.
+func toLower(text []byte) []byte {
+	lowered := make([]byte, len(text))
+	copy(lowered, text)
+	for i := range lowered {
+		if c := lowered[i]; 'A' <= c && c <= 'Z' {
+			lowered[i] = c + ('a' - 'A')
+		}
+	}
+	return lowered
+}
--- a/vendor/github.com/huichen/sego/test_utils.go
+++ b/vendor/github.com/huichen/sego/test_utils.go
@@ -0,0 +1,38 @@
+package sego
+
+import (
+	"fmt"
+	"testing"
+)
+
+// expect reports a test error when the default formatting of actual differs
+// from the expected string.
+func expect(t *testing.T, expect string, actual interface{}) {
+	if got := fmt.Sprint(actual); got != expect {
+		t.Errorf("期待值=\"%s\", 实际=\"%s\"", expect, got)
+	}
+}
+
+// printTokens renders the first numTokens tokens as their text, each
+// followed by a single space.
+func printTokens(tokens []*Token, numTokens int) (output string) {
+	for i := 0; i < numTokens; i++ {
+		output += textSliceToString(tokens[i].text) + " "
+	}
+	return output
+}
+
+// toWords converts each given string into a text unit.
+func toWords(strings ...string) []Text {
+	words := make([]Text, 0, len(strings))
+	for i := range strings {
+		words = append(words, Text(strings[i]))
+	}
+	return words
+}
+
+// bytesToString joins the text units into one string, writing a "/" after
+// every unit.
+func bytesToString(bytes []Text) (output string) {
+	for i := range bytes {
+		output = output + string(bytes[i]) + "/"
+	}
+	return output
+}
--- a/vendor/github.com/huichen/sego/token.go
+++ b/vendor/github.com/huichen/sego/token.go
@@ -0,0 +1,50 @@
+package sego
+
+// Text is a string type that can represent
+//	1. a single text unit, such as "中" or "国"; for English, one text unit is one word
+//	2. a token, such as "中国" or "人口"
+//	3. a passage of text, such as "中国有十三亿人口"
+type Text []byte
+
+// Token is a single dictionary token.
+type Token struct {
+	// The text of the token; this is in fact a slice of text units.
+	text []Text
+
+	// Frequency of the token in the corpus.
+	frequency int
+
+	// log2(total frequency / frequency of this token), which corresponds to
+	// log2(1/p(token)); it is the path length of the token in the dynamic
+	// programming. Maximising prod(p(token)) is equivalent to minimising
+	// sum(distance(token)), which is where the name "shortest path" comes from.
+	distance float32
+
+	// Part-of-speech tag.
+	pos string
+
+	// Finer segmentation of this token's text; see the comment on the Segments method.
+	segments []*Segment
+}
+
+// Text returns the text of the token.
+func (token *Token) Text() string {
+	return textSliceToString(token.text)
+}
+
+// Frequency returns the frequency of the token in the corpus.
+func (token *Token) Frequency() int {
+	return token.frequency
+}
+
+// Pos returns the part-of-speech tag of the token.
+func (token *Token) Pos() string {
+	return token.pos
+}
+
+// Segments returns the finer segmentation of this token's text. For example,
+// the token "中华人民共和国中央人民政府" has the two sub-tokens "中华人民共和国"
+// and "中央人民政府". Sub-tokens may have sub-tokens of their own, forming a
+// tree; walking that tree gives every fine-grained segmentation of the token,
+// which is mainly used by search engines for full-text search over a text.
+func (token *Token) Segments() []*Segment {
+	return token.segments
+}
--- a/vendor/github.com/huichen/sego/utils.go
+++ b/vendor/github.com/huichen/sego/utils.go
@@ -0,0 +1,93 @@
+package sego
+
+import (
+	"bytes"
+	"fmt"
+)
+
+// SegmentsToString renders segmentation results as a single string.
+//
+// There are two output modes; taking "中华人民共和国" as an example:
+//
+//  normal mode (searchMode=false) prints one token, "中华人民共和国/ns "
+//  search mode (searchMode=true) prints the finer splits of the normal result:
+//      "中华/nz 人民/n 共和/nz 共和国/ns 人民共和国/nt 中华人民共和国/ns "
+//
+// Search mode is mainly meant to give a search engine as many keywords as
+// possible; see the comments of the Token struct for details.
+func SegmentsToString(segs []Segment, searchMode bool) (output string) {
+	for _, seg := range segs {
+		if searchMode {
+			// Sub-tokens first, then the token itself.
+			output += tokenToString(seg.token)
+			continue
+		}
+		output += fmt.Sprintf(
+			"%s/%s ", textSliceToString(seg.token.text), seg.token.pos)
+	}
+	return output
+}
+
+// tokenToString renders the sub-tokens of token recursively, followed by the
+// token itself, each as "text/pos ".
+func tokenToString(token *Token) (output string) {
+	for i := range token.segments {
+		output += tokenToString(token.segments[i].token)
+	}
+	own := fmt.Sprintf("%s/%s ", textSliceToString(token.text), token.pos)
+	return output + own
+}
+
+// SegmentsToSlice renders segmentation results as a slice of strings.
+//
+// There are two output modes; taking "中华人民共和国" as an example:
+//
+//  normal mode (searchMode=false) yields one token, "[中华人民共和国]"
+//  search mode (searchMode=true) yields the finer splits of the normal result:
+//      "[中华 人民 共和 共和国 人民共和国 中华人民共和国]"
+//
+// Search mode is mainly meant to give a search engine as many keywords as
+// possible; see the comments of the Token struct for details.
+func SegmentsToSlice(segs []Segment, searchMode bool) (output []string) {
+	for _, seg := range segs {
+		if searchMode {
+			// Sub-tokens first, then the token itself.
+			output = append(output, tokenToSlice(seg.token)...)
+			continue
+		}
+		output = append(output, seg.token.Text())
+	}
+	return output
+}
+
+// tokenToSlice collects the texts of the sub-tokens of token recursively,
+// followed by the text of the token itself.
+func tokenToSlice(token *Token) (output []string) {
+	for i := range token.segments {
+		nested := tokenToSlice(token.segments[i].token)
+		output = append(output, nested...)
+	}
+	return append(output, textSliceToString(token.text))
+}
+
+// 将多个字元拼接一个字符串输出
+func textSliceToString(text []Text) string {
+	var output string
+	for _, word := range text {
+		output += string(word)
+	}
+	return output
+}
+
+// textSliceByteLength returns the total number of bytes in the text units.
+func textSliceByteLength(text []Text) (length int) {
+	for i := range text {
+		length += len(text[i])
+	}
+	return length
+}
+
+func textSliceToBytes(text []Text) []byte {
+	var buf bytes.Buffer
+	for _, word := range text {
+		buf.Write(word)
+	}
+	return buf.Bytes()
+}