Mirror of https://github.com/mindoc-org/mindoc.git (synced 2026-01-22 21:02:12 +08:00)
Commit: 搭建框架 (set up the framework)
43 vendor/github.com/huichen/sego/README.md (generated, vendored, normal file)
@@ -0,0 +1,43 @@
sego
====

Chinese word segmentation in Go

The <a href="https://github.com/huichen/sego/blob/master/dictionary.go">dictionary</a> is implemented as a Double-Array Trie;
the <a href="https://github.com/huichen/sego/blob/master/segmenter.go">segmenter</a> uses a frequency-based shortest-path algorithm with dynamic programming.

It supports both normal and search-engine segmentation modes, user dictionaries, and part-of-speech tagging, and can run as a <a href="https://github.com/huichen/sego/blob/master/server/server.go">JSON RPC service</a>.

Segmentation speed: <a href="https://github.com/huichen/sego/blob/master/tools/benchmark.go">single-threaded</a> 9MB/s, <a href="https://github.com/huichen/sego/blob/master/tools/goroutines.go">concurrent goroutines</a> 42MB/s (8-core MacBook Pro).

# Install / update

```
go get -u github.com/huichen/sego
```

# Usage

```go
package main

import (
    "fmt"
    "github.com/huichen/sego"
)

func main() {
    // Load the dictionary
    var segmenter sego.Segmenter
    segmenter.LoadDictionary("github.com/huichen/sego/data/dictionary.txt")

    // Segment the text
    text := []byte("中华人民共和国中央人民政府")
    segments := segmenter.Segment(text)

    // Process the segmentation result
    // Both normal mode and search mode are supported; see the comments on SegmentsToString.
    fmt.Println(sego.SegmentsToString(segments, false))
}
```
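Editor's note, not part of the upstream README: a hedged sketch of the same example in search-engine mode, which additionally emits each token's finer sub-segments. The dictionary path is the one used above, and the exact output depends on the dictionary.

```go
package main

import (
    "fmt"

    "github.com/huichen/sego"
)

func main() {
    var segmenter sego.Segmenter
    segmenter.LoadDictionary("github.com/huichen/sego/data/dictionary.txt")

    segments := segmenter.Segment([]byte("中华人民共和国中央人民政府"))

    // Search-engine mode: also prints the finer sub-segments of each token.
    fmt.Println(sego.SegmentsToString(segments, true))

    // The same result as a []string rather than one formatted string.
    fmt.Println(sego.SegmentsToSlice(segments, true))
}
```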
65 vendor/github.com/huichen/sego/dictionary.go (generated, vendored, normal file)
@@ -0,0 +1,65 @@
package sego

import "github.com/adamzy/cedar-go"

// Dictionary implements a prefix trie over strings; a token may end at either a leaf or an internal node
type Dictionary struct {
    trie           *cedar.Cedar // Cedar prefix trie
    maxTokenLength int          // length of the longest token in the dictionary
    tokens         []Token      // all tokens in the dictionary, kept for easy iteration
    totalFrequency int64        // sum of the frequencies of all tokens in the dictionary
}

func NewDictionary() *Dictionary {
    return &Dictionary{trie: cedar.New()}
}

// Length of the longest token in the dictionary
func (dict *Dictionary) MaxTokenLength() int {
    return dict.maxTokenLength
}

// Number of tokens in the dictionary
func (dict *Dictionary) NumTokens() int {
    return len(dict.tokens)
}

// Sum of the frequencies of all tokens in the dictionary
func (dict *Dictionary) TotalFrequency() int64 {
    return dict.totalFrequency
}

// Adds a token to the dictionary
func (dict *Dictionary) addToken(token Token) {
    bytes := textSliceToBytes(token.text)
    _, err := dict.trie.Get(bytes)
    if err == nil {
        return
    }

    dict.trie.Insert(bytes, dict.NumTokens())
    dict.tokens = append(dict.tokens, token)
    dict.totalFrequency += int64(token.frequency)
    if len(token.text) > dict.maxTokenLength {
        dict.maxTokenLength = len(token.text)
    }
}

// Finds all tokens in the dictionary that are prefixes of the character sequence words.
// The return value is the number of tokens found.
func (dict *Dictionary) lookupTokens(words []Text, tokens []*Token) (numOfTokens int) {
    var id, value int
    var err error
    for _, word := range words {
        id, err = dict.trie.Jump(word, id)
        if err != nil {
            break
        }
        value, err = dict.trie.Value(id)
        if err == nil {
            tokens[numOfTokens] = &dict.tokens[value]
            numOfTokens++
        }
    }
    return
}
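Editor's note, not part of the vendored file: `lookupTokens` walks the trie with `Jump` one character at a time and reports a token whenever the node it reaches carries a value. A hedged sketch of that behaviour, which would have to live inside package sego because the methods are unexported; the tokens and frequencies are made up.

```go
package sego

import "fmt"

// demoDictionaryLookup is a sketch, not part of the library: it feeds two made-up
// tokens into a Dictionary and lists every token that is a prefix of the input.
func demoDictionaryLookup() {
    dict := NewDictionary()
    dict.addToken(Token{text: splitTextToWords([]byte("人民")), frequency: 100})
    dict.addToken(Token{text: splitTextToWords([]byte("人民政府")), frequency: 50})

    // Buffer sized by the longest token, the same way segmentWords does it.
    buf := make([]*Token, dict.MaxTokenLength())
    n := dict.lookupTokens(splitTextToWords([]byte("人民政府成立")), buf)

    // Both "人民" and "人民政府" are prefixes of the input, so n is 2.
    for i := 0; i < n; i++ {
        fmt.Println(buf[i].Text())
    }
}
```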
13 vendor/github.com/huichen/sego/license.txt (generated, vendored, normal file)
@@ -0,0 +1,13 @@
Copyright 2013 Hui Chen

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
28 vendor/github.com/huichen/sego/segment.go (generated, vendored, normal file)
@@ -0,0 +1,28 @@
package sego

// A single token occurrence in the text
type Segment struct {
    // Start byte position of the segment in the text
    start int

    // End byte position of the segment in the text (exclusive)
    end int

    // Token information
    token *Token
}

// Returns the start byte position of the segment in the text
func (s *Segment) Start() int {
    return s.start
}

// Returns the end byte position of the segment in the text (exclusive)
func (s *Segment) End() int {
    return s.end
}

// Returns the token information
func (s *Segment) Token() *Token {
    return s.token
}
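Editor's note, not part of the vendored file: since Start and End are byte offsets into the original UTF-8 input, each segment can be sliced straight out of the input bytes. A minimal sketch, reusing the dictionary path from the README example:

```go
package main

import (
    "fmt"

    "github.com/huichen/sego"
)

func main() {
    var seg sego.Segmenter
    seg.LoadDictionary("github.com/huichen/sego/data/dictionary.txt")

    text := []byte("中华人民共和国中央人民政府")
    for _, s := range seg.Segment(text) {
        // Start/End are byte offsets, so this recovers the exact surface form.
        fmt.Printf("%s\t%s\n", text[s.Start():s.End()], s.Token().Pos())
    }
}
```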
295 vendor/github.com/huichen/sego/segmenter.go (generated, vendored, normal file)
@@ -0,0 +1,295 @@
// Chinese word segmentation in Go
package sego

import (
    "bufio"
    "fmt"
    "log"
    "math"
    "os"
    "strconv"
    "strings"
    "unicode"
    "unicode/utf8"
)

const (
    minTokenFrequency = 2 // only load tokens whose frequency in the dictionary file is at least this value
)

// Segmenter is the word segmenter
type Segmenter struct {
    dict *Dictionary
}

// jumper records, for a character position, the forward jump information used by the Viterbi algorithm
type jumper struct {
    minDistance float32
    token       *Token
}

// Returns the dictionary used by the segmenter
func (seg *Segmenter) Dictionary() *Dictionary {
    return seg.dict
}

// Loads the dictionary from file
//
// Several dictionary files can be loaded at once; separate the file names with ",".
// Dictionaries listed first take precedence, for example
//   "用户词典.txt,通用词典.txt"
// When a token appears in both the user dictionary and the general dictionary,
// the entry from the user dictionary is used.
//
// The dictionary format is (one token per line):
//   token_text frequency part_of_speech
func (seg *Segmenter) LoadDictionary(files string) {
    seg.dict = NewDictionary()
    for _, file := range strings.Split(files, ",") {
        log.Printf("载入sego词典 %s", file)
        dictFile, err := os.Open(file)
        defer dictFile.Close()
        if err != nil {
            log.Fatalf("无法载入字典文件 \"%s\" \n", file)
        }

        reader := bufio.NewReader(dictFile)
        var text string
        var freqText string
        var frequency int
        var pos string

        // Read tokens line by line
        for {
            size, _ := fmt.Fscanln(reader, &text, &freqText, &pos)

            if size == 0 {
                // end of file
                break
            } else if size < 2 {
                // invalid line
                continue
            } else if size == 2 {
                // no part-of-speech tag; use the empty string
                pos = ""
            }

            // Parse the frequency
            var err error
            frequency, err = strconv.Atoi(freqText)
            if err != nil {
                continue
            }

            // Skip tokens whose frequency is too low
            if frequency < minTokenFrequency {
                continue
            }

            // Add the token to the dictionary
            words := splitTextToWords([]byte(text))
            token := Token{text: words, frequency: frequency, pos: pos}
            seg.dict.addToken(token)
        }
    }

    // Compute each token's path value; see the comments on the Token struct for its meaning
    logTotalFrequency := float32(math.Log2(float64(seg.dict.totalFrequency)))
    for i := range seg.dict.tokens {
        token := &seg.dict.tokens[i]
        token.distance = logTotalFrequency - float32(math.Log2(float64(token.frequency)))
    }

    // Split each token further into sub-segments for search-engine mode; see the Token struct comments for how this mode is used.
    for i := range seg.dict.tokens {
        token := &seg.dict.tokens[i]
        segments := seg.segmentWords(token.text, true)

        // Count the sub-tokens that need to be added
        numTokensToAdd := 0
        for iToken := 0; iToken < len(segments); iToken++ {
            if len(segments[iToken].token.text) > 1 {
                // Skip tokens that are a single character long
                // TODO: this deserves more thought, especially when the dictionary contains English compound words
                numTokensToAdd++
            }
        }
        token.segments = make([]*Segment, numTokensToAdd)

        // Add the sub-tokens
        iSegmentsToAdd := 0
        for iToken := 0; iToken < len(segments); iToken++ {
            if len(segments[iToken].token.text) > 1 {
                token.segments[iSegmentsToAdd] = &segments[iToken]
                iSegmentsToAdd++
            }
        }
    }

    log.Println("sego词典载入完毕")
}

// Segments a text
//
// Input:
//   bytes a byte slice of UTF-8 encoded text
//
// Output:
//   []Segment the resulting segments
func (seg *Segmenter) Segment(bytes []byte) []Segment {
    return seg.internalSegment(bytes, false)
}

func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
    // Handle the degenerate case
    if len(bytes) == 0 {
        return []Segment{}
    }

    // Split the text into characters
    text := splitTextToWords(bytes)

    return seg.segmentWords(text, searchMode)
}

func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
    // In search mode a single-character token cannot be split any further
    if searchMode && len(text) == 1 {
        return []Segment{}
    }

    // jumpers holds the forward jump information at every character position: the token
    // of that jump and the shortest-path value from the start of the text to the character
    jumpers := make([]jumper, len(text))

    tokens := make([]*Token, seg.dict.maxTokenLength)
    for current := 0; current < len(text); current++ {
        // Shortest path at the previous character, used as the base for the path values that follow
        var baseDistance float32
        if current == 0 {
            // At the beginning of the text the base distance is zero
            baseDistance = 0
        } else {
            baseDistance = jumpers[current-1].minDistance
        }

        // Find all tokens that start at the current character
        numTokens := seg.dict.lookupTokens(
            text[current:minInt(current+seg.dict.maxTokenLength, len(text))], tokens)

        // For every candidate token, update the jump information at the character where it ends
        for iToken := 0; iToken < numTokens; iToken++ {
            location := current + len(tokens[iToken].text) - 1
            if !searchMode || current != 0 || location != len(text)-1 {
                updateJumper(&jumpers[location], baseDistance, tokens[iToken])
            }
        }

        // If no token covers just the current character, add a pseudo token for it
        if numTokens == 0 || len(tokens[0].text) > 1 {
            updateJumper(&jumpers[current], baseDistance,
                &Token{text: []Text{text[current]}, frequency: 1, distance: 32, pos: "x"})
        }
    }

    // First backward pass: count the segments to be emitted
    numSeg := 0
    for index := len(text) - 1; index >= 0; {
        location := index - len(jumpers[index].token.text) + 1
        numSeg++
        index = location - 1
    }

    // Second backward pass: write the segments into the final result
    outputSegments := make([]Segment, numSeg)
    for index := len(text) - 1; index >= 0; {
        location := index - len(jumpers[index].token.text) + 1
        numSeg--
        outputSegments[numSeg].token = jumpers[index].token
        index = location - 1
    }

    // Compute the byte positions of each segment
    bytePosition := 0
    for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
        outputSegments[iSeg].start = bytePosition
        bytePosition += textSliceByteLength(outputSegments[iSeg].token.text)
        outputSegments[iSeg].end = bytePosition
    }
    return outputSegments
}

// Updates the jump information when
// 1. the position has never been visited (jumper.minDistance is zero), or
// 2. its current shortest path is larger than the new one,
// setting the position's shortest-path value to baseDistance plus the new token's distance
func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
    newDistance := baseDistance + token.distance
    if jumper.minDistance == 0 || jumper.minDistance > newDistance {
        jumper.minDistance = newDistance
        jumper.token = token
    }
}

// Returns the smaller of two integers
func minInt(a, b int) int {
    if a > b {
        return b
    }
    return a
}

// Returns the larger of two integers
func maxInt(a, b int) int {
    if a > b {
        return a
    }
    return b
}

// Splits a text into characters
func splitTextToWords(text Text) []Text {
    output := make([]Text, 0, len(text)/3)
    current := 0
    inAlphanumeric := true
    alphanumericStart := 0
    for current < len(text) {
        r, size := utf8.DecodeRune(text[current:])
        if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) {
            // The current rune is a Latin letter or digit (not a CJK character)
            if !inAlphanumeric {
                alphanumericStart = current
                inAlphanumeric = true
            }
        } else {
            if inAlphanumeric {
                inAlphanumeric = false
                if current != 0 {
                    output = append(output, toLower(text[alphanumericStart:current]))
                }
            }
            output = append(output, text[current:current+size])
        }
        current += size
    }

    // Handle a trailing alphanumeric run
    if inAlphanumeric {
        if current != 0 {
            output = append(output, toLower(text[alphanumericStart:current]))
        }
    }

    return output
}

// Converts an English word to lower case
func toLower(text []byte) []byte {
    output := make([]byte, len(text))
    for i, t := range text {
        if t >= 'A' && t <= 'Z' {
            output[i] = t - 'A' + 'a'
        } else {
            output[i] = t
        }
    }
    return output
}
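Editor's note, not part of the vendored file: the LoadDictionary comment documents the comma-separated multi-dictionary form and the line format `token_text frequency part_of_speech`. A hedged sketch, where `user_dict.txt` and its entries are hypothetical:

```go
package main

import "github.com/huichen/sego"

func main() {
    // Hypothetical user dictionary "user_dict.txt", one token per line:
    //
    //   云原生 1000 n
    //   微服务 800 n
    //
    // Frequencies below minTokenFrequency (2) would be skipped at load time.
    // Dictionaries listed first win when a token appears in more than one file.
    var seg sego.Segmenter
    seg.LoadDictionary("user_dict.txt,github.com/huichen/sego/data/dictionary.txt")

    _ = seg.Segment([]byte("云原生微服务"))
}
```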
38 vendor/github.com/huichen/sego/test_utils.go (generated, vendored, normal file)
@@ -0,0 +1,38 @@
package sego

import (
    "fmt"
    "testing"
)

// expect fails the test when the string form of actual differs from the expected string
func expect(t *testing.T, expect string, actual interface{}) {
    actualString := fmt.Sprint(actual)
    if expect != actualString {
        t.Errorf("期待值=\"%s\", 实际=\"%s\"", expect, actualString)
    }
}

// printTokens joins the texts of the first numTokens tokens, separated by spaces
func printTokens(tokens []*Token, numTokens int) (output string) {
    for iToken := 0; iToken < numTokens; iToken++ {
        for _, word := range tokens[iToken].text {
            output += fmt.Sprint(string(word))
        }
        output += " "
    }
    return
}

// toWords converts each input string into a Text character
func toWords(strings ...string) []Text {
    words := []Text{}
    for _, s := range strings {
        words = append(words, []byte(s))
    }
    return words
}

// bytesToString joins a slice of characters with "/" separators
func bytesToString(bytes []Text) (output string) {
    for _, b := range bytes {
        output += (string(b) + "/")
    }
    return
}
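Editor's note, not part of the vendored file: a hedged sketch of how these helpers are meant to be used, written as it might appear in a `_test.go` file inside the package; the test name is made up.

```go
package sego

import "testing"

// TestSplitTextToWordsSketch is illustrative only: it checks that an English run
// stays one lowercased unit while each CJK character becomes its own unit.
func TestSplitTextToWordsSketch(t *testing.T) {
    words := splitTextToWords([]byte("Sego分词"))

    // bytesToString joins the characters with "/".
    expect(t, "sego/分/词/", bytesToString(words))
}
```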
50 vendor/github.com/huichen/sego/token.go (generated, vendored, normal file)
@@ -0,0 +1,50 @@
package sego

// Text is a string type used to represent
// 1. a single character, such as "中" or "国"; for English, one character unit is a whole word
// 2. a token, such as "中国" or "人口"
// 3. a piece of text, such as "中国有十三亿人口"
type Text []byte

// A token
type Token struct {
    // The token's text, stored as a slice of characters
    text []Text

    // The token's frequency in the corpus
    frequency int

    // log2(total frequency / token frequency), which equals log2(1/p(token)); used as the
    // token's path length in the dynamic program. Maximizing prod(p(token)) is equivalent
    // to minimizing sum(distance(token)), which is where the name "shortest path" comes from.
    distance float32

    // Part-of-speech tag
    pos string

    // Finer segmentation of this token's text; see the comment on the Segments function.
    segments []*Segment
}

// Returns the token text
func (token *Token) Text() string {
    return textSliceToString(token.text)
}

// Returns the token's frequency in the corpus
func (token *Token) Frequency() int {
    return token.frequency
}

// Returns the token's part-of-speech tag
func (token *Token) Pos() string {
    return token.pos
}

// Finer segmentation of this token's text. For example, the token "中华人民共和国中央人民政府"
// has two sub-tokens, "中华人民共和国" and "中央人民政府". Sub-tokens may in turn have their own
// sub-tokens, forming a tree; traversing this tree yields every fine-grained split of the token,
// which is mainly used when a search engine runs full-text search over a piece of text.
func (token *Token) Segments() []*Segment {
    return token.segments
}
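Editor's note, not part of the vendored file: Segments() exposes the sub-token tree described above. A hedged sketch of a recursive walk that collects every fine-grained token text; the helper name is made up and mirrors the unexported tokenToSlice in utils.go.

```go
package main

import (
    "fmt"

    "github.com/huichen/sego"
)

// collectTexts walks the sub-token tree depth-first, finest splits first,
// and appends the text of every token it visits.
func collectTexts(token *sego.Token, out []string) []string {
    for _, sub := range token.Segments() {
        out = collectTexts(sub.Token(), out)
    }
    return append(out, token.Text())
}

func main() {
    var seg sego.Segmenter
    seg.LoadDictionary("github.com/huichen/sego/data/dictionary.txt")

    var texts []string
    for _, s := range seg.Segment([]byte("中华人民共和国中央人民政府")) {
        texts = collectTexts(s.Token(), texts)
    }
    fmt.Println(texts)
}
```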
93 vendor/github.com/huichen/sego/utils.go (generated, vendored, normal file)
@@ -0,0 +1,93 @@
package sego

import (
    "bytes"
    "fmt"
)

// Renders the segmentation result as a string
//
// There are two output modes; take "中华人民共和国" as an example:
//
// Normal mode (searchMode=false) outputs the single token "中华人民共和国/ns "
// Search mode (searchMode=true) outputs the normal-mode result further subdivided:
// "中华/nz 人民/n 共和/nz 共和国/ns 人民共和国/nt 中华人民共和国/ns "
//
// Search mode is mainly used to give a search engine as many keywords as possible;
// see the comments on the Token struct for details.
func SegmentsToString(segs []Segment, searchMode bool) (output string) {
    if searchMode {
        for _, seg := range segs {
            output += tokenToString(seg.token)
        }
    } else {
        for _, seg := range segs {
            output += fmt.Sprintf(
                "%s/%s ", textSliceToString(seg.token.text), seg.token.pos)
        }
    }
    return
}

func tokenToString(token *Token) (output string) {
    for _, s := range token.segments {
        output += tokenToString(s.token)
    }
    output += fmt.Sprintf("%s/%s ", textSliceToString(token.text), token.pos)
    return
}

// Renders the segmentation result as a slice of strings
//
// There are two output modes; take "中华人民共和国" as an example:
//
// Normal mode (searchMode=false) outputs the single token "[中华人民共和国]"
// Search mode (searchMode=true) outputs the normal-mode result further subdivided:
// "[中华 人民 共和 共和国 人民共和国 中华人民共和国]"
//
// Search mode is mainly used to give a search engine as many keywords as possible;
// see the comments on the Token struct for details.
func SegmentsToSlice(segs []Segment, searchMode bool) (output []string) {
    if searchMode {
        for _, seg := range segs {
            output = append(output, tokenToSlice(seg.token)...)
        }
    } else {
        for _, seg := range segs {
            output = append(output, seg.token.Text())
        }
    }
    return
}

func tokenToSlice(token *Token) (output []string) {
    for _, s := range token.segments {
        output = append(output, tokenToSlice(s.token)...)
    }
    output = append(output, textSliceToString(token.text))
    return output
}

// Concatenates multiple characters into a single output string
func textSliceToString(text []Text) string {
    var output string
    for _, word := range text {
        output += string(word)
    }
    return output
}

// Returns the total byte length of multiple characters
func textSliceByteLength(text []Text) (length int) {
    for _, word := range text {
        length += len(word)
    }
    return
}

func textSliceToBytes(text []Text) []byte {
    var buf bytes.Buffer
    for _, word := range text {
        buf.Write(word)
    }
    return buf.Bytes()
}