You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
89 lines
2.4 KiB
89 lines
2.4 KiB
// Copyright 2013 Hui Chen
// Copyright 2016 ego authors
//
// Licensed under the Apache License, Version 2.0 (the "License"): you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
|
|
package gse |
|
|
|
import ( |
|
"github.com/go-ego/cedar" |
|
) |
|
|
|
// Dictionary 结构体实现了一个字串前缀树, |
|
// 一个分词可能出现在叶子节点也有可能出现在非叶节点 |
|
type Dictionary struct { |
|
trie *cedar.Cedar // Cedar 前缀树 |
|
maxTokenLen int // 词典中最长的分词 |
|
tokens []Token // 词典中所有的分词,方便遍历 |
|
totalFrequency int64 // 词典中所有分词的频率之和 |
|
} |
|
|
|
// NewDict new dictionary |
|
func NewDict() *Dictionary { |
|
return &Dictionary{trie: cedar.New()} |
|
} |
|
|
|
// MaxTokenLen 词典中最长的分词 |
|
func (dict *Dictionary) MaxTokenLen() int { |
|
return dict.maxTokenLen |
|
} |
|
|
|
// NumTokens 词典中分词数目 |
|
func (dict *Dictionary) NumTokens() int { |
|
return len(dict.tokens) |
|
} |
|
|
|
// TotalFrequency 词典中所有分词的频率之和 |
|
func (dict *Dictionary) TotalFrequency() int64 { |
|
return dict.totalFrequency |
|
} |
|
|
|
// addToken 向词典中加入一个分词 |
|
func (dict *Dictionary) addToken(token Token) { |
|
bytes := textSliceToBytes(token.text) |
|
_, err := dict.trie.Get(bytes) |
|
if err == nil { |
|
return |
|
} |
|
|
|
dict.trie.Insert(bytes, dict.NumTokens()) |
|
dict.tokens = append(dict.tokens, token) |
|
dict.totalFrequency += int64(token.frequency) |
|
if len(token.text) > dict.maxTokenLen { |
|
dict.maxTokenLen = len(token.text) |
|
} |
|
} |
|
|
|
// lookupTokens 在词典中查找和字元组 words 可以前缀匹配的所有分词 |
|
// 返回值为找到的分词数 |
|
func (dict *Dictionary) lookupTokens(words []Text, |
|
tokens []*Token) (numOfTokens int) { |
|
var ( |
|
id, value int |
|
err error |
|
) |
|
|
|
for _, word := range words { |
|
id, err = dict.trie.Jump(word, id) |
|
if err != nil { |
|
break |
|
} |
|
value, err = dict.trie.Value(id) |
|
if err == nil { |
|
tokens[numOfTokens] = &dict.tokens[value] |
|
numOfTokens++ |
|
} |
|
} |
|
|
|
return |
|
}
|
|
|