-
Notifications
You must be signed in to change notification settings - Fork 217
/
Copy pathdictionary.go
executable file
·143 lines (117 loc) · 3.28 KB
/
dictionary.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
// Copyright 2013 Hui Chen
// Copyright 2016 ego authors
//
// Copyright 2016 The go-ego Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// https://github.com/go-ego/gse/blob/master/LICENSE
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
package gse
import (
"github.com/vcaesar/cedar"
)
// Dictionary 结构体实现了一个字串双数组树,
// 一个分词可能出现在叶子节点也有可能出现在非叶节点
type Dictionary struct {
trie *cedar.Cedar // Cedar 双数组树
maxTokenLen int // 词典中最长的分词
Tokens []Token // 词典中所有的分词,方便遍历
totalFrequency float64 // 词典中所有分词的频率之和
}
// NewDict new dictionary
func NewDict() *Dictionary {
return &Dictionary{trie: cedar.New()}
}
// MaxTokenLen 词典中最长的分词
func (dict *Dictionary) MaxTokenLen() int {
return dict.maxTokenLen
}
// NumTokens 词典中分词数目
func (dict *Dictionary) NumTokens() int {
return len(dict.Tokens)
}
// TotalFreq 词典中所有分词的频率之和
func (dict *Dictionary) TotalFreq() float64 {
return dict.totalFrequency
}
// AddToken 向词典中加入一个分词
func (dict *Dictionary) AddToken(token Token) error {
bytes := textSliceToBytes(token.text)
val, err := dict.trie.Get(bytes)
if err == nil || val > 0 {
return nil
}
err = dict.trie.Insert(bytes, dict.NumTokens())
if err != nil {
return err
}
dict.Tokens = append(dict.Tokens, token)
dict.totalFrequency += token.frequency
if len(token.text) > dict.maxTokenLen {
dict.maxTokenLen = len(token.text)
}
return nil
}
// RemoveToken remove token in dictionary
func (dict *Dictionary) RemoveToken(token Token) error {
bytes := textSliceToBytes(token.text)
return dict.trie.Delete(bytes)
}
// LookupTokens 在词典中查找和字元组 words 可以前缀匹配的所有分词
// 返回值为找到的分词数
func (dict *Dictionary) LookupTokens(
words []Text, tokens []*Token) (numOfTokens int) {
var (
id, value int
err error
)
for _, word := range words {
id, err = dict.trie.Jump(word, id)
if err != nil {
break
}
value, err = dict.trie.Value(id)
if err == nil {
tokens[numOfTokens] = &dict.Tokens[value]
numOfTokens++
}
}
return
}
// Find find the word in the dictionary is non-existent
// and the word's frequency, pos
func (dict *Dictionary) Find(word []byte) (float64, string, bool) {
var (
id, value int
freq float64
err error
)
id, err = dict.trie.Jump(word, id)
if err != nil {
return 0, "", false
}
value, err = dict.trie.Value(id)
if err != nil && id != 0 {
return 0, "", true
}
if err != nil {
return 0, "", false
}
freq = dict.Tokens[value].frequency
pos := dict.Tokens[value].pos
return freq, pos, true
}
// Value find word in the dictionary
// retrun the word's value, id
func (dict *Dictionary) Value(word []byte) (val, id int, err error) {
id, err = dict.trie.Jump(word, id)
if err != nil {
return 0, id, err
}
val, err = dict.trie.Value(id)
return
}