-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbleve-sego.go
66 lines (57 loc) · 1.55 KB
/
bleve-sego.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
package bleve_sego
import (
	"errors"
	"fmt"
	"os"
	"strings"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/registry"
	"github.com/huichen/sego"
)
// init registers this package's analyzer and tokenizer constructors with the
// bleve registry. Both are registered under the same name, "sego".
func init() {
	const name = "sego"

	registry.RegisterAnalyzer(name, analyzerConstructor)
	registry.RegisterTokenizer(name, tokenizerConstructor)
}
// SegoTokenizer wraps a sego.Segmenter. tokenizerConstructor returns a
// *SegoTokenizer as the analysis.Tokenizer registered under "sego".
type SegoTokenizer struct {
// tker is the segmenter used by Tokenize; loadDictory loads its dictionary.
// It is stored by value, so the zero SegoTokenizer holds a zero Segmenter
// until loadDictory is called.
tker sego.Segmenter
}
// loadDictory forwards dict unchanged to the segmenter's LoadDictionary.
// Nothing is returned, so this method gives the caller no indication of
// whether loading worked.
//
// NOTE(review): the name looks like a typo for "loadDictionary"; it is kept
// because tokenizerConstructor calls the method by this name.
func (s *SegoTokenizer) loadDictory(dict string) {
s.tker.LoadDictionary(dict)
}
// Tokenize runs the sego segmenter over sentence and returns one bleve token
// per segment, in the order the segmenter produced them.
//
// For each segment the token's Start and End are the values reported by the
// segment's Start() and End(), Position is the segment's 1-based index in the
// segmenter output, Term is the bytes of the segment token's text, and Type is
// always analysis.Ideographic. The returned stream is non-nil even when the
// segmenter yields no segments.
func (s *SegoTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
	words := s.tker.Segment(sentence)

	// The number of tokens equals the number of segments, so size the stream
	// once instead of growing it during the loop.
	result := make(analysis.TokenStream, 0, len(words))
	for pos, word := range words {
		result = append(result, &analysis.Token{
			Start:    word.Start(),
			End:      word.End(),
			Position: pos + 1, // positions are 1-based
			Term:     []byte(word.Token().Text()),
			Type:     analysis.Ideographic,
		})
	}
	return result
}
// tokenizerConstructor builds a SegoTokenizer from a bleve tokenizer config.
//
// config must contain a string value under the key "dictpath"; that value is
// passed to the tokenizer's loadDictory. cache is not used.
//
// Before loading, every comma-separated entry of dictpath is opened once so
// that a missing or unreadable dictionary file is returned to the caller as an
// error. The load call itself yields no error result here, so without this
// check such a failure could not be reported through the constructor's error
// return.
//
// NOTE(review): this assumes sego treats dictpath as a comma-separated list of
// files and aborts the process (log.Fatal) when one cannot be opened — confirm
// against the sego version in use.
func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	dictpath, ok := config["dictpath"].(string)
	if !ok {
		return nil, errors.New("config dictpath not found")
	}

	for _, file := range strings.Split(dictpath, ",") {
		f, err := os.Open(file)
		if err != nil {
			// err is an *os.PathError and already names the offending file.
			return nil, fmt.Errorf("sego dictionary: %v", err)
		}
		f.Close() // read-only probe; the close result carries no information
	}

	tokenizer := &SegoTokenizer{}
	tokenizer.loadDictory(dictpath)
	return tokenizer, nil
}
// SegoAnalyzer is an empty struct type. Nothing in the visible source defines
// methods on it or references it; analyzerConstructor returns a plain
// *analysis.Analyzer rather than this type.
type SegoAnalyzer struct{}
// analyzerConstructor builds a bleve analyzer whose only configured component
// is a tokenizer looked up by name.
//
// config must contain a string value under the key "tokenizer"; otherwise an
// error is returned. The name is resolved through cache.TokenizerNamed, and any
// error from that lookup is returned unchanged.
func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	name, isString := config["tokenizer"].(string)
	if !isString {
		return nil, errors.New("must specify tokenizer")
	}

	tk, lookupErr := cache.TokenizerNamed(name)
	if lookupErr != nil {
		return nil, lookupErr
	}

	return &analysis.Analyzer{Tokenizer: tk}, nil
}