Skip to content

Commit

Permalink
Merge pull request #112 from go-ego/range-pr
Browse files Browse the repository at this point in the history
add Find() function pos return support and more error return
  • Loading branch information
vcaesar authored Sep 3, 2021
2 parents e354e97 + 058a054 commit 85162d8
Show file tree
Hide file tree
Showing 13 changed files with 64 additions and 54 deletions.
21 changes: 11 additions & 10 deletions dag.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ type route struct {
}

// Find looks up a word in the dictionary and returns the word's frequency, pos, and existence
func (seg *Segmenter) Find(str string) (float64, bool) {
func (seg *Segmenter) Find(str string) (float64, string, bool) {
return seg.Dict.Find([]byte(str))
}

Expand All @@ -58,13 +58,14 @@ func (seg *Segmenter) Analyze(text []string) (az []AnalyzeToken) {
end = az[k-1].End + len([]rune(v))
}

freq, _ := seg.Find(v)
freq, pos, _ := seg.Find(v)
az = append(az, AnalyzeToken{
Position: k,
Start: start,
End: end,
Text: v,
Freq: freq,
Pos: pos,
})
}

Expand All @@ -86,7 +87,7 @@ func (seg *Segmenter) getDag(runes []rune) map[int][]int {
frag = runes[k : k+1]

for {
freq, ok := seg.Find(string(frag))
freq, _, ok := seg.Find(string(frag))
if !ok {
break
}
Expand Down Expand Up @@ -123,7 +124,7 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {
logT := math.Log(seg.Dict.totalFrequency)
for idx := n - 1; idx >= 0; idx-- {
for _, i := range dag[idx] {
freq, ok := seg.Find(string(runes[idx : i+1]))
freq, _, ok := seg.Find(string(runes[idx : i+1]))

if ok {
f := math.Log(freq) - logT + rs[i+1].frequency
Expand All @@ -149,7 +150,7 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {

func (seg *Segmenter) hmm(bufString string, buf []rune) (result []string) {

v, ok := seg.Find(bufString)
v, _, ok := seg.Find(bufString)
if !ok || v == 0 {
result = append(result, seg.HMMCut(bufString)...)
return
Expand Down Expand Up @@ -298,7 +299,7 @@ func (seg *Segmenter) cutForSearch(str string, hmm ...bool) []string {
var gram string
for i := 0; i < len(runes)-incr+1; i++ {
gram = string(runes[i : i+incr])
v, ok := seg.Find(gram)
v, _, ok := seg.Find(gram)
if ok && v > 0 {
result = append(result, gram)
}
Expand All @@ -320,7 +321,7 @@ func (seg *Segmenter) SuggestFreq(words ...string) float64 {

if len(words) > 1 {
for _, word := range words {
freq, ok := seg.Find(word)
freq, _, ok := seg.Find(word)
if ok {
frequency *= freq
}
Expand All @@ -330,7 +331,7 @@ func (seg *Segmenter) SuggestFreq(words ...string) float64 {

frequency, _ = math.Modf(frequency * total)
wordFreq := 0.0
freq, ok := seg.Find(strings.Join(words, ""))
freq, _, ok := seg.Find(strings.Join(words, ""))
if ok {
wordFreq = freq
}
Expand All @@ -344,7 +345,7 @@ func (seg *Segmenter) SuggestFreq(words ...string) float64 {

word := words[0]
for _, segment := range seg.Cut(word, false) {
freq, ok := seg.Find(segment)
freq, _, ok := seg.Find(segment)
if ok {
frequency *= freq
}
Expand All @@ -356,7 +357,7 @@ func (seg *Segmenter) SuggestFreq(words ...string) float64 {
frequency += 1.0
wordFreq := 1.0

freq, ok := seg.Find(word)
freq, _, ok := seg.Find(word)
if ok {
wordFreq = freq
}
Expand Down
2 changes: 1 addition & 1 deletion dict_1.16.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func (seg *Segmenter) LoadDictStr(dict string) error {

pos := ""
if size > 2 {
pos = strings.Trim(s1[2], "\n")
pos = strings.TrimSpace(strings.Trim(s1[2], "\n"))
}

// 将分词添加到字典中
Expand Down
8 changes: 5 additions & 3 deletions dict_1.16_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,17 @@ func TestLoadDictEmbed(t *testing.T) {
seg1, err := NewEmbed("zh, world 20 n", "en")
tt.Nil(t, err)

f, ok := seg1.Find("1号店")
f, pos, ok := seg1.Find("1号店")
tt.Bool(t, ok)
tt.Equal(t, "n", pos)
tt.Equal(t, 3, f)

f, ok = seg1.Find("world")
f, pos, ok = seg1.Find("world")
tt.Bool(t, ok)
tt.Equal(t, "n", pos)
tt.Equal(t, 20, f)

f, ok = seg1.Find("八千一百三十七万七千二百三十六口")
f, _, ok = seg1.Find("八千一百三十七万七千二百三十六口")
tt.Bool(t, ok)
tt.Equal(t, 2, f)
}
Expand Down
15 changes: 8 additions & 7 deletions dictionary.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ func (dict *Dictionary) LookupTokens(
return
}

// Find find word in the dictionary is non-existent
// and the word's frequency
func (dict *Dictionary) Find(word []byte) (float64, bool) {
// Find looks up the word in the dictionary and returns
// the word's frequency, pos, and whether it exists
func (dict *Dictionary) Find(word []byte) (float64, string, bool) {
var (
id, value int
freq float64
Expand All @@ -115,20 +115,21 @@ func (dict *Dictionary) Find(word []byte) (float64, bool) {

id, err = dict.trie.Jump(word, id)
if err != nil {
return 0, false
return 0, "", false
}

value, err = dict.trie.Value(id)
if err != nil && id != 0 {
return 0, true
return 0, "", true
}

if err != nil {
return 0, false
return 0, "", false
}

freq = dict.Tokens[value].frequency
return freq, true
pos := dict.Tokens[value].pos
return freq, pos, true
}

// Value find word in the dictionary
Expand Down
8 changes: 4 additions & 4 deletions examples/en/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ func main() {
// seg.LoadDict("zh, ../../testdata/test_dict3.txt")
seg.AddToken("winter is coming!", 100, "n")

freq, ok := seg.Find("hello")
fmt.Println(freq, ok)
freq, pos, ok := seg.Find("hello")
fmt.Println(freq, pos, ok)

freq, ok = seg.Find("world")
fmt.Println(freq, ok)
freq, pos, ok = seg.Find("world")
fmt.Println(freq, pos, ok)

text := "Helloworld, winter is coming! 你好世界."

Expand Down
4 changes: 2 additions & 2 deletions examples/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ func addToken() {
// seg.AddTokenForce("上海东方明珠广播电视塔", 100, "n")
//
seg.AddToken("太空针", 100)
freq, ok := seg.Find("太空针")
fmt.Println("seg.Find: ", freq, ok)
freq, pos, ok := seg.Find("太空针")
fmt.Println("seg.Find: ", freq, pos, ok)

// seg.CalcToken()
err = seg.RemoveToken("太空针")
Expand Down
10 changes: 6 additions & 4 deletions gse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@ func TestLoadDictMap(t *testing.T) {
},
}

prodSeg.LoadDictMap(m)
err := prodSeg.LoadDictMap(m)
tt.Nil(t, err)

f, ok := prodSeg.Find("一城山水")
f, pos, ok := prodSeg.Find("一城山水")
tt.Bool(t, ok)
tt.Equal(t, "n", pos)
tt.Equal(t, 10, f)

f, ok = prodSeg.Find("山河日月")
f, _, ok = prodSeg.Find("山河日月")
tt.Bool(t, ok)
tt.Equal(t, 13, f)
}
Expand All @@ -44,7 +46,7 @@ func TestAnalyze(t *testing.T) {

a := prodSeg.Analyze(s)
tt.Equal(t, 23, len(a))
tt.Equal(t, "[{0 4 0 0 城市地标 3} {4 6 1 0 建筑 14397} {6 8 2 0 : 0} {8 10 3 0 纽约 1758} {10 14 4 0 帝国大厦 3} {14 16 5 0 , 0} {16 20 6 0 旧金山湾 3} {20 24 7 0 金门大桥 38} {24 26 8 0 , 0} {26 33 9 0 Seattle 0} {33 34 10 0 0} {34 39 11 0 Space 0} {39 40 12 0 0} {40 46 13 0 Needle 0} {46 48 14 0 , 0} {48 55 15 0 Toronto 0} {55 56 16 0 0} {56 58 17 0 CN 0} {58 59 18 0 0} {59 64 19 0 Tower 0} {64 66 20 0 , 0} {66 68 21 0 伦敦 2255} {68 71 22 0 大笨钟 0}]", a)
tt.Equal(t, "[{0 4 0 0 城市地标 3 j} {4 6 1 0 建筑 14397 n} {6 8 2 0 : 0 } {8 10 3 0 纽约 1758 ns} {10 14 4 0 帝国大厦 3 nr} {14 16 5 0 , 0 } {16 20 6 0 旧金山湾 3 ns} {20 24 7 0 金门大桥 38 nz} {24 26 8 0 , 0 } {26 33 9 0 Seattle 0 } {33 34 10 0 0 } {34 39 11 0 Space 0 } {39 40 12 0 0 } {40 46 13 0 Needle 0 } {46 48 14 0 , 0 } {48 55 15 0 Toronto 0 } {55 56 16 0 0 } {56 58 17 0 CN 0 } {58 59 18 0 0 } {59 64 19 0 Tower 0 } {64 66 20 0 , 0 } {66 68 21 0 伦敦 2255 ns} {68 71 22 0 大笨钟 0 }]", a)

tt.Equal(t, 0, a[0].Start)
tt.Equal(t, 4, a[0].End)
Expand Down
7 changes: 4 additions & 3 deletions hmm/idf/idf.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,13 @@ type Idf struct {
}

// AddToken adds a new word with IDF into its dictionary.
func (i *Idf) AddToken(text string, frequency float64, pos ...string) {
i.seg.AddToken(text, frequency, pos...)
func (i *Idf) AddToken(text string, frequency float64, pos ...string) error {
err := i.seg.AddToken(text, frequency, pos...)

i.freqs = append(i.freqs, frequency)
sort.Float64s(i.freqs)
i.median = i.freqs[len(i.freqs)/2]
return err
}

// LoadDict load idf dictionary
Expand All @@ -48,7 +49,7 @@ func (i *Idf) LoadDict(files ...string) error {
}

// Frequency returns the IDF, pos, and existence of the given word.
func (i *Idf) Frequency(key string) (float64, bool) {
func (i *Idf) Frequency(key string) (float64, string, bool) {
return i.seg.Find(key)
}

Expand Down
6 changes: 3 additions & 3 deletions hmm/idf/tag_extracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
ws := make(Segments, 0)
var s Segment
for k, v := range freqMap {
if freq, ok := t.Idf.Frequency(k); ok {
if freq, _, ok := t.Idf.Frequency(k); ok {
s = Segment{text: k, weight: freq * v}
} else {
s = Segment{text: k, weight: t.Idf.median * v}
Expand All @@ -119,9 +119,9 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {

if len(ws) > topK {
tags = ws[:topK]
} else {
tags = ws
return
}

tags = ws
return
}
6 changes: 3 additions & 3 deletions hmm/pos/dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,16 @@ type Dict struct {
// }

// AddToken adds one token
func (d *Dict) AddToken(text string, frequency float64, pos ...string) {
d.Seg.AddToken(text, frequency, pos...)
func (d *Dict) AddToken(text string, frequency float64, pos ...string) error {
return d.Seg.AddToken(text, frequency, pos...)
}

// updateLogTotal recomputes d.logTotal as the natural logarithm of d.total.
func (d *Dict) updateLogTotal() {
d.logTotal = math.Log(d.total)
}

// Frequency returns the frequency, pos, and existence of the given word
func (d *Dict) Frequency(key string) (float64, bool) {
func (d *Dict) Frequency(key string) (float64, string, bool) {
return d.Seg.Find(key)
}

Expand Down
8 changes: 4 additions & 4 deletions hmm/pos/pos_seg.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ func (seg *Segmenter) getDag(runes []rune) map[int][]int {
frag = runes[k : k+1]

for {
freq, ok := seg.dict.Frequency(string(frag))
freq, _, ok := seg.dict.Frequency(string(frag))
if !ok {
break
}
Expand Down Expand Up @@ -160,7 +160,7 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {

for idx := n - 1; idx >= 0; idx-- {
for _, i := range dag[idx] {
if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
if freq, _, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
r = route{
frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency,
index: i}
Expand Down Expand Up @@ -216,7 +216,7 @@ func (seg *Segmenter) cutDAG(sentence string) (result []gse.SegPos) {
continue
}

if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
if v, _, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
result = append(result, seg.cutDetail(bufString)...)
} else {
for _, elem := range buf {
Expand Down Expand Up @@ -259,7 +259,7 @@ func (seg *Segmenter) bufn(buf []rune) (result []gse.SegPos) {
return
}

if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
if v, _, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
result = append(result, seg.cutDetail(bufString)...)
return
}
Expand Down
1 change: 1 addition & 0 deletions seg.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ type AnalyzeToken struct {

Text string
Freq float64
Pos string
}

// Segment 文本中的一个分词
Expand Down
Loading

0 comments on commit 85162d8

Please sign in to comment.