Skip to content

Commit

Permalink
add Find() function pos return support and more error return
Browse files Browse the repository at this point in the history
  • Loading branch information
vcaesar committed Sep 3, 2021
1 parent 0796876 commit 366655c
Show file tree
Hide file tree
Showing 12 changed files with 60 additions and 51 deletions.
21 changes: 11 additions & 10 deletions dag.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ type route struct {
}

// Find looks up a word in the dictionary and returns its frequency, its pos, and whether it exists
func (seg *Segmenter) Find(str string) (float64, bool) {
func (seg *Segmenter) Find(str string) (float64, string, bool) {
return seg.Dict.Find([]byte(str))
}

Expand All @@ -58,13 +58,14 @@ func (seg *Segmenter) Analyze(text []string) (az []AnalyzeToken) {
end = az[k-1].End + len([]rune(v))
}

freq, _ := seg.Find(v)
freq, pos, _ := seg.Find(v)
az = append(az, AnalyzeToken{
Position: k,
Start: start,
End: end,
Text: v,
Freq: freq,
Pos: pos,
})
}

Expand All @@ -86,7 +87,7 @@ func (seg *Segmenter) getDag(runes []rune) map[int][]int {
frag = runes[k : k+1]

for {
freq, ok := seg.Find(string(frag))
freq, _, ok := seg.Find(string(frag))
if !ok {
break
}
Expand Down Expand Up @@ -123,7 +124,7 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {
logT := math.Log(seg.Dict.totalFrequency)
for idx := n - 1; idx >= 0; idx-- {
for _, i := range dag[idx] {
freq, ok := seg.Find(string(runes[idx : i+1]))
freq, _, ok := seg.Find(string(runes[idx : i+1]))

if ok {
f := math.Log(freq) - logT + rs[i+1].frequency
Expand All @@ -149,7 +150,7 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {

func (seg *Segmenter) hmm(bufString string, buf []rune) (result []string) {

v, ok := seg.Find(bufString)
v, _, ok := seg.Find(bufString)
if !ok || v == 0 {
result = append(result, seg.HMMCut(bufString)...)
return
Expand Down Expand Up @@ -298,7 +299,7 @@ func (seg *Segmenter) cutForSearch(str string, hmm ...bool) []string {
var gram string
for i := 0; i < len(runes)-incr+1; i++ {
gram = string(runes[i : i+incr])
v, ok := seg.Find(gram)
v, _, ok := seg.Find(gram)
if ok && v > 0 {
result = append(result, gram)
}
Expand All @@ -320,7 +321,7 @@ func (seg *Segmenter) SuggestFreq(words ...string) float64 {

if len(words) > 1 {
for _, word := range words {
freq, ok := seg.Find(word)
freq, _, ok := seg.Find(word)
if ok {
frequency *= freq
}
Expand All @@ -330,7 +331,7 @@ func (seg *Segmenter) SuggestFreq(words ...string) float64 {

frequency, _ = math.Modf(frequency * total)
wordFreq := 0.0
freq, ok := seg.Find(strings.Join(words, ""))
freq, _, ok := seg.Find(strings.Join(words, ""))
if ok {
wordFreq = freq
}
Expand All @@ -344,7 +345,7 @@ func (seg *Segmenter) SuggestFreq(words ...string) float64 {

word := words[0]
for _, segment := range seg.Cut(word, false) {
freq, ok := seg.Find(segment)
freq, _, ok := seg.Find(segment)
if ok {
frequency *= freq
}
Expand All @@ -356,7 +357,7 @@ func (seg *Segmenter) SuggestFreq(words ...string) float64 {
frequency += 1.0
wordFreq := 1.0

freq, ok := seg.Find(word)
freq, _, ok := seg.Find(word)
if ok {
wordFreq = freq
}
Expand Down
8 changes: 5 additions & 3 deletions dict_1.16_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,17 @@ func TestLoadDictEmbed(t *testing.T) {
seg1, err := NewEmbed("zh, world 20 n", "en")
tt.Nil(t, err)

f, ok := seg1.Find("1号店")
f, pos, ok := seg1.Find("1号店")
tt.Bool(t, ok)
tt.Equal(t, pos, "n")
tt.Equal(t, 3, f)

f, ok = seg1.Find("world")
f, pos, ok = seg1.Find("world")
tt.Bool(t, ok)
tt.Equal(t, pos, "n")
tt.Equal(t, 20, f)

f, ok = seg1.Find("八千一百三十七万七千二百三十六口")
f, _, ok = seg1.Find("八千一百三十七万七千二百三十六口")
tt.Bool(t, ok)
tt.Equal(t, 2, f)
}
Expand Down
15 changes: 8 additions & 7 deletions dictionary.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ func (dict *Dictionary) LookupTokens(
return
}

// Find find word in the dictionary is non-existent
// and the word's frequency
func (dict *Dictionary) Find(word []byte) (float64, bool) {
// Find looks up the word in the dictionary and returns
// the word's frequency, its pos, and whether the word exists
func (dict *Dictionary) Find(word []byte) (float64, string, bool) {
var (
id, value int
freq float64
Expand All @@ -115,20 +115,21 @@ func (dict *Dictionary) Find(word []byte) (float64, bool) {

id, err = dict.trie.Jump(word, id)
if err != nil {
return 0, false
return 0, "", false
}

value, err = dict.trie.Value(id)
if err != nil && id != 0 {
return 0, true
return 0, "", true
}

if err != nil {
return 0, false
return 0, "", false
}

freq = dict.Tokens[value].frequency
return freq, true
pos := dict.Tokens[value].pos
return freq, pos, true
}

// Value find word in the dictionary
Expand Down
8 changes: 4 additions & 4 deletions examples/en/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ func main() {
// seg.LoadDict("zh, ../../testdata/test_dict3.txt")
seg.AddToken("winter is coming!", 100, "n")

freq, ok := seg.Find("hello")
fmt.Println(freq, ok)
freq, pos, ok := seg.Find("hello")
fmt.Println(freq, pos, ok)

freq, ok = seg.Find("world")
fmt.Println(freq, ok)
freq, pos, ok = seg.Find("world")
fmt.Println(freq, pos, ok)

text := "Helloworld, winter is coming! 你好世界."

Expand Down
4 changes: 2 additions & 2 deletions examples/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ func addToken() {
// seg.AddTokenForce("上海东方明珠广播电视塔", 100, "n")
//
seg.AddToken("太空针", 100)
freq, ok := seg.Find("太空针")
fmt.Println("seg.Find: ", freq, ok)
freq, pos, ok := seg.Find("太空针")
fmt.Println("seg.Find: ", freq, pos, ok)

// seg.CalcToken()
err = seg.RemoveToken("太空针")
Expand Down
5 changes: 3 additions & 2 deletions gse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@ func TestLoadDictMap(t *testing.T) {

prodSeg.LoadDictMap(m)

f, ok := prodSeg.Find("一城山水")
f, pos, ok := prodSeg.Find("一城山水")
tt.Bool(t, ok)
tt.Equal(t, pos, "n")
tt.Equal(t, 10, f)

f, ok = prodSeg.Find("山河日月")
f, _, ok = prodSeg.Find("山河日月")
tt.Bool(t, ok)
tt.Equal(t, 13, f)
}
Expand Down
7 changes: 4 additions & 3 deletions hmm/idf/idf.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,13 @@ type Idf struct {
}

// AddToken adds a new word with IDF into its dictionary.
func (i *Idf) AddToken(text string, frequency float64, pos ...string) {
i.seg.AddToken(text, frequency, pos...)
func (i *Idf) AddToken(text string, frequency float64, pos ...string) error {
err := i.seg.AddToken(text, frequency, pos...)

i.freqs = append(i.freqs, frequency)
sort.Float64s(i.freqs)
i.median = i.freqs[len(i.freqs)/2]
return err
}

// LoadDict load idf dictionary
Expand All @@ -48,7 +49,7 @@ func (i *Idf) LoadDict(files ...string) error {
}

// Frequency returns the IDF, the pos, and the existence of the given word.
func (i *Idf) Frequency(key string) (float64, bool) {
func (i *Idf) Frequency(key string) (float64, string, bool) {
return i.seg.Find(key)
}

Expand Down
6 changes: 3 additions & 3 deletions hmm/idf/tag_extracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
ws := make(Segments, 0)
var s Segment
for k, v := range freqMap {
if freq, ok := t.Idf.Frequency(k); ok {
if freq, _, ok := t.Idf.Frequency(k); ok {
s = Segment{text: k, weight: freq * v}
} else {
s = Segment{text: k, weight: t.Idf.median * v}
Expand All @@ -119,9 +119,9 @@ func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {

if len(ws) > topK {
tags = ws[:topK]
} else {
tags = ws
return
}

tags = ws
return
}
6 changes: 3 additions & 3 deletions hmm/pos/dict.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,16 @@ type Dict struct {
// }

// AddToken adds one token
func (d *Dict) AddToken(text string, frequency float64, pos ...string) {
d.Seg.AddToken(text, frequency, pos...)
func (d *Dict) AddToken(text string, frequency float64, pos ...string) error {
return d.Seg.AddToken(text, frequency, pos...)
}

// updateLogTotal stores the natural logarithm of d.total in d.logTotal.
func (d *Dict) updateLogTotal() {
d.logTotal = math.Log(d.total)
}

// Frequency returns the frequency, the pos, and the existence of the given word
func (d *Dict) Frequency(key string) (float64, bool) {
func (d *Dict) Frequency(key string) (float64, string, bool) {
return d.Seg.Find(key)
}

Expand Down
8 changes: 4 additions & 4 deletions hmm/pos/pos_seg.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ func (seg *Segmenter) getDag(runes []rune) map[int][]int {
frag = runes[k : k+1]

for {
freq, ok := seg.dict.Frequency(string(frag))
freq, _, ok := seg.dict.Frequency(string(frag))
if !ok {
break
}
Expand Down Expand Up @@ -160,7 +160,7 @@ func (seg *Segmenter) calc(runes []rune) map[int]route {

for idx := n - 1; idx >= 0; idx-- {
for _, i := range dag[idx] {
if freq, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
if freq, _, ok := seg.dict.Frequency(string(runes[idx : i+1])); ok {
r = route{
frequency: math.Log(freq) - seg.dict.logTotal + rs[i+1].frequency,
index: i}
Expand Down Expand Up @@ -216,7 +216,7 @@ func (seg *Segmenter) cutDAG(sentence string) (result []gse.SegPos) {
continue
}

if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
if v, _, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
result = append(result, seg.cutDetail(bufString)...)
} else {
for _, elem := range buf {
Expand Down Expand Up @@ -259,7 +259,7 @@ func (seg *Segmenter) bufn(buf []rune) (result []gse.SegPos) {
return
}

if v, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
if v, _, ok := seg.dict.Frequency(bufString); !ok || v == 0.0 {
result = append(result, seg.cutDetail(bufString)...)
return
}
Expand Down
1 change: 1 addition & 0 deletions seg.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ type AnalyzeToken struct {

Text string
Freq float64
Pos string
}

// Segment 文本中的一个分词
Expand Down
22 changes: 12 additions & 10 deletions segmenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,19 +149,20 @@ func TestToken(t *testing.T) {
tt.Expect(t, "16", dict.MaxTokenLen())
tt.Expect(t, "5.3250742e+07", dict.TotalFreq())

freq, ok := dict.Find([]byte("世界"))
freq, pos, ok := dict.Find([]byte("世界"))
tt.Equal(t, 34387, freq)
tt.Equal(t, pos, "n")
tt.True(t, ok)

freq, ok = dict.Find([]byte("帝国大"))
freq, _, ok = dict.Find([]byte("帝国大"))
tt.Equal(t, 0, freq)
tt.True(t, ok)

freq, ok = dict.Find([]byte("帝国大厦"))
freq, _, ok = dict.Find([]byte("帝国大厦"))
tt.Equal(t, 3, freq)
tt.True(t, ok)

freq, ok = seg.Find("帝国大厦大")
freq, _, ok = seg.Find("帝国大厦大")
tt.Equal(t, 0, freq)
tt.False(t, ok)

Expand All @@ -177,26 +178,27 @@ func TestToken(t *testing.T) {
err = seg.AddToken("Winter is coming", 200)
tt.Nil(t, err)

freq, ok = seg.Find("Winter is coming")
freq, _, ok = seg.Find("Winter is coming")
tt.Equal(t, 100, freq)
tt.True(t, ok)

freq, ok = prodSeg.Find("伦敦摘星塔")
freq, _, ok = prodSeg.Find("伦敦摘星塔")
tt.Equal(t, 100, freq)
tt.True(t, ok)

err = prodSeg.AddToken("西雅图中心", 100)
tt.Nil(t, err)
err = prodSeg.AddToken("西雅图太空针", 100, "n")
tt.Nil(t, err)
freq, ok = prodSeg.Find("西雅图太空针")
freq, pos, ok = prodSeg.Find("西雅图太空针")
tt.Equal(t, 100, freq)
tt.Equal(t, pos, "n")
tt.True(t, ok)

prodSeg.AddTokenForce("Space Needle", 100, "n")
err = prodSeg.RemoveToken("西雅图太空针")
tt.Nil(t, err)
freq, ok = dict.Find([]byte("西雅图太空针"))
freq, _, ok = dict.Find([]byte("西雅图太空针"))
tt.Equal(t, 0, freq)
tt.False(t, ok)
}
Expand Down Expand Up @@ -228,11 +230,11 @@ func TestInAlphaNum(t *testing.T) {
seg, err := New("zh,./testdata/test_dict3.txt", "alpha")
tt.Nil(t, err)

freq, ok := seg.Find("hello")
freq, _, ok := seg.Find("hello")
tt.Equal(t, 20, freq)
tt.True(t, ok)

freq, ok = seg.Find("world")
freq, _, ok = seg.Find("world")
tt.Equal(t, 20, freq)
tt.True(t, ok)

Expand Down

0 comments on commit 366655c

Please sign in to comment.