From c5dd10452616a278149da581bee4975b54256609 Mon Sep 17 00:00:00 2001 From: vcaesar Date: Sun, 12 Sep 2021 11:23:26 -0400 Subject: [PATCH] Optimize load dictionary and stop by embed files, support user dictionary files --- circle.yml | 2 +- dict_1.16.go | 48 +++++++++++++++++++++++++++++++++++------------ dict_1.16_test.go | 37 +++++++++++++++++++++++++++++++----- 3 files changed, 69 insertions(+), 18 deletions(-) diff --git a/circle.yml b/circle.yml index 18a938b..901dae7 100644 --- a/circle.yml +++ b/circle.yml @@ -8,7 +8,7 @@ version: 2 jobs: build: docker: - - image: golang:1.16.7 + - image: golang:1.17.1 working_directory: /gopath/src/github.com/go-ego/gse steps: - checkout diff --git a/dict_1.16.go b/dict_1.16.go index 27d2daa..3dbf2ee 100644 --- a/dict_1.16.go +++ b/dict_1.16.go @@ -1,3 +1,4 @@ +//go:build go1.16 // +build go1.16 package gse @@ -19,16 +20,25 @@ func NewEmbed(dict ...string) (seg Segmenter, err error) { seg.AlphaNum = true } + err = seg.LoadDictEmbed(dict...) + return +} + +// LoadDictEmbed load dictionary by embed file +func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error) { if len(dict) > 0 { d := dict[0] - if strings.Contains(d, "zh,") { + if strings.Contains(d, ", ") { + begin := 0 s := strings.Split(d, ", ") - err = seg.LoadDictEmbed() - if err != nil { - return + if strings.Contains(d, "zh,") { + begin = 1 + err = seg.LoadDictStr(dataDict) } - err = seg.LoadDictStr(s[1]) + for i := begin; i < len(s); i++ { + err = seg.LoadDictStr(s[i]) + } return } @@ -36,12 +46,6 @@ func NewEmbed(dict ...string) (seg Segmenter, err error) { return } - err = seg.LoadDictEmbed() - return -} - -// LoadDictEmbed load dictionary by embed file -func (seg *Segmenter) LoadDictEmbed() error { return seg.LoadDictStr(dataDict) } @@ -87,7 +91,27 @@ func (seg *Segmenter) LoadDictStr(dict string) error { } // LoadStopEmbed load stop dictionary from embed file -func (seg *Segmenter) LoadStopEmbed() error { +func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error) { + if len(dict) > 0 { + d := dict[0] + if strings.Contains(d, ", ") { + begin := 0 + s := strings.Split(d, ", ") + if strings.Contains(d, "zh,") { + begin = 1 + err = seg.LoadStopStr(stopDict) + } + + for i := begin; i < len(s); i++ { + err = seg.LoadStopStr(s[i]) + } + return + } + + err = seg.LoadStopStr(d) + return + } + return seg.LoadStopStr(stopDict) } diff --git a/dict_1.16_test.go b/dict_1.16_test.go index df91e13..d8c38a0 100644 --- a/dict_1.16_test.go +++ b/dict_1.16_test.go @@ -1,19 +1,30 @@ +//go:build go1.16 // +build go1.16 package gse import ( + _ "embed" "testing" "github.com/vcaesar/tt" ) +//go:embed testdata/test_dict3.txt +var testDict string + +//go:embed testdata/test_dict2.txt +var testDict2 string + +//go:embed testdata/stop.txt +var testStop string + func TestLoadDictEmbed(t *testing.T) { - // var seg1 Segmenter - // err := seg1.LoadDictEmbed() - // tt.Nil(t, err) + var seg2 Segmenter + err := seg2.LoadDictEmbed(testDict) + tt.Nil(t, err) - seg1, err := NewEmbed("zh, world 20 n", "en") + seg1, err := NewEmbed("zh, word1 20 n, "+testDict+", "+testDict2, "en") tt.Nil(t, err) f, pos, ok := seg1.Find("1号店") @@ -21,11 +32,26 @@ func TestLoadDictEmbed(t *testing.T) { tt.Equal(t, "n", pos) tt.Equal(t, 3, f) + f, pos, ok = seg1.Find("hello") + tt.Bool(t, ok) + tt.Equal(t, "", pos) + tt.Equal(t, 20, f) + f, pos, ok = seg1.Find("world") tt.Bool(t, ok) tt.Equal(t, "n", pos) tt.Equal(t, 20, f) + f, pos, ok = seg1.Find("word1") + tt.Bool(t, ok) + tt.Equal(t, "n", pos) + tt.Equal(t, 20, f) + + f, pos, ok = seg1.Find("新星共和国") + tt.Bool(t, ok) + tt.Equal(t, "ns", pos) + tt.Equal(t, 32, f) + f, _, ok = seg1.Find("八千一百三十七万七千二百三十六口") tt.Bool(t, ok) tt.Equal(t, 2, f) @@ -33,7 +59,8 @@ func TestLoadDictEmbed(t *testing.T) { func TestLoadStopEmbed(t *testing.T) { var seg1 Segmenter - err := seg1.LoadStopEmbed() + err := seg1.LoadStopEmbed("zh, " + testStop) tt.Nil(t, err) tt.Bool(t, seg1.IsStop("比如")) + tt.Bool(t, seg1.IsStop("离开")) }