diff --git a/seg.go b/seg.go deleted file mode 100755 index cc312fc..0000000 --- a/seg.go +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2013 Hui Chen -// Copyright 2016 ego authors -// -// Licensed under the Apache License, Version 2.0 (the "License"): you may -// not use this file except in compliance with the License. You may obtain -// a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -// License for the specific language governing permissions and limitations -// under the License. - -package gse - -// AnalyzeToken analyze the segment info structure -type AnalyzeToken struct { - // 分词在文本中的起始位置 - Start int - End int - - Position int - Len int - - Type string - - Text string - Freq float64 - Pos string -} - -// Segment 文本中的一个分词 -type Segment struct { - // 分词在文本中的起始字节位置 - start int - - // 分词在文本中的结束字节位置(不包括该位置) - end int - - Position int - - // 分词信息 - token *Token -} - -// Start 返回分词在文本中的起始字节位置 -func (s *Segment) Start() int { - return s.start -} - -// End 返回分词在文本中的结束字节位置(不包括该位置) -func (s *Segment) End() int { - return s.end -} - -// Token 返回分词信息 -func (s *Segment) Token() *Token { - return s.token -} diff --git a/seg_utils.go b/seg_utils.go index ecad9b1..7103ad7 100755 --- a/seg_utils.go +++ b/seg_utils.go @@ -227,3 +227,28 @@ func Join(text []Text) string { } return string(b) } + +func printTokens(tokens []*Token, numTokens int) (output string) { + for iToken := 0; iToken < numTokens; iToken++ { + for _, word := range tokens[iToken].text { + output += fmt.Sprint(string(word)) + } + output += " " + } + return +} + +func toWords(strings ...string) []Text { + words := []Text{} + for _, s := range strings { + words = append(words, []byte(s)) + } + return words +} + +func bytesToString(bytes []Text) (output string) { + for _, b := range bytes { + output += (string(b) + "/") + } + return +} diff --git a/segmenter_test.go b/segmenter_test.go index 1e929e2..479162c 100755 --- a/segmenter_test.go +++ b/segmenter_test.go @@ -20,7 +20,7 @@ func TestGetVer(t *testing.T) { ver := GetVersion() tt.Expect(t, Version, ver) - expect(t, Version, ver) + tt.Expect(t, Version, ver) tt.Equal(t, Version, ver) } diff --git a/test_utils.go b/test_utils.go deleted file mode 100755 index 9793d27..0000000 --- a/test_utils.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2013 Hui Chen -// Copyright 2016 ego authors -// -// Licensed under the Apache License, Version 2.0 (the "License"): you may -// not use this file except in compliance with the License. You may obtain -// a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -// License for the specific language governing permissions and limitations -// under the License. - -package gse - -import ( - "fmt" - "testing" -) - -func printTokens(tokens []*Token, numTokens int) (output string) { - for iToken := 0; iToken < numTokens; iToken++ { - for _, word := range tokens[iToken].text { - output += fmt.Sprint(string(word)) - } - output += " " - } - return -} - -func toWords(strings ...string) []Text { - words := []Text{} - for _, s := range strings { - words = append(words, []byte(s)) - } - return words -} - -func bytesToString(bytes []Text) (output string) { - for _, b := range bytes { - output += (string(b) + "/") - } - return -} - -func expect(t *testing.T, expect string, actual interface{}) { - actualString := fmt.Sprint(actual) - if expect != actualString { - t.Errorf("期待值=\"%s\", 实际=\"%s\"", expect, actualString) - } -} diff --git a/token.go b/token.go index 2490cae..f7e8230 100755 --- a/token.go +++ b/token.go @@ -15,6 +15,51 @@ package gse +// AnalyzeToken analyze the segment info structure +type AnalyzeToken struct { + // 分词在文本中的起始位置 + Start int + End int + + Position int + Len int + + Type string + + Text string + Freq float64 + Pos string +} + +// Segment 文本中的一个分词 +type Segment struct { + // 分词在文本中的起始字节位置 + start int + + // 分词在文本中的结束字节位置(不包括该位置) + end int + + Position int + + // 分词信息 + token *Token +} + +// Start 返回分词在文本中的起始字节位置 +func (s *Segment) Start() int { + return s.start +} + +// End 返回分词在文本中的结束字节位置(不包括该位置) +func (s *Segment) End() int { + return s.end +} + +// Token 返回分词信息 +func (s *Segment) Token() *Token { + return s.token +} + // Text 字串类型,可以用来表达 // 1. 一个字元,比如 "世" 又如 "界", 英文的一个字元是一个词 // 2. 一个分词,比如 "世界" 又如 "人口"