File tree Expand file tree Collapse file tree 2 files changed +41
-5
lines changed
19100102/daweijian/mymodule Expand file tree Collapse file tree 2 files changed +41
-5
lines changed Original file line number Diff line number Diff line change 35
35
'''
36
36
37
37
# 运行导入的函数
38
+ from os import path
39
+
40
+ print (__file__ , __name__ )
41
+
42
+ fp = path .join (path .dirname (path .abspath (__file__ )), 'tang300.json' )
38
43
try :
39
- with open ('tang300.json' , 'r' , encoding = 'utf-8' ) as f :
44
+ with open (fp , 'r' , encoding = 'utf-8' ) as f :
40
45
d = json .load (f )
41
46
s = json .dumps (d , indent = 2 , ensure_ascii = False )
42
47
count = 10
Original file line number Diff line number Diff line change 2
2
import re
3
3
from collections import Counter
4
4
5
+ import jieba
6
+
5
7
6
8
def clean_ip_list (words ): # 清理掉空格 标点符号
7
9
i = 0
@@ -34,7 +36,7 @@ def list_dict(l):
34
36
else : # 存在一个字符非英文 所以整个词非英文单词
35
37
b = False
36
38
break
37
- if ( b ) :
39
+ if b :
38
40
if word in cadiz :
39
41
cadiz [word ] += 1
40
42
else :
@@ -70,18 +72,47 @@ def cut_count_cn(c, regex): # 取出所有中文 是一个列表
70
72
return cadiz
71
73
72
74
75
def clean_not_cn(words):
    """Remove every word that contains a non-Chinese character.

    A character is treated as Chinese when it falls in the range
    U+4E00..U+9FFF, both ends inclusive, so the very common character
    U+4E00 is kept rather than rejected by an exclusive bound.

    Args:
        words: A list of strings. It is filtered in place.

    Returns:
        The same list object, now holding only the words made up entirely
        of Chinese characters. Empty strings contain no offending
        character and are therefore kept.
    """
    # Slice assignment filters in place in a single pass, so callers that
    # still hold a reference to the list observe the cleaned content.
    words[:] = [
        word for word in words
        if all('\u4e00' <= char <= '\u9fff' for char in word)
    ]
    return words
88
+
89
+
90
def clean_len_less_2(words):
    """Remove every word shorter than two characters.

    Args:
        words: A list of strings. It is filtered in place.

    Returns:
        The same list object, now holding only the words whose length is
        at least 2, in their original order.
    """
    # One linear pass instead of repeated list.remove() calls, each of
    # which rescans and shifts the list.
    words[:] = [word for word in words if len(word) >= 2]
    return words
98
+
99
+
73
100
def stats_text_cn(s, count):
    """Print and return the most frequent Chinese words found in a string.

    The text is segmented with ``jieba.lcut``; words containing non-Chinese
    characters and words shorter than two characters are then discarded
    before counting.

    Args:
        s: The text to analyse. Must be a ``str``.
        count: How many of the most frequent words to report; passed
            straight to ``Counter.most_common``.

    Returns:
        A list of ``(word, frequency)`` tuples ordered from most to least
        frequent. The same list is also printed.

    Raises:
        ValueError: If ``s`` is not a string.
    """
    if not isinstance(s, str):
        raise ValueError("is not str")

    words = jieba.lcut(s)
    # Drop words containing non-Chinese characters.
    words = clean_not_cn(words)
    # Drop words shorter than two characters.
    words = clean_len_less_2(words)

    # Counter tallies the words; most_common picks the top `count` entries.
    c_dict = Counter(words).most_common(count)
    print(c_dict)
    # Hand the ranking back to the caller as well as printing it, so the
    # function does not silently evaluate to None.
    return c_dict
85
116
86
117
87
118
# 定义stats_text函数
You can’t perform that action at this time.
0 commit comments