Skip to content

Commit fab42d6

Browse files
committed
no message
1 parent d223e3a commit fab42d6

File tree

2 files changed

+41
-5
lines changed

2 files changed

+41
-5
lines changed

19100102/daweijian/mymodule/main.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,13 @@
3535
'''
3636

3737
# 运行导入的函数
38+
from os import path
39+
40+
print(__file__, __name__)
41+
42+
fp = path.join(path.dirname(path.abspath(__file__)), 'tang300.json')
3843
try:
39-
with open('tang300.json', 'r', encoding='utf-8') as f:
44+
with open(fp, 'r', encoding='utf-8') as f:
4045
d = json.load(f)
4146
s = json.dumps(d, indent=2, ensure_ascii=False)
4247
count = 10

19100102/daweijian/mymodule/stats_word.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import re
33
from collections import Counter
44

5+
import jieba
6+
57

68
def clean_ip_list(words): # 清理掉空格 标点符号
79
i = 0
@@ -34,7 +36,7 @@ def list_dict(l):
3436
else: # 存在一个字符非英文 所以整个词非英文单词
3537
b = False
3638
break
37-
if (b):
39+
if b:
3840
if word in cadiz:
3941
cadiz[word] += 1
4042
else:
@@ -70,18 +72,47 @@ def cut_count_cn(c, regex): # 取出所有中文 是一个列表
7072
return cadiz
7173

7274

75+
def clean_not_cn(words):
    """Remove, in place, every word that contains a non-Chinese character.

    A character counts as Chinese when it lies in the CJK Unified
    Ideographs range U+4E00..U+9FFF, both endpoints included (so the
    common character U+4E00 is kept).

    Args:
        words: Mutable list of strings; it is filtered in place.

    Returns:
        The same list object, now holding only all-Chinese words.
    """
    # Slice assignment keeps the caller's list object, so code relying on
    # the in-place mutation still observes the filtered result.
    words[:] = [
        word for word in words
        if all('\u4e00' <= char <= '\u9fff' for char in word)
    ]
    return words
88+
89+
90+
def clean_len_less_2(words):
    """Remove, in place, every word shorter than two characters.

    Args:
        words: Mutable list of strings; it is filtered in place.

    Returns:
        The same list object, now holding only words of length >= 2.
    """
    # One pass instead of repeated list.remove() calls, each of which
    # rescans the list from the start; slice assignment preserves the
    # caller's list object.
    words[:] = [word for word in words if len(word) >= 2]
    return words
98+
99+
73100
def stats_text_cn(s, count):
    """Print and return the most frequent Chinese words found in a text.

    The text is segmented with jieba.lcut, the tokens are passed through
    clean_not_cn and clean_len_less_2, and the survivors are counted.

    Args:
        s: Text to analyse; must be a str.
        count: Number of most frequent words to report.

    Returns:
        List of (word, frequency) tuples as produced by
        Counter.most_common(count).

    Raises:
        ValueError: If s is not a str.
    """
    # Validate first so no segmentation work happens on bad input.
    if not isinstance(s, str):
        raise ValueError("is not str")

    words = jieba.lcut(s)
    # Drop tokens that are not Chinese words.
    words = clean_not_cn(words)
    # Drop tokens shorter than two characters.
    words = clean_len_less_2(words)

    # Counter ranks by frequency; keep the top `count` entries.
    c_dict = Counter(words).most_common(count)
    print(c_dict)
    # Return the result as well as printing it, so callers can reuse it.
    return c_dict
85116

86117

87118
# 定义stats_text函数

0 commit comments

Comments
 (0)