Skip to content

Commit 15d678c

Browse files
authored
Merge pull request #2448 from shannoxu/master
1901010059 day10
2 parents c8e704a + 5928004 commit 15d678c

File tree

3 files changed

+2286
-0
lines changed

3 files changed

+2286
-0
lines changed

exercises/1901010059/day10/main.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
2+
3+
# Script: read the Tang-poems data file as raw text and print the 20 most
# frequent Chinese words found in it.
import json
from pathlib import Path

from mymodule.stats_word import stats_text_cn as cn

# tang300.json lives in the same directory as this script. Resolving it
# relative to __file__ (instead of a hard-coded absolute D:\ path) lets the
# script run from any checkout location and on any operating system.
path = Path(__file__).resolve().parent / 'tang300.json'

# The file is read as plain text (it is not parsed as JSON); the statistics
# function takes care of discarding every non-Chinese character.
with open(path, 'r', encoding='UTF-8') as f:
    a = f.read()

# Pick any one of the functions and call it with `a` to check that the
# argument type check works.
print(cn(a, 20))
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
2+
3+
import re
4+
import collections  # provides Counter, used for the frequency statistics
5+
# Module-level placeholder (zero). The functions in this module take their
# own `count` parameter, which shadows this name inside their bodies.
count = 0
6+
def stats_text_en(en, count):
    """Return the ``count`` most frequent English words in ``en``.

    A "word" is a maximal run of ASCII letters (A-Z, a-z). Every other
    character (digits, punctuation, Chinese characters, whitespace) acts
    as a separator. Counting is case-sensitive.

    Args:
        en: The text to analyse; must be a ``str``.
        count: How many of the most common words to return, passed
            straight to ``Counter.most_common``.

    Returns:
        A list of ``(word, occurrences)`` tuples, most frequent first.

    Raises:
        ValueError: If ``en`` is not a string.
    """
    if not isinstance(en, str):
        raise ValueError("请输入字符串")

    # Character ranges in the pattern:
    #   \u0041-\u005a  upper-case ASCII letters
    #   \u0061-\u007a  lower-case ASCII letters
    # Anything outside those ranges is turned into a space so that a plain
    # whitespace split yields the individual words.
    letters_only = re.sub(u"([^\u0041-\u005a\u0061-\u007a])", " ", en)
    word_counts = collections.Counter(letters_only.split())
    return word_counts.most_common(count)
20+
21+
22+
import jieba
23+
def stats_text_cn(cn, count):
    """Return the ``count`` most frequent Chinese words in ``cn``.

    The text is segmented with ``jieba`` and only words of two or more
    characters are counted.

    Args:
        cn: The text to analyse; must be a ``str``.
        count: How many of the most common words to return, passed
            straight to ``Counter.most_common``.

    Returns:
        A list of ``(word, occurrences)`` tuples, most frequent first.

    Raises:
        ValueError: If ``cn`` is not a string.
    """
    if not isinstance(cn, str):
        raise ValueError("请输入字符串")

    words = []
    # Segment each contiguous run of Chinese characters (\u4e00-\u9fa5) on
    # its own. Deleting the non-Chinese characters first and segmenting the
    # concatenated remainder would glue together text from both sides of
    # punctuation and line breaks, letting the segmenter produce words that
    # never appear in the original text.
    for run in re.findall(u"[\u4e00-\u9fa5]+", cn):
        words.extend(word for word in jieba.cut(run) if len(word) >= 2)

    return collections.Counter(words).most_common(count)
35+
36+
37+
def stats_text(j, count):
    """Return combined Chinese and English word statistics for ``j``.

    Args:
        j: The text to analyse.
        count: How many top entries to request from each of the two
            per-language statistics functions.

    Returns:
        The list produced by ``stats_text_cn`` followed by the list
        produced by ``stats_text_en`` (Chinese results first).
    """
    chinese_stats = stats_text_cn(j, count)
    english_stats = stats_text_en(j, count)
    return chinese_stats + english_stats

0 commit comments

Comments
 (0)