Skip to content

Commit c12885a

Browse files
committed
Create 1001S01E06_stats_word.py
1 parent 513b680 commit c12885a

File tree

1 file changed

+95
-0
lines changed

1 file changed

+95
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
"""Word-frequency helpers for English words and Chinese characters.

Exercise 6.1: create a function named stats_text_en that wraps the code
from d5_exercise_stats_text.py.
"""

# Sample input: the Zen of Python followed by one Chinese sentence.
text = '''
The Zen of Python, by Tim Peters
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
Python是一种计算机程序设计语言。是一种动态的、面向对象的脚本语言,最初被设计用于编写自动化脚本(shell),随着版本的不断更新和语言新功能的添加,越来越多被用于独立的、大型项目的开发。
'''

# Module-level dictionaries; the names are kept so that any existing
# reference to them keeps resolving.
dict1 = {}
dict2 = {}
dict3 = {}
dict4 = {}


def stats_text_en(text):
    """Count how often each English word occurs in *text*.

    A word is a maximal run of ASCII letters. Every other character
    (digits, punctuation, apostrophes, non-ASCII text) separates words,
    so "aren't" contributes the two words "aren" and "t". Matching is
    case-sensitive.

    Args:
        text: The string to analyse.

    Returns:
        A new dict mapping each word to its number of occurrences,
        ordered from most to least frequent. Words with equal counts
        keep the order in which they first appear in *text*.
    """
    import re
    from collections import Counter

    words = re.findall(r"[A-Za-z]+", text)
    # Count into a fresh Counter on every call so results never leak from
    # one call into the next; most_common() sorts stably by count.
    return dict(Counter(words).most_common())

# Print the English word-frequency result for the sample text.
print("统计英文词频的结果为:")
print(stats_text_en(text))


# Exercise 6.2: create a function named stats_text_cn that counts how
# often each Chinese character occurs.


def histogram(s, old_d):
    """Add the occurrence count of every item in *s* to *old_d*.

    Args:
        s: An iterable of hashable items, e.g. a string of characters.
        old_d: A dict of existing counts. It is updated in place, not
            copied.

    Returns:
        The same *old_d* object, after the counts from *s* were added.
    """
    for item in s:
        old_d[item] = old_d.get(item, 0) + 1
    return old_d


def stats_text_cn(text):
    """Count how often each Chinese character occurs in *text*.

    ASCII letters and ASCII digits are removed first; of what remains,
    every word character is counted individually and everything else
    (whitespace, punctuation) is ignored.

    NOTE: the filter is "word character that is not an ASCII letter or
    digit", so an underscore or a letter from another non-ASCII script
    is counted as well, not only CJK ideographs.

    Args:
        text: The string to analyse.

    Returns:
        A new dict mapping each character to its number of occurrences,
        ordered from most to least frequent. Characters with equal
        counts keep the order in which they first appear in *text*.
    """
    import re
    from collections import Counter

    without_ascii = re.sub("[A-Za-z0-9]", "", text)
    chars = re.findall(r"\w", without_ascii)
    # Count into a fresh Counter on every call so results never leak from
    # one call into the next; most_common() sorts stably by count.
    return dict(Counter(chars).most_common())


# Report the Chinese character frequencies for the sample text.
print("统计中文词频的结果为:", stats_text_cn(text), sep="\n")

0 commit comments

Comments
 (0)