1
- # 示例字符串
2
- '''
3
- text =
4
- The Zen of Python, by Tim Peters
1
+ # this is the d8 exercise for errors and exceptions
2
+ # date : 2019.3.25
3
+ # author by : qiming
4
+ # Luchen:确实还是别人的代码,上上次课的一个内容感觉不太熟悉,我的进度有点卡在统计字频这件事上。
5
5
6
6
7
- Beautiful is better than ugly.
8
- Explicit is better than implicit.
9
- Simple is better than complex.
10
- Complex is better than complicated.
11
- Flat is better than nested.
12
- Sparse is better than dense.
13
- Readability counts.
14
- Special cases aren't special enough to break the rules.
15
- Although practicality beats purity.
16
- Errors should never pass silently.
17
- Unless explicitly silenced.
18
- In the face of ambxiguity, refuse the temptation to guess.
19
- There should be one-- and preferably only one --obvious way to do it.
20
- Although that way may not be obvious at first unless you're Dutch.
21
- Now is better than never.
22
- Although never is often better than *right* now.
23
- If the implementation is hard to explain, it's a bad idea.
24
- If the implementation is easy to explain, it may be a good idea.
25
- Namespaces are one honking great idea -- let's do more of those!
26
-
27
- Python是一种计算机程序设计语言。是一种动态的、面向对象的脚本语言,最初被设计用于编写自动化脚本(shell),随着版本的不断更新和语言新功能的添加,越来越多被用于独立的、大型项目的开发。
28
-
29
- '''
30
7
import collections
31
8
import re
32
9
33
def stats_text_en(string_en):
    """Print the frequency of every English word found in ``string_en``.

    Every character that is not an ASCII letter is replaced by a space,
    the text is split on whitespace, and the resulting words are counted
    case-sensitively.

    Args:
        string_en: The text to analyse.

    Returns:
        None. The counts are written to standard output as a
        ``collections.Counter``.
    """
    # After the substitution only letters and spaces remain, so no further
    # punctuation stripping or blank-entry removal is needed.
    letters_only = re.sub("[^A-Za-z]", " ", string_en.strip())
    words = letters_only.split()
    print('英文单词词频统计结果: ', collections.Counter(words), '\n ')
52
-
53
-
54
def stats_text_cn(string_cn):
    """Print the frequency of every Chinese character in ``string_cn``.

    Only characters in the range U+4E00..U+9FFF are counted; Latin text,
    digits, whitespace and punctuation are ignored.

    Args:
        string_cn: The text to analyse.

    Returns:
        None. A dict ordered from most to least frequent character is
        written to standard output; characters with equal counts keep
        their order of first appearance.
    """
    # The character class must hold exactly one range; any extra character
    # inside the brackets would widen what is matched.
    han_chars = ''.join(re.findall(r'[\u4e00-\u9fff]+', string_cn))
    ranked = collections.Counter(han_chars).most_common()
    print('中文汉字字频统计结果: ', dict(ranked))
82
-
83
- # 调用函数
84
- #stats_text_en(text)
85
- #stats_text_cn(text)
86
-
87
10
def stats_text_en(en):
    """Count how often each English word occurs in ``en``.

    Every character other than an ASCII letter is treated as a separator,
    so digits, punctuation and CJK text never appear in the result.
    Counting is case-sensitive.

    Args:
        en: The text to analyse.

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences.

    Raises:
        ValueError: If ``en`` is not a string.
    """
    # isinstance rather than an exact type match, so str subclasses are
    # accepted as well.
    if not isinstance(en, str):
        raise ValueError('type of argumengt is not str')
    letters_only = re.sub("[^A-Za-z]", " ", en.strip())
    return collections.Counter(letters_only.split())
92
20
93
-
94
21
def stats_text_cn(cn):
    """Count how often each Chinese character occurs in ``cn``.

    Only characters in the range U+4E00..U+9FFF are counted; Latin text,
    digits, whitespace and punctuation are ignored.

    Args:
        cn: The text to analyse.

    Returns:
        A ``collections.Counter`` mapping each character to its number of
        occurrences.

    Raises:
        ValueError: If ``cn`` is not a string.
    """
    # isinstance rather than an exact type match, so str subclasses are
    # accepted as well.
    if not isinstance(cn, str):
        raise ValueError('type of argumengt is not str')
    # The character class must hold exactly one range; any extra character
    # inside the brackets would widen what is matched.
    han_runs = re.findall(r'[\u4e00-\u9fff]+', cn.strip())
    return collections.Counter(''.join(han_runs))
99
31
100
32
def stats_text(text_en_cn):
    """Count English words and Chinese characters in one mixed text.

    The text is passed to both ``stats_text_en`` and ``stats_text_cn`` and
    the two resulting counters are added together.

    Args:
        text_en_cn: The text to analyse.

    Returns:
        The sum of the English word counter and the Chinese character
        counter.

    Raises:
        ValueError: If ``text_en_cn`` is not a string.
    """
    # isinstance rather than an exact type match, so str subclasses are
    # accepted as well.
    if not isinstance(text_en_cn, str):
        raise ValueError('type of argumengt is not str')
    return stats_text_en(text_en_cn) + stats_text_cn(text_en_cn)
0 commit comments