|
37 | 37 | '''
|
38 | 38 |
|
def stats_text_en(text):
    """Count how often each English word occurs in *text*.

    Everything before the first ASCII letter (A-Z / a-z) is discarded. In
    the remainder, ``--``, ``!``, ``*`` and ``,`` are deleted, ``.`` becomes
    a space, and the result is split on whitespace. Tokens are counted
    case-sensitively and are not filtered any further, so a non-English
    token that follows the first letter is counted as well.

    The resulting counter is also printed to stdout.

    Args:
        text: The string to analyse.

    Returns:
        A ``collections.Counter`` mapping each token to its number of
        occurrences (empty when *text* contains no ASCII letter), or
        ``None`` when *text* is not a ``str``; in that case a message is
        printed instead of raising.
    """
    # Best-effort contract: report a wrong argument type, do not raise.
    if not isinstance(text, str):
        print("English sorting: TypeError catched!")
        return None

    # Index of the first ASCII letter; None for empty or letter-free input,
    # so such input yields an empty counter instead of a bogus token.
    start = next(
        (
            index
            for index, char in enumerate(text)
            if 'A' <= char <= 'Z' or 'a' <= char <= 'z'
        ),
        None,
    )

    words = []
    if start is not None:
        cleaned = text[start:]
        # Applied one after another, in this order: '--' is handled first,
        # so a '--' that only appears after '!' is removed stays in place.
        for old, new in (('--', ''), ('!', ''), ('*', ''), ('.', ' '), (',', '')):
            cleaned = cleaned.replace(old, new)
        words = cleaned.split()

    counter_en = collections.Counter(words)
    print("\n\nEN words frequency: ")
    print(counter_en)
    return counter_en
| 66 | + |
| 67 | + |
| 68 | + |
63 | 69 |
|
64 | 70 |
|
65 | 71 |
|
66 | 72 |
|
def stats_text_cn(text):
    """Count how often each Chinese character occurs in *text*.

    Only characters in the range U+4E00..U+9FFF are kept; everything else
    (letters, digits, punctuation, whitespace) is ignored. Each kept
    character is counted on its own.

    The resulting counter is also printed to stdout.

    Args:
        text: The string to analyse.

    Returns:
        A ``collections.Counter`` mapping each kept character to its number
        of occurrences, or ``None`` when filtering *text* raises
        ``TypeError`` (for example when it is not iterable); in that case a
        message is printed instead of raising.
    """
    try:
        # Only the filtering step can raise TypeError, so only it is guarded.
        text_cn = ''.join(ch for ch in text if '\u4e00' <= ch <= '\u9fff')
    except TypeError:
        print("Chinese sorting: TypeError catched!")
        return None

    # Counting a str counts its individual characters.
    counter_cn = collections.Counter(text_cn)
    print("CN wrods frequency: ")
    print(counter_cn)
    return counter_cn
93 | 102 |
|
94 |
| -#print(stats_text_cn(text_cn)) |
95 | 103 |
|
96 | 104 |
|
def stats_text(text):
    """Run the Chinese and then the English frequency statistics on *text*.

    Calls ``stats_text_cn`` followed by ``stats_text_en`` and discards their
    return values. A ``TypeError`` raised by either call is reported with a
    printed message instead of propagating; if the first call raises, the
    second one is not run.

    Args:
        text: The text handed unchanged to both statistics functions.

    Returns:
        None.
    """
    try:
        for analyse in (stats_text_cn, stats_text_en):
            analyse(text)
    except TypeError:
        print("Text sorting: TypeError catched!")
101 | 113 |
|
0 commit comments