|
| 1 | +#-*- coding: UTF-8 -*- |
| 2 | +import collections |
| 3 | +import os |
| 4 | + |
| 5 | +#text = |
| 6 | +''' |
| 7 | +The Zen of Python, by Tim Peters |
| 8 | +
|
| 9 | +
|
| 10 | +Beautiful is better than ugly. |
| 11 | +Explicit is better than implicit. |
| 12 | +Simple is better than complex. |
| 13 | +Complex is better than complicated. |
| 14 | +Flat is better than nested. |
| 15 | +Sparse is better than dense. |
| 16 | +Readability counts. |
| 17 | +Special cases aren't special enough to break the rules. |
| 18 | +Although practicality beats purity. |
| 19 | +Errors should never pass silently. |
| 20 | +Unless explicitly silenced. |
| 21 | +In the face of ambiguity, refuse the temptation to guess. |
| 22 | +There should be one-- and preferably only one --obvious way to do it. |
| 23 | +Although that way may not be obvious at first unless you're Dutch. |
| 24 | +Now is better than never. |
| 25 | +Although never is often better than *right* now. |
| 26 | +If the implementation is hard to explain, it's a bad idea. |
| 27 | +If the implementation is easy to explain, it may be a good idea. |
| 28 | +Namespaces are one honking great idea -- let's do more of those! |
| 29 | +''' |
| 30 | + |
| 31 | +#text_cn = |
| 32 | +''' |
| 33 | +
|
| 34 | +来自管理员童鞋的回复:可以自己定义哈,主要是实现函数的功能 |
| 35 | +完成时可以自己写一些测试的参数,检验自己的函数功能是否正确 |
| 36 | +
|
| 37 | +''' |
| 38 | + |
def stats_text_en(text):
    """Count how often each English word occurs in *text*.

    A word is a run of ASCII letters that may contain internal apostrophes
    or single hyphens ("aren't", "well-known").  Everything else -- digits,
    punctuation, whitespace, Chinese characters -- only separates words.
    Matching is case-sensitive, so "Simple" and "simple" are counted apart.

    The resulting counter is printed under an "EN words frequency: " heading
    and also returned.

    Args:
        text: The string to analyse; may be empty or contain no English.

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences (empty when *text* contains no English word).
    """
    # Function-scope import: ``re`` is not among the imports visible at the
    # top of this file.
    import re

    # Pull the words out directly instead of deleting a fixed list of
    # punctuation marks.  Deleting "," or "!" glues neighbouring words
    # together ("a,b" -> "ab"), and locating the first letter with an index
    # loop leaves the index unbound when the text is empty.
    words = re.findall(r"[A-Za-z]+(?:['-][A-Za-z]+)*", text)

    counter_en = collections.Counter(words)
    print("\n\nEN words frequency: ")
    print(counter_en)
    return counter_en
| 63 | + |
| 64 | + |
| 65 | + |
| 66 | + |
def stats_text_cn(text):
    """Count how often each Chinese character occurs in *text*.

    Only characters in the range U+4E00..U+9FFF are counted; every other
    character (ASCII, punctuation, whitespace, ...) is ignored.

    The resulting counter is printed under a "CN words frequency: " heading
    and also returned.

    Args:
        text: The string to analyse; may be empty or contain no Chinese.

    Returns:
        A ``collections.Counter`` mapping each counted character to its
        number of occurrences (empty when nothing in *text* is in range).
    """
    # One pass over the text: keep the in-range characters as a list so the
    # Counter can consume them directly, with no intermediate string.
    chinese_chars = [ch for ch in text if u'\u4e00' <= ch <= u'\u9fff']

    counter_cn = collections.Counter(chinese_chars)
    print("CN words frequency: ")
    print(counter_cn)
    return counter_cn
| 93 | + |
| 94 | +#print(stats_text_cn(text_cn)) |
| 95 | + |
| 96 | + |
def stats_text(text):
    """Run the Chinese and the English frequency analysis on *text*.

    ``stats_text_cn`` is called first, then ``stats_text_en``; each of them
    prints its own report.  Their results are combined and returned so that
    callers can use the statistics instead of only reading the printed
    output.

    Args:
        text: The string to analyse, passed unchanged to both helpers.

    Returns:
        A ``collections.Counter`` that is the sum of the two helper results.
    """
    counter_cn = stats_text_cn(text)
    counter_en = stats_text_en(text)
    # Counter addition sums the counts key by key into a new Counter and
    # leaves both operands untouched.
    return counter_cn + counter_en
| 101 | + |
0 commit comments