|
def stats_text_en(text):
    """Count how often each English word occurs in ``text``.

    The text is split on whitespace, the punctuation characters ``,.*-!``
    are removed from every token, and tokens that end up empty or that
    contain any non-ASCII character are discarded. Matching is
    case-sensitive, and other characters (e.g. apostrophes) are kept.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        Words with the same count keep the order of their first appearance,
        so the result is deterministic across runs.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    # Imported locally so the function does not depend on a file-level import.
    from collections import Counter

    if not isinstance(text, str):
        raise ValueError('参数必须是 str 类型,输入类型 %s' % type(text))

    # A single translate() pass removes every punctuation symbol at once.
    strip_symbols = str.maketrans('', '', ',.*-!')
    cleaned = (token.translate(strip_symbols) for token in text.split())
    words = [word for word in cleaned if word and word.isascii()]

    # Counter tallies in one pass (instead of list.count() per distinct word);
    # most_common() sorts by count descending and is stable for ties.
    return Counter(words).most_common()
| 19 | + |
| 20 | + |
def stats_text_cn(text):
    """Count how often each Chinese character occurs in ``text``.

    Only characters in the range U+4E00..U+9FFF are counted; everything
    else (Latin letters, digits, whitespace, punctuation) is ignored.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(character, count)`` tuples sorted by count, highest
        first. Characters with the same count keep the order of their first
        appearance, so the result is deterministic across runs.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    # Imported locally so the function does not depend on a file-level import.
    from collections import Counter

    if not isinstance(text, str):
        raise ValueError('参数必须是 str 类型,输入类型 %s' % type(text))

    cn_characters = [ch for ch in text if '\u4e00' <= ch <= '\u9fff']

    # Counter tallies in one pass (instead of list.count() per distinct
    # character); most_common() sorts by count descending, stable for ties.
    return Counter(cn_characters).most_common()
| 34 | + |
| 35 | + |
| 36 | + |
def stats_text(text):
    """Combine the English word counts and the Chinese character counts.

    Args:
        text: The string to analyse.

    Returns:
        The list returned by ``stats_text_en(text)`` followed by the list
        returned by ``stats_text_cn(text)``.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        english_counts = stats_text_en(text)
        chinese_counts = stats_text_cn(text)
        return english_counts + chinese_counts
    raise ValueError('参数必须是 str 类型,输入类型 %s' % type(text))
| 44 | + |
| 45 | + |
| 46 | + |
| 47 | + |
# Sample English text: "The Zen of Python" (PEP 20).
en_text = '''
The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
'''

# Sample text that mixes Chinese characters with a few English words.
cn_text = '''
Python 之禅 by Tim Peters

优美胜于丑陋
明了胜于晦涩
简洁胜于复杂
复杂胜于凌乱
扁平胜于嵌套
间隔胜于紧凑
可读性很重要
即便假借特里的实用性之名,也不可违背这些规则
不要包容所有错误,除非你确定需要这样做
当存在多种可能,不要尝试去猜测
而是尽量找一种,最好是唯一一种明显的解决方案
虽然这并不容易,因为你不是 Python 之父
做也许好过不做,但不假思索就动手还不如不做
。。。
'''

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    en_result = stats_text_en(en_text)
    cn_result = stats_text_cn(cn_text)
    print('统计参数中每个英文单词出现的次数 ==>\n', en_result)
    print('统计参数中每个中文汉字出现的次数 ==>\n', cn_result)
| 97 | + |
| 98 | + |
| 99 | + |
0 commit comments