Skip to content

Commit 6bc20a1

Browse files
Andy910Wenyuhua1
authored and committed
自学训练营学习2群 day11 (#5167)
* Create stats_word.py
* Create main.py
* Create stats_word.py
* Create main.py
* Create main.py
* Create stats_word.py
* Create tang300.json
* Create tang300.json
* Create stats_word.py
* Create main.py
* Create main.py
* Create stats_word.py
* Create tang300.json
* Create wechat.py
1 parent a928743 commit 6bc20a1

File tree

6 files changed

+6811
-0
lines changed

6 files changed

+6811
-0
lines changed

1901010091/d09/mymodule/tang300.json

+2,235
Large diffs are not rendered by default.

1901010091/d10/mymodule/tang300.json

+2,235
Large diffs are not rendered by default.

1901010091/d11/mymodule/main.py

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Day-11 exercise script: load the JSON corpus, flatten it to one string and
# print word-frequency statistics through stats_word.stats_text.
import json

import stats_word

# NOTE(review): absolute, machine-specific path that points at the d09 copy of
# the corpus rather than a file next to this script -- confirm before running
# on another machine.
file_path='/Users/YY/Documents/GitHub/selfteaching-python-camp/1901010091/d09/mymodule/tang300.json'

# Number of most frequent entries each report should contain.
count=20

# Decode explicitly as UTF-8 (the standard JSON encoding) so the read does not
# depend on the platform's default locale encoding.
with open(file_path, encoding='utf-8') as f:
    js=json.load(f)

# Turn every top-level JSON item into its str() form and concatenate the
# pieces into a single text blob.
text=''.join(str(item) for item in js)

try:
    stats_word.stats_text(text,count)
except ValueError:
    # stats_text raises ValueError when its input is not a string.
    print("文本为非字符串")
15+

1901010091/d11/mymodule/stats_word.py

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
2+
def stats_text_en(text,count):
    """Print the ``count`` most frequent English words found in ``text``.

    ``text`` is split on whitespace, every character outside ``a-z`` /
    ``A-Z`` is removed from each token, and tokens that end up empty are
    dropped.  Matching is case-sensitive.

    Args:
        text: String to analyse.
        count: Number of entries to report; passed straight to
            ``collections.Counter.most_common``.

    Returns:
        None.  The list of ``(word, frequency)`` pairs is printed to stdout.
    """
    import collections
    import re

    non_english = re.compile(r'[^a-zA-Z]+')

    # Strip all non-English characters in a single pass.  Removing the matched
    # runs one after another with str.replace could leave characters behind
    # when an earlier removal changed a later run (e.g. "a,b,.c" -> "ab.c").
    stripped = (non_english.sub('', token) for token in text.split())
    words = [word for word in stripped if word]

    # Counter tallies in one pass over the words; entries with equal counts
    # are reported in first-seen order, so the output is deterministic.
    top_words = collections.Counter(words).most_common(count)
    print(top_words)
25+
26+
def stats_text_cn(text,count):
    """Print the ``count`` most frequent multi-character words in ``text``.

    ``text`` is segmented with ``jieba.lcut`` (``cut_all=False``).  Each token
    is stripped of surrounding whitespace; tokens that are shorter than two
    characters, or that occur verbatim inside ``exclude_str``, are ignored.

    Args:
        text: String to analyse.
        count: Number of entries to report; passed straight to
            ``collections.Counter.most_common``.

    Returns:
        None.  The list of ``(word, frequency)`` pairs is printed to stdout
        and also stored on the function as ``stats_text_cn.textout``.
    """
    import collections

    import jieba

    # Punctuation, digits, letters and field names to ignore.  The backslash
    # is written as "\\" because "\(" is not a valid escape sequence; the
    # resulting string value is unchanged.
    exclude_str=",]’ [。!!??、APPcontents' {},:type ,author\\()title【】id《》,.<>=:+-*——“”...%1234567890a10bcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQURSTUVWXYZ"

    seg_list=jieba.lcut(text,cut_all=False)

    word_counts=collections.Counter()
    for token in seg_list:
        # Strip once and use the stripped form for the length check, the
        # exclusion test and the key, so all three always agree.
        word=token.strip()
        # "in" on a str is a substring test: any word that appears verbatim
        # inside exclude_str is skipped.
        if len(word)>1 and word not in exclude_str:
            word_counts[word]+=1

    # Entries with equal counts are reported in first-seen order.
    textout=word_counts.most_common(count)

    # Keep the result reachable as a function attribute for callers that
    # read it after the call.
    stats_text_cn.textout=textout
    print(textout)
58+
59+
def stats_text(text,count):
    """Print English and then Chinese word-frequency statistics for ``text``.

    Args:
        text: String to analyse.
        count: Number of top entries each report should contain.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    # isinstance also accepts str subclasses, which an exact type()
    # comparison would reject.  ValueError is kept because existing callers
    # catch that exception type.
    if not isinstance(text,str):
        raise ValueError("文本为非字符串")
    stats_text_en(text,count)
    stats_text_cn(text,count)
64+

0 commit comments

Comments
 (0)