File tree Expand file tree Collapse file tree 3 files changed +2286
-0
lines changed
exercises/1901010059/day10 Expand file tree Collapse file tree 3 files changed +2286
-0
lines changed Original file line number Diff line number Diff line change
1
+
2
+
3
+ import json
4
+
5
from pathlib import Path

from mymodule.stats_word import stats_text_cn as cn

# Locate the data file relative to this script instead of through an absolute
# path on one developer's machine, so the script runs from any checkout.
path = Path(__file__).resolve().parent / 'tang300.json'

# The file is read as plain text; it is not parsed with the json module.
with open(path, 'r', encoding='UTF-8') as f:
    a = f.read()

# Pick any one of the statistics functions and call it with `a` to check that
# its parameter type validation works; here the 20 most common entries are
# requested from stats_text_cn.
print(cn(a, 20))
Original file line number Diff line number Diff line change
1
+
2
+
3
+ import re
4
+ import collections #引入正则表达式
5
# Module-level default for `count`; the functions below take their own
# `count` parameter, which shadows this name inside them.
count = 0
6
def stats_text_en(en, count):
    """Return the most frequent English words found in ``en``.

    Every character that is not an ASCII letter (A-Z, a-z) is replaced by a
    space, the result is split on whitespace, and the resulting words are
    counted. Matching is case-sensitive.

    Args:
        en: The text to analyse.
        count: Maximum number of entries to return; passed unchanged to
            ``collections.Counter.most_common``.

    Returns:
        A list of ``(word, frequency)`` tuples, most frequent first.

    Raises:
        ValueError: If ``en`` is not a ``str``.
    """
    if not isinstance(en, str):
        raise ValueError("请输入字符串")

    # The character class must hold only the two letter ranges. Stray spaces
    # inside it would create ranges such as " -Z" and " -z", which let digits
    # and punctuation survive and end up glued to the words.
    words = re.sub(r"[^A-Za-z]", " ", en).split()

    # Counter does both the counting and the ordering by frequency.
    return collections.Counter(words).most_common(count)
20
+
21
+
22
+ import jieba
23
def stats_text_cn(cn, count):
    """Return the most frequent Chinese words found in ``cn``.

    All characters outside the range U+4E00..U+9FA5 are removed, the
    remaining text is segmented with ``jieba.cut``, and only segments of at
    least two characters are counted.

    Args:
        cn: The text to analyse.
        count: Maximum number of entries to return; passed unchanged to
            ``collections.Counter.most_common``.

    Returns:
        A list of ``(word, frequency)`` tuples, most frequent first.

    Raises:
        ValueError: If ``cn`` is not a ``str``.
    """
    if not isinstance(cn, str):
        raise ValueError("请输入字符串")

    # The character class must hold only the single CJK range. Stray spaces
    # inside it would create the range " -\u9fa5", which keeps ASCII and most
    # other non-Chinese characters instead of removing them.
    chinese_only = re.sub(r"[^\u4e00-\u9fa5]", "", cn)

    # Single-character segments are dropped; only words of length >= 2 count.
    words = [word for word in jieba.cut(chinese_only) if len(word) >= 2]

    return collections.Counter(words).most_common(count)
35
+
36
+
37
def stats_text(j, count):
    """Return the Chinese word statistics of ``j`` followed by the English ones.

    Both ``stats_text_cn`` and ``stats_text_en`` are called with the same
    ``j`` and ``count``; their result lists are concatenated, Chinese first.
    """
    chinese_part = stats_text_cn(j, count)
    english_part = stats_text_en(j, count)
    return chinese_part + english_part
You can’t perform that action at this time.
0 commit comments