19100205 10day

11661246 · 11661246 · commit ef6e7d13a6a1 · 2019-03-30T22:56:05.000+08:00
diff --git a/19100205/11661246/d5_exercise_string.py b/19100205/11661246/d5_exercise_string.py
@@ -25,7 +25,7 @@
 
 # 删除包含ea的单词
 list_str = str.split () # 将字符串转换为列表
-list_str1 = []
+list_str1 = [] 
 for i in list_str:
     if 'ea' not in i:
         list_str1.append(i) # 此循环用if条件删除包含ea的单词
diff --git a/19100205/11661246/main.py b/19100205/11661246/main.py
@@ -26,5 +26,4 @@
 except ValueError:
     print('输入的不是文本格式，请重新输入：')   
     
-print(mymodule.stats_word.stats_text_cn(text))
-
+print(mymodule.stats_word.stats_text_cn(text))
diff --git a/19100205/11661246/mymodule/__pycache__/stats_word.cpython-37.pyc b/19100205/11661246/mymodule/__pycache__/stats_word.cpython-37.pyc
diff --git a/19100205/11661246/mymodule/stats_word.py b/19100205/11661246/mymodule/stats_word.py
@@ -1,3 +1,7 @@
+# encoding=utf-8
+import jieba #day10
+import collections
+
 # 函数1：统计输入文本中英文单词的词频：
 def stats_text_en(text):
     if not isinstance(text,str):
@@ -17,14 +21,19 @@ def stats_text_en(text):
 def stats_text_cn(text):  
+    """Return the most frequent Chinese words (two or more characters) in text as (word, count) pairs; raises ValueError for non-str input."""
     if not isinstance(text,str):
         raise ValueError('输入的不是文本格式，请重新输入：') # 第8天作业要求，添加参数类型检查
-    dic = {}
+    chars = []
     for i in text:  # 这个循环有效，说明一串汉字也是一个字符串，每个汉字就是其中的一个元素，可以用for in 来遍历，其中i代表了每个汉字的unicode编码
-        if u'\u4e00' <= i <= u'\u9fff':     # 挑选出中文字
-            dic[i] = text.count(i)      # 用.count()函数/方法来对每个元素（这里是汉字）进行计数，形成一个字典
-    import collections
+        # Keep characters in U+4E00..U+9FFF and blank the rest, so words never span punctuation.
+        chars.append(i if u'\u4e00' <= i <= u'\u9fff' else ' ')
+    # Precise-mode segmentation (cut_all=False) of the Chinese-only text.
+    seg_list = jieba.cut(''.join(chars), cut_all=False)
+    # Count only words of two or more characters; strip() discards whitespace-only tokens.
+    text1 = [j for j in seg_list if len(j.strip()) >= 2]
+    # The number of entries to return is read interactively from stdin.
     count = int(input('请输入要限制输出的元素个数：'))
-    dic = collections.Counter(dic).most_common(count)  #按出现次数从大到小排列
-    return dic
+    text1 = collections.Counter(text1).most_common(count)  # most frequent first, at most count entries
+    return text1
 
 
 # 函数3：统计中英文混合词频：
diff --git a/19100205/11661246/mymodule/test.py b/19100205/11661246/mymodule/test.py
@@ -0,0 +1,30 @@
+import jieba
+import collections
+# 函数2：统计输入文本中中文字的词频：
+def stats_text_cn(text):
+    """Return the most frequent Chinese words (two or more characters) in ``text``.
+
+    Characters outside U+4E00..U+9FFF are blanked, the rest is segmented by jieba,
+    and the number of entries to return is read from stdin.
+    Returns a list of (word, count) pairs; raises ValueError if ``text`` is not a str.
+    """
+    if not isinstance(text, str):
+        raise ValueError('输入的不是文本格式，请重新输入：')  # argument type check (day-8 exercise requirement)
+    # Blank non-Chinese characters with a space so words never span punctuation.
+    cleaned = ''.join(ch if u'\u4e00' <= ch <= u'\u9fff' else ' ' for ch in text)
+    # Precise mode (cut_all=False); strip() discards whitespace-only tokens.
+    words = [w for w in jieba.cut(cleaned, cut_all=False) if len(w.strip()) >= 2]
+    count = int(input('请输入要限制输出的元素个数：'))
+    # most_common() orders by descending frequency and truncates to ``count`` entries.
+    return collections.Counter(words).most_common(count)
+    
+with open('tang300.json','r',encoding='UTF-8') as f:  # read the whole file as UTF-8 text
+    a = f.read()
+try:
+    if not isinstance(a,str):  # a text-mode read() always yields str, so this guard never fires
+        raise ValueError()
+    
+except ValueError:
+    print('输入的不是文本格式，请重新输入：')   
+    
+print(stats_text_cn(a))  # outside the try: runs regardless, and prompts on stdin for the result limit
diff --git a/19100205/11661246/mymodule/text11.PY b/19100205/11661246/mymodule/text11.PY
@@ -1,5 +1,6 @@
 import collections
 import re
+import jieba
 def stats_text_en(en,count):  
     '''英文词频统计'''
     '''参数类型检查；如果输入参数不为字符串则抛出ValueError'''
@@ -37,4 +38,4 @@ def stats_text(text_e_c,count_e_c):
     else:
          raise ValueError("输入的不是字符串")
         
-           
+