-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkeyWords.py
112 lines (89 loc) · 4.07 KB
/
keyWords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import data_processing as dp
import initialize as init
from collections import Counter
"""
Makes 2 .txt files with the key words used by each member and by each party, grouped by year (in ascending order, from the oldest date to the newest).
Uses term frequency to determine importance.
"""
def _write_keywords(path, grouped, top_n):
    """Write the ``top_n`` most frequent terms of every speaker, year by year.

    Args:
        path: Destination of the text file (opened for writing as UTF-8).
        grouped: ``{year: {speaker: [preprocessed_text, ...]}}`` mapping.
        top_n: How many of the most frequent terms to list per speaker.
    """
    # The context manager closes the file even if a write fails midway.
    with open(path, "w", encoding="utf-8") as out:
        # Sort the years so the output is ascending regardless of row order
        # in the input data (4-character year strings sort chronologically).
        for year in sorted(grouped):
            out.write('Year: ' + str(year) + '\n============================\n============================\n')
            for speaker, pieces in grouped[year].items():
                out.write(speaker + ':\n')
                # Joining once and splitting on single spaces yields the same
                # tokens as repeatedly concatenating the texts with ' '.
                term_counts = Counter(' '.join(pieces).split(' '))
                out.write(', '.join(term for term, _ in term_counts.most_common(top_n)))
                out.write('\n-------------\n')


def find_KeyWords(increment=5, top_n=15):
    """Generate the key-word files for members and for parties.

    Loads the data through ``init.readCSV()``, runs ``dp.preprocess`` on the
    sampled speeches that have more than 100 space-separated tokens, groups
    the preprocessed text by year (last four characters of ``sitting_date``)
    and by member / party, and writes the most frequent terms of each group
    to ``.\\generated_files\\MemberKeyWords.txt`` and
    ``.\\generated_files\\PartyKeyWords.txt``. Progress is printed to stdout.

    Args:
        increment: Sampling step; only rows whose position is a multiple of
            this value are processed (higher == less data, 1 == all data).
            Values below 1 are reset to 1 with a printed notice.
        top_n: Number of most frequent terms written per member / party.

    Returns:
        None. The results are written to the two text files.
    """
    Data, stop_words_array = init.readCSV()
    Data_list = Data['speech'].values.tolist()
    Data_length = len(Data_list)

    if increment <= 0:
        print('Increment can\'t be less than 1. (Set automatically to 1)')
        increment = 1

    # Look the columns up once, not on every iteration.
    member_names = Data['member_name']
    party_names = Data['political_party']
    sitting_dates = Data['sitting_date']

    # {year: {member or party: [preprocessed speech, ...]}}
    keywords_by_member = {}
    keywords_by_party = {}
    past_percentage = 0

    # Processes the speeches and groups them by year for members and parties.
    print('Processing: 0%')
    for index, speech in enumerate(Data_list):
        if len(speech.split(' ')) > 100 and index % increment == 0:
            result = dp.preprocess(speech, stop_words_array)
            name = member_names[index]
            party = party_names[index]
            # Skip rows whose member or party is not a string.
            # NOTE(review): `result` is joined as text below, so the `!= []`
            # check looks always true for string results - confirm what
            # dp.preprocess returns for an empty speech.
            if result != [] and isinstance(name, str) and isinstance(party, str):
                year = sitting_dates[index][-4:]
                # Collect the pieces in lists; they are joined once at write
                # time, avoiding repeated string concatenation.
                keywords_by_member.setdefault(year, {}).setdefault(name, []).append(result)
                keywords_by_party.setdefault(year, {}).setdefault(party, []).append(result)

        # Report progress only when the whole-number percentage changes.
        percentage = int((index + 1) / Data_length * 100)
        if past_percentage != percentage:
            print('Processing: ' + str(percentage) + '%')
            past_percentage = percentage
    print('Done!')

    # First file: the most frequent terms (key words) said by each member.
    # The backslash before the file name is escaped explicitly; the resulting
    # path string is the same as before, minus the invalid "\M" escape.
    _write_keywords(".\\generated_files\\MemberKeyWords.txt", keywords_by_member, top_n)
    # Second file: the most frequent terms (key words) said by each party.
    _write_keywords(".\\generated_files\\PartyKeyWords.txt", keywords_by_party, top_n)
    print('\nFiles made!')
##################################
##################################
# Module-level call: the key-word files are generated as soon as this file is
# run (note that importing the module triggers the same work).
find_KeyWords()