#CSE 3521 Project: Logistic Regression comparison
#bag-of-words logistic regression on the SemEval2018 Task 3 (Task A) irony data
import numpy
import matplotlib.pyplot
#I realize that the proper pythonic way is 'import numpy as np'
#and 'import matplotlib.pyplot as plt', but I just prefer to call
#functions by their actual names
#first, we make a small template dictionary class that stores each unique word
#with a count attached to it, much like collections.Counter
#the counts serve as the bag-of-words feature vector X for a given sentence
class dict_num_count(dict):
    def append(self, word, generate_more_features = True):
        #unknown words are only added as new features while the vocabulary is
        #still being built; once it is frozen they are ignored
        if word not in self.keys():
            if generate_more_features:
                self[word] = 1
        else:
            self[word] += 1
    def reset(self):
        #zero every count so the same dictionary can be reused per sentence
        for x in self.keys():
            self[x] = 0
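#for illustration (a sketch, not part of the pipeline), the counter behaves like:
#    d = dict_num_count()
#    d.append("irony"); d.append("irony"); d.append("lol")
#    list(d.values())   -> [2, 1]
#    d.reset()
#    d.append("new", generate_more_features = False)
#    list(d.values())   -> [0, 0]   (unknown words are ignored once features are frozen)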
def create_word_feature_vector():
    #making the universal word dictionary, i.e. the template for the feature vector
    task_a_train_file = open("SemEval2018-Task3/datasets/train/SemEval2018-T3-train-taskA_emoji.txt", 'r', encoding='utf8')
    task_a_train_lines = task_a_train_file.readlines()
    task_a_train_file.close()
    #the first line of the file is a header, so drop it by index
    del task_a_train_lines[0]
    #initialize the bag of words vector
    th_dict = dict_num_count()
    th_answers = []
    stripped_sentences = []
    #we create a vector of the answers as well, with
    #1 for the positive class and 0 for the negative class
    for words2 in task_a_train_lines:
        wordl2 = words2.split()
        th_answers.append(int(wordl2[1]))
        #drop the tweet index and the gold label by position, so neither ends up
        #as a "word" feature; remove(wordl2[1]) would delete by value and could
        #strip the wrong token when index and label happen to match
        del wordl2[0:2]
        stripped_sentences.append(" ".join(wordl2))
        for w in wordl2:
            th_dict.append(w)
    th_dict.reset()
    #return the label-stripped sentences rather than the raw lines, so the gold
    #label never leaks into the features during training
    return stripped_sentences, th_dict, th_answers
def initialize_weight_vectors(template:dict, num_len):
    #making a contiguous array just in case it could possibly speed up update times
    unique_words = template.keys()
    total_weight_array = numpy.ascontiguousarray(numpy.ones((num_len, len(unique_words))))
    #weights initialized to .0005, a value taken from the 3/25 in-class example
    total_weight_array *= .0005
    return total_weight_array
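#in this project the weight array is a (2, vocabulary_size) matrix: row 0 holds
#the weights of the positive-class classifier and row 1 those of the
#negative-class one, as used in make_LR_for_sentences below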
def calculate_feature_vector(th_dict, sentence, generate_features = True):
    #calculates the feature vector X_i for a single sentence, where i is the
    #training sample number
    words1 = sentence.split()
    for w1 in words1:
        th_dict.append(w1, generate_features)
    #copy the counts out before resetting, since the dictionary is shared
    value_copy = list(th_dict.values())
    th_dict.reset()
    return value_copy
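#for illustration: with a frozen vocabulary of ["good", "day"], the sentence
#"good good day" comes back as the count vector [2, 1]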
'''A function for splitting up the correct y vector into separate per-class target vectors'''
def answer_split(ans):
    #theoretically this is a bit slow, since these could be initialized with numpy
    #and built in the same for loop as the original parsing function, but this way
    #makes me feel more confident in the approach
positive_ans = []
negative_ans = []
for x in ans:
if x == 1:
positive_ans.append(1)
else:
positive_ans.append(0)
for x2 in ans:
if x2 == 0:
negative_ans.append(1)
else:
negative_ans.append(0)
return numpy.array(positive_ans), numpy.array(negative_ans)
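#for illustration: answer_split([1, 0, 1, 0]) returns the pair of one-vs-rest
#target arrays ([1, 0, 1, 0], [0, 1, 0, 1])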
def make_LR_for_sentences(unique_word_dict, sentences, weight_vectors, gen_information = True):
    #first we build our X matrix: an MxN matrix where M is the number of
    #sentences and N is the number of parameters (the bag-of-words size)
    X = []
    for x in sentences:
        x_i = calculate_feature_vector(unique_word_dict, x, gen_information)
        X.append(x_i)
    X = numpy.array(X)
    #sigmoid of the linear score z = Xw for each of the two one-vs-rest classifiers
    positive_weight_vector = weight_vectors[0]
    positive_z = numpy.dot(X, positive_weight_vector)
    positive_z = 1/(1+numpy.exp(-(positive_z)))
    negative_weight_vector = weight_vectors[1]
    negative_z = numpy.dot(X, negative_weight_vector)
    negative_z = 1 / (1 + numpy.exp(-(negative_z)))
    return X, positive_z, negative_z
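#for reference, each one-vs-rest classifier computes P(y = c | x) = sigma(w_c . x)
#with sigma(z) = 1/(1 + e^(-z)); e.g. a score of 0 maps to probability 0.5, a
#score of 2 to about 0.88, and a score of -2 to about 0.12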
def train(template:dict_num_count, data_set, data_set_ans, weight_array, learning_rate, epochs=10):
    data_set_m = len(data_set)
    y_pos_ans, y_negative_ans = answer_split(data_set_ans)
    #setting up the loss arrays for each of the categories
    positive_loss = []
    negative_loss = []
    #data_set should be a list of label-stripped sentences, data_set_ans their classes
    for x in range(0, epochs):
        print("epoch: " + str(x))
        X, P_y1, P_y2 = make_LR_for_sentences(template, data_set, weight_array)
        #argmax over [P(class 0), P(class 1)] already yields labels in {0, 1},
        #which matches the answer encoding, so no shifting is needed
        y_pred = numpy.array([P_y2, P_y1])
        y_pred = numpy.argmax(y_pred, axis=0)
        predict_category(data_set_ans, y_pred)
        #record the loss of each one-vs-rest classifier so the returned loss
        #lists can actually be plotted
        save_loss(P_y1, y_pos_ans, positive_loss)
        save_loss(P_y2, y_negative_ans, negative_loss)
        avg_pos_grad = numpy.dot(X.transpose(), (P_y1 - y_pos_ans)) / data_set_m
        avg_negative_grad1 = numpy.dot(X.transpose(), (P_y2 - y_negative_ans)) / data_set_m
        avg_gradients = numpy.array([avg_pos_grad, avg_negative_grad1])
        avg_gradients *= learning_rate
        weight_array -= avg_gradients
    return weight_array, positive_loss, negative_loss
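#for reference, each epoch above performs one batch gradient-descent step per
#class: w_c <- w_c - alpha * (1/m) * X^T (sigma(X w_c) - y_c), which is the
#gradient of the cross-entropy loss used in save_loss below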
def save_loss(y1, y, updated_loss: list):
    #using the same cross-entropy loss function as in the 4/5 class
    len_of_dataset = len(y)
    #clip the predictions away from exactly 0 and 1 so the logs stay finite
    y1 = numpy.clip(y1, 1e-12, 1 - 1e-12)
    loss = -(1/len_of_dataset)*numpy.sum(y*numpy.log(y1) + (1-y)*numpy.log(1-y1))
    updated_loss.append(loss)
    print("loss: ", str(loss))
def make_graph(loss, class_type):
    #map the class code to a readable name for the plot title and file name
    if class_type == 1:
        str_var = "positive"
    elif class_type == 0:
        str_var = "neutral"
    else:
        str_var = "negative"
    #loss is the per-epoch loss array; plot it against 1-indexed epoch numbers
    e = numpy.arange(len(loss))
    f = loss
    e += 1
    fig, ax = matplotlib.pyplot.subplots()
    ax.plot(e, f)
    ax.set(xlabel='epoch (e)', ylabel='loss',
           title='loss over epoch graph for logistic regression of class ' + str_var)
    ax.grid()
    fig.savefig("test_" + str_var + ".png")
    matplotlib.pyplot.show()
def predict_category(y, y_pre):
    #this tallies correct and incorrect answers
    #y_pre is the predicted label vector, obtained by taking the argmax over the
    #two class probabilities
    y = numpy.array(y, dtype=int)
    correct = numpy.equal(y, y_pre)
    accuracy = numpy.sum(correct)/len(y)
    print("accuracy: " + str(accuracy))
def predict_test_category(feature_vectors, weight_values):
    #first, we get all the test data and load them into variables
    test_data = open("SemEval2018-Task3/datasets/goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt", encoding="utf8")
    task_a_test_lines = test_data.readlines()
    test_data.close()
    #drop the header line by index
    del task_a_test_lines[0]
    th_answers = []
    all_sentences = []
    #collect the answers (1 for the positive class, 0 for the negative class) and
    #the label-stripped sentences; the training vocabulary is reused here, so no
    #new features are generated at test time
    for words2 in task_a_test_lines:
        wordl2 = words2.split()
        th_answers.append(int(wordl2[1]))
        del wordl2[0:2]
        all_sentences.append(" ".join(wordl2))
    X, P_y1, P_y2 = make_LR_for_sentences(feature_vectors, all_sentences, weight_values, False)
    y_pred = numpy.array([P_y2, P_y1])
    y_pred = numpy.argmax(y_pred, axis=0)
    print("Test Data:\n")
    predict_category(th_answers, y_pred)
#here is the main code entry point
#task a regression
sentences, feature_vectors, answers = create_word_feature_vector()
epochs = 5
weight_vectors = initialize_weight_vectors(feature_vectors, 2)
completed_weight_vectors, positive_loss, negative_loss = train(feature_vectors, sentences, answers, weight_vectors, .1, epochs)
predict_test_category(feature_vectors, completed_weight_vectors)
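#a small optional sketch: plot the per-epoch losses that save_loss recorded during
#training (1 selects the "positive" title and file name, -1 the "negative" one);
#this assumes a display is available for matplotlib.pyplot.show()
make_graph(positive_loss, 1)
make_graph(negative_loss, -1)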
#task_b feature vector