-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodels-and-ensembling -VK Apr 29th.py
302 lines (237 loc) · 9.36 KB
/
models-and-ensembling -VK Apr 29th.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# %% [markdown]
# # Models and Ensembling Methods
# %% [markdown]
# ## Import dependencies
import numpy
from gensim.models import word2vec
from gensim.models import KeyedVectors
import pandas
from nltk import WordPunctTokenizer
from sklearn.preprocessing import label_binarize
import sqlite3
from sklearn.multiclass import OneVsRestClassifier
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_recall_curve # The average precision score in multi-label settings
from sklearn.metrics import average_precision_score
from sklearn import svm # Support Vector Machine
from itertools import cycle
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
# %% [markdown]
# ## Read in the data
# %% [markdown]
# ### Load raw train and test data
# %% [markdown]
# #### Load in the data from the database
# %%
# Read the cleaned train/test tables into DataFrames. The queries are
# read-only, so no commit is needed; try/finally guarantees the connection is
# closed even when one of the queries raises.
dbconn = sqlite3.connect('./data/cleanedtraintest_v2.db')
try:
    train_data_df = pandas.read_sql_query(
        'SELECT category, content_cleaned FROM train_data', dbconn)
    test_data_df = pandas.read_sql_query(
        'SELECT category, content_cleaned FROM test_data', dbconn)
finally:
    dbconn.close()
# %% [markdown]
# #### Check if the data was loaded correctly
# %%
# Bare expression: has no effect as a plain script statement; presumably meant
# to display the train DataFrame when the cell is run interactively.
train_data_df
# %%
# Same inspection for the test DataFrame.
test_data_df
# # %% [markdown]
# # #### Sample 4000 rows
# # %%
# train_data_sample = train_data_df #.sample(n=10000, replace=False, random_state=123)
# train_data_sample.head()
# # %%
# test_data_sample = test_data_df #.sample(n=4000, replace=False, random_state=123)
# test_data_sample.head()
# %% [markdown]
# #### Train & Test data where x is the predictor features, y is the predicted feature
# Number of target categories; matches the four labels given to label_binarize
N_CLASSES = 4
# Predictors: the cleaned document text of each split
x_train = train_data_df['content_cleaned']
x_test = test_data_df['content_cleaned']
# Targets: category ids 1-4 binarized against the fixed label list
y_train = label_binarize(train_data_df['category'], classes=[1, 2, 3, 4])
y_test = label_binarize(test_data_df['category'], classes=[1, 2, 3, 4])
# %% [markdown]
# ### Load word2vec data
# %% [markdown]
# #### Load word2vec feature arrays from .npz files
# load dict of arrays
# Open both saved word2vec feature archives (.npz)
w2v_train_features_array_dict = numpy.load('./data/word2vec-train-features-120000-min5dim300.npz')
w2v_test_features_array_dict = numpy.load('./data/word2vec-test-features-120000-min5dim300.npz')
# Pull the array stored under 'arr_0' from the train archive and show it
data = w2v_train_features_array_dict['arr_0']
print(data)
# Same for the test archive; note that `data` is rebound here
data = w2v_test_features_array_dict['arr_0']
print(data)
# %% [markdown]
# #### Load word2vec model trained key vectors
# Keyed vectors saved from the custom-trained word2vec model
w2v_model_train = KeyedVectors.load('./data/custom-trained-word2vec-120000-min5dim300.kv')
# %% [markdown]
# #### Get the word2vec data back into usable form
# Split every document of both splits into word/punctuation tokens
wpt = WordPunctTokenizer()
tokenized_corpus_train = list(map(wpt.tokenize, x_train))
tokenized_corpus_test = list(map(wpt.tokenize, x_test))
# %%
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary tokens of one document.

    Args:
        words: iterable of tokens making up the document.
        model: object indexable by token (``model[token]``) yielding a vector.
        vocabulary: container of the tokens that ``model`` can be indexed with.
        num_features: length of the embedding vectors.

    Returns:
        A vector of length ``num_features``; all zeros (float32) when none of
        the tokens is in ``vocabulary``.
    """
    # Tokens the model does not know are ignored entirely
    known_tokens = [token for token in words if token in vocabulary]

    total = numpy.zeros((num_features,), dtype="float32")
    for token in known_tokens:
        total = numpy.add(total, model[token])

    if not known_tokens:
        # Nothing to average: hand back the untouched zero vector
        return total
    return numpy.divide(total, float(len(known_tokens)))
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word-embedding row per tokenized document.

    Accepts either a full word2vec model (vectors under ``model.wv``) or a bare
    keyed-vectors object, and works with both the gensim 3.x vocabulary
    attribute (``index2word``) and its gensim 4 replacement (``index_to_key``).

    Args:
        corpus: iterable of tokenized documents (each an iterable of tokens).
        model: word2vec model or keyed vectors, indexable by token.
        num_features: length of the embedding vectors.

    Returns:
        numpy array with one averaged vector per document in ``corpus``.
    """
    # A bare KeyedVectors has no `.wv` in gensim 4 (it was only a deprecated
    # self-alias in 3.x), so fall back to the object itself.
    keyed_vectors = getattr(model, 'wv', model)
    # gensim 4 renamed `index2word` to `index_to_key`; prefer the new name.
    vocab_words = getattr(keyed_vectors, 'index_to_key', None)
    if vocab_words is None:
        vocab_words = keyed_vectors.index2word
    # Build the set once so per-token membership checks are O(1)
    vocabulary = set(vocab_words)
    features = [
        average_word_vectors(tokenized_sentence, keyed_vectors, vocabulary, num_features)
        for tokenized_sentence in corpus
    ]
    return numpy.array(features)
# %% [markdown]
# #### Obtain document level embeddings
# %%
# Embedding length passed to the vectorizer as num_features
FEATURE_SIZE = 300
# Average the word vectors of each document; the same trained vectors are
# used for both the train and the test split
w2v_feature_array_train = averaged_word_vectorizer(
    tokenized_corpus_train, w2v_model_train, FEATURE_SIZE)
w2v_feature_array_test = averaged_word_vectorizer(
    tokenized_corpus_test, w2v_model_train, FEATURE_SIZE)
# Wrap the feature arrays in DataFrames for the classifiers below
x_train_w2v = pandas.DataFrame(data=w2v_feature_array_train)
x_test_w2v = pandas.DataFrame(data=w2v_feature_array_test)
# %% [markdown]
# ## Build Models
# %% [markdown]
# ### SVM Model Building Functions
# SVM classifier function
def run_svm(x_train, y_train):
    """Fit a one-vs-rest linear SVM and return the fitted classifier.

    Args:
        x_train: training feature matrix.
        y_train: training targets.

    Returns:
        The fitted ``OneVsRestClassifier`` wrapping ``LinearSVC(random_state=1)``.
    """
    # fit() returns the estimator itself, so the call can be chained
    return OneVsRestClassifier(svm.LinearSVC(random_state=1)).fit(x_train, y_train)
# Calculate, then plot the Precision, Recall, Average Precision, F1
def prf1_calc(classifier, algo_name, n_classes, x_test, y_test):
    """Compute per-class and micro-averaged precision-recall metrics and plot them.

    Args:
        classifier: fitted multi-label classifier.
        algo_name: label forwarded to ``prf1_plot`` for the printed summary.
        n_classes: number of class columns in ``y_test`` / the score matrix.
        x_test: test feature matrix.
        y_test: binarized test targets, one column per class.

    Returns:
        A one-row DataFrame with columns ``'P-R 0'`` .. ``'P-R <n_classes-1>'``
        (per-class average precision) and ``'P-R Avg'`` (micro average), all
        rounded to 3 decimals.
    """
    # Precision-recall curves need a continuous ranking score. Prefer the
    # decision function, then class probabilities; hard 0/1 predictions are
    # only a last resort because they collapse the curve to a single
    # operating point.
    if hasattr(classifier, 'decision_function'):
        y_score = classifier.decision_function(x_test)
    elif hasattr(classifier, 'predict_proba'):
        y_score = classifier.predict_proba(x_test)
    else:
        y_score = classifier.predict(x_test)
    # Plain arrays so the [:, i] column slicing below works for any input type
    y_score = numpy.asarray(y_score)
    y_true = numpy.asarray(y_test)

    # Per-class curves and average precision
    precision = dict()
    recall = dict()
    average_precision = dict()
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_true[:, i],
                                                            y_score[:, i])
        average_precision[i] = average_precision_score(y_true[:, i], y_score[:, i])

    # A "micro-average": quantifying score on all classes jointly
    precision["micro"], recall["micro"], _ = precision_recall_curve(y_true.ravel(),
                                                                    y_score.ravel())
    average_precision["micro"] = average_precision_score(y_true, y_score,
                                                         average="micro")

    # Plot the data
    prf1_plot(precision, recall, average_precision, algo_name, n_classes)

    # One column per class (driven by n_classes rather than a fixed 0..3),
    # followed by the micro average
    summary = {}
    for i in range(n_classes):
        summary['P-R {0}'.format(i)] = numpy.round(average_precision[i], 3)
    summary['P-R Avg'] = numpy.round(average_precision['micro'], 3)
    return pandas.DataFrame([summary], columns=list(summary))
# Function to Plot Precision, Recall, F1
def prf1_plot(precision, recall, average_precision, algo_name, n_classes):
    """Print the micro-averaged average precision and draw two P-R figures.

    The first figure is the micro-averaged precision-recall step curve. The
    second overlays iso-F1 guide curves, the micro-averaged curve and one curve
    per class, with the legend placed below the axes.

    Args:
        precision: mapping with keys ``0..n_classes-1`` and ``"micro"`` to
            precision values.
        recall: mapping with the same keys to recall values.
        average_precision: mapping with the same keys to average precision.
        algo_name: label printed before the summary line.
        n_classes: number of per-class curves to draw.
    """
    micro_ap = average_precision["micro"]
    micro_recall = recall['micro']
    micro_precision = precision['micro']

    print(algo_name)
    print('Average precision score, micro-averaged over all classes: {0:0.2f}'
          .format(micro_ap))

    # --- Figure 1: micro-averaged precision-recall step curve ---
    plt.figure()
    plt.step(micro_recall, micro_precision, where='post')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(
        'Average precision score for, micro-averaged over all classes: AP={0:0.2f}'
        .format(micro_ap))

    # --- Figure 2: per-class curves on top of iso-F1 guides ---
    palette = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])
    plt.figure(figsize=(7, 8))
    legend_handles = []
    legend_labels = []

    # The recall grid is the same for every iso-F1 curve (linspace default: 50 points)
    recall_grid = numpy.linspace(0.01, 1)
    for f_score in numpy.linspace(0.2, 0.8, num=4):
        iso_precision = f_score * recall_grid / (2 * recall_grid - f_score)
        drawable = iso_precision >= 0
        iso_curve, = plt.plot(recall_grid[drawable], iso_precision[drawable],
                              color='gray', alpha=0.2)
        # Index 45 is a point near the right end of the grid; the label sits
        # just above the curve there
        plt.annotate('f1={0:0.1f}'.format(f_score),
                     xy=(0.9, iso_precision[45] + 0.02))
    # A single legend entry (the last iso curve drawn) stands for all of them
    legend_handles.append(iso_curve)
    legend_labels.append('iso-f1 curves')

    micro_curve, = plt.plot(micro_recall, micro_precision, color='gold', lw=2)
    legend_handles.append(micro_curve)
    legend_labels.append(
        'micro-average Precision-recall (area = {0:0.2f})'.format(micro_ap))

    for class_index, color in zip(range(n_classes), palette):
        class_curve, = plt.plot(recall[class_index], precision[class_index],
                                color=color, lw=2)
        legend_handles.append(class_curve)
        legend_labels.append(
            'Precision-recall for class {0} (area = {1:0.2f})'
            .format(class_index, average_precision[class_index]))

    # Leave room under the axes for the legend
    plt.gcf().subplots_adjust(bottom=0.25)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Extension of Precision-Recall curve to multi-class')
    plt.legend(legend_handles, legend_labels, loc=(0, -.5), prop=dict(size=14))
    plt.show()
# %% [markdown]
# ## Run the Models
# Get the SVM model fitted
svm_model = run_svm(x_train_w2v, y_train)
# %% [markdown]
# ## Get the scores
# Collects one row of average-precision metrics per algorithm
scores = pandas.DataFrame()
#%%
# For SVM calculate and plot the Precision, Recall, Avg Precision
# pandas.concat is used because DataFrame.append was removed in pandas 2.0
scores = pandas.concat(
    [scores, prf1_calc(svm_model, 'SVM', N_CLASSES, x_test_w2v, y_test)])
# %%
# Logistic Regression function
def run_logreg(x_train, y_train):
    """Fit a one-vs-rest logistic regression and return the fitted classifier.

    Args:
        x_train: training feature matrix.
        y_train: training targets.

    Returns:
        The fitted ``OneVsRestClassifier`` wrapping
        ``LogisticRegression(random_state=1)``.
    """
    base_estimator = LogisticRegression(random_state=1)
    one_vs_rest = OneVsRestClassifier(base_estimator)
    one_vs_rest.fit(x_train, y_train)
    return one_vs_rest
# %%
# Run Logistic Regression Model
logreg_model = run_logreg(x_train_w2v, y_train)
# %% For LOG REG calculate and plot the Precision, Recall, Avg Precision
# pandas.concat is used because DataFrame.append was removed in pandas 2.0
scores = pandas.concat(
    [scores, prf1_calc(logreg_model, 'LOGREG', N_CLASSES, x_test_w2v, y_test)])
# %%
# Naive Bayes Function
def run_nb(x_train, y_train):
    """Fit a one-vs-rest Gaussian naive Bayes and return the fitted classifier.

    Args:
        x_train: training feature matrix.
        y_train: training targets.

    Returns:
        The fitted ``OneVsRestClassifier`` wrapping ``GaussianNB()``.
    """
    # fit() returns the estimator itself, so the call can be chained
    return OneVsRestClassifier(GaussianNB()).fit(x_train, y_train)
# %%
# Run Naive Bayes Classifier
nb_model = run_nb(x_train_w2v, y_train)
# %% Precision, Recall, Avg. Precision for Naive Bayes
# pandas.concat is used because DataFrame.append was removed in pandas 2.0
scores = pandas.concat(
    [scores, prf1_calc(nb_model, 'N.B', N_CLASSES, x_test_w2v, y_test)])
# %% Decision Trees Function
def run_dectree(x_train, y_train):
    """Fit a one-vs-rest decision tree and return the fitted classifier.

    Args:
        x_train: training feature matrix.
        y_train: training targets.

    Returns:
        The fitted ``OneVsRestClassifier`` wrapping
        ``DecisionTreeClassifier()``.
    """
    base_estimator = tree.DecisionTreeClassifier()
    one_vs_rest = OneVsRestClassifier(base_estimator)
    one_vs_rest.fit(x_train, y_train)
    return one_vs_rest
# %%
# Run Decision Trees
dectree_model = run_dectree(x_train_w2v, y_train)
# %% Precision, Recall, Avg. Precision for Decision Trees
# pandas.concat is used because DataFrame.append was removed in pandas 2.0
scores = pandas.concat(
    [scores, prf1_calc(dectree_model, 'D.T', N_CLASSES, x_test_w2v, y_test)])
# %%