# models.py (forked from levitation/toxic_comments)

from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from keras import regularizers
from keras.utils import multi_gpu_model
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Bidirectional, LSTM, BatchNormalization
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import Input, Concatenate, GlobalAveragePooling1D, concatenate


def get_cnn(embedding_matrix, num_classes, max_seq_len, num_filters=64, l2_weight_decay=0.0001, dropout_val=0.5, dense_dim=32, add_sigmoid=True, train_embeds=False, gpus=0):
    """Two-block 1D CNN over pretrained word embeddings."""
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights=[embedding_matrix], input_length=max_seq_len, trainable=train_embeds))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(dropout_val))
    model.add(Dense(dense_dim, activation='relu', kernel_regularizer=regularizers.l2(l2_weight_decay)))
    if add_sigmoid:
        model.add(Dense(num_classes, activation='sigmoid'))
    if gpus > 1:  # multi_gpu_model requires at least two GPUs
        model = multi_gpu_model(model, gpus=gpus)
    return model
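

# Usage sketch (an assumption, not the original training script): the builders
# return uncompiled models, so compile with a multi-label loss before fitting.
# `embedding_matrix`, `x_train_seq`, `y_train`, `x_val_seq` and `y_val` are
# hypothetical names for objects the calling script is expected to provide.
#
#   model = get_cnn(embedding_matrix, num_classes=6, max_seq_len=100)
#   model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#   model.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_val))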


def get_lstm(embedding_matrix, num_classes, max_seq_len, l2_weight_decay=0.0001, lstm_dim=50, dropout_val=0.3, dense_dim=32, add_sigmoid=True, train_embeds=False, gpus=0):
    """Bidirectional LSTM with global max pooling over the sequence outputs."""
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights=[embedding_matrix], input_length=max_seq_len, trainable=train_embeds))
    model.add(Bidirectional(LSTM(lstm_dim, return_sequences=True)))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(dropout_val))
    model.add(Dense(lstm_dim, activation='relu'))
    model.add(Dropout(dropout_val))
    model.add(Dense(dense_dim, activation='relu', kernel_regularizer=regularizers.l2(l2_weight_decay)))
    if add_sigmoid:
        model.add(Dense(num_classes, activation='sigmoid'))
    if gpus > 1:  # multi_gpu_model requires at least two GPUs
        model = multi_gpu_model(model, gpus=gpus)
    return model


def get_concat_model(embedding_matrix, num_classes, max_seq_len, n_layers=10, concat=0, pool='max', num_filters=64, l2_weight_decay=0.0001, lstm_dim=50, dropout_val=0.5, dense_dim=32, add_sigmoid=True, train_embeds=False, gpus=0):
    """Ensemble of the dense, LSTM and CNN branches, concatenated before the head.

    NOTE: get_dense_model() is not defined in this module; it is assumed to be
    provided elsewhere in the project with the signature used below.
    """
    model_dense = get_dense_model(embedding_matrix,
                                  num_classes,
                                  max_seq_len,
                                  100,
                                  n_layers,
                                  concat,
                                  dropout_val,
                                  l2_weight_decay,
                                  pool,
                                  add_sigmoid=False,
                                  train_embeds=train_embeds)
    model_lstm = get_lstm(embedding_matrix,
                          num_classes,
                          max_seq_len,
                          l2_weight_decay,
                          lstm_dim,
                          dropout_val,
                          dense_dim,
                          add_sigmoid=False,
                          train_embeds=train_embeds)
    model_cnn = get_cnn(embedding_matrix,
                        num_classes,
                        max_seq_len,
                        num_filters,
                        l2_weight_decay,
                        dropout_val,
                        dense_dim,
                        add_sigmoid=False,
                        train_embeds=train_embeds)
    # The Keras 1 Merge layer was removed in Keras 2, so the branch outputs are
    # concatenated with the functional API, every branch fed from one shared input.
    inputs = Input(shape=(max_seq_len,))
    merged = concatenate([model_lstm(inputs), model_cnn(inputs), model_dense(inputs)])
    x = BatchNormalization()(merged)
    x = Dropout(dropout_val)(x)
    x = Dense(dense_dim, activation='relu', kernel_regularizer=regularizers.l2(l2_weight_decay))(x)
    if add_sigmoid:
        x = Dense(num_classes, activation='sigmoid')(x)
    model = Model(inputs=inputs, outputs=x)
    if gpus > 1:  # multi_gpu_model requires at least two GPUs
        model = multi_gpu_model(model, gpus=gpus)
    return model
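

# Design note: because the three branches above share a single Input, the merged
# model is trained on one padded-sequence array; a Keras 1 Merge-based Sequential
# would instead expect the same array to be passed once per branch in a list.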


def get_tfidf(x_train, x_val, x_test, max_features=50000):
    """Fit word- and char-level TF-IDF on the train split and transform all splits."""
    # word tokens are whole alphanumeric runs; analyzer='char' ignores
    # token_pattern, so the char vectorizer relies on ngram_range alone
    word_tfidf = TfidfVectorizer(max_features=max_features, analyzer='word', lowercase=True, ngram_range=(1, 3), token_pattern='[a-zA-Z0-9]+')
    char_tfidf = TfidfVectorizer(max_features=max_features, analyzer='char', lowercase=True, ngram_range=(1, 5))
    train_tfidf_word = word_tfidf.fit_transform(x_train)
    val_tfidf_word = word_tfidf.transform(x_val)
    test_tfidf_word = word_tfidf.transform(x_test)
    train_tfidf_char = char_tfidf.fit_transform(x_train)
    val_tfidf_char = char_tfidf.transform(x_val)
    test_tfidf_char = char_tfidf.transform(x_test)
    train_tfidf = sparse.hstack([train_tfidf_word, train_tfidf_char])
    val_tfidf = sparse.hstack([val_tfidf_word, val_tfidf_char])
    test_tfidf = sparse.hstack([test_tfidf_word, test_tfidf_char])
    return train_tfidf, val_tfidf, test_tfidf, word_tfidf, char_tfidf


def get_most_informative_features(vectorizers, clf, n=20):
    """Return the n most negative and n most positive features of a linear classifier."""
    feature_names = []
    for vectorizer in vectorizers:
        feature_names.extend(vectorizer.get_feature_names())
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    # lowest-weighted features first, then the top n in descending order
    return coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]
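

# Usage sketch (assumption: a scikit-learn linear classifier such as
# LogisticRegression is trained per label on the stacked TF-IDF features;
# `x_train`, `x_val`, `x_test` and `y_train` come from the calling script):
#
#   from sklearn.linear_model import LogisticRegression
#   train_tfidf, val_tfidf, test_tfidf, word_tfidf, char_tfidf = get_tfidf(x_train, x_val, x_test)
#   clf = LogisticRegression().fit(train_tfidf, y_train[:, 0])  # first label only
#   negative, positive = get_most_informative_features([word_tfidf, char_tfidf], clf)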


def save_predictions(df, predictions, target_labels, additional_name=None):
    """Write one prediction column per label into df (modified in place)."""
    for i, label in enumerate(target_labels):
        if additional_name is not None:
            label = '{}_{}'.format(additional_name, label)
        df[label] = predictions[:, i]
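

if __name__ == '__main__':
    # Minimal smoke test (a sketch; the sizes below are illustrative assumptions,
    # not values from the original project). get_concat_model is skipped because
    # it depends on get_dense_model, which lives outside this module.
    import numpy as np

    embedding_matrix = np.random.rand(1000, 50)  # (vocab_size, embedding_dim)
    num_classes, max_seq_len = 6, 100            # six labels in the toxic-comments task

    for builder in (get_cnn, get_lstm):
        model = builder(embedding_matrix, num_classes, max_seq_len)
        model.compile(optimizer='adam', loss='binary_crossentropy')
        model.summary()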