import numpy as np
import requests
import re
-# TODO add sentence tokenizer


def sigmoid(x):
@@ -88,16 +87,14 @@ def fit(self, x, label):

        for t in reversed(range(0, n_t)):
            t_idx = np.arange(t * b_size, (t + 1) * b_size)
-            if t < n_t - 1:
-                grad_h[t_idx] += (
-                    dsigmoid(grad_f[t_idx + b_size], f[t_idx + b_size]) @ self.u_f.T +
-                    dsigmoid(grad_i[t_idx + b_size], i[t_idx + b_size]) @ self.u_i.T +
-                    dsigmoid(grad_o[t_idx + b_size], o[t_idx + b_size]) @ self.u_o.T +
-                    dtanh(grad_c_bar[t_idx + b_size], c_bar[t_idx + b_size]) @ self.u_c.T
-                )
-            grad_c[t_idx] = o[t_idx] * grad_h[t_idx] * (1 - np.square(np.tanh(c[t_idx])))
-            if t < n_t - 1:
-                grad_c[t_idx] += f[t_idx + b_size] * grad_c[t_idx + b_size]
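+            # Boundary case: at t = n_t - 1 there is no later timestep, so
+            # t_idx_next falls back to t_idx; the recurrent terms below then read
+            # grad_* entries that are still zero (assuming the gradient buffers
+            # are zero-initialized), so the result is unchanged.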
+            t_idx_next = t_idx + b_size if t < n_t - 1 else t_idx
+            grad_h[t_idx] += (
+                dsigmoid(grad_f[t_idx_next], f[t_idx_next]) @ self.u_f.T +
+                dsigmoid(grad_i[t_idx_next], i[t_idx_next]) @ self.u_i.T +
+                dsigmoid(grad_o[t_idx_next], o[t_idx_next]) @ self.u_o.T +
+                dtanh(grad_c_bar[t_idx_next], c_bar[t_idx_next]) @ self.u_c.T
+            )
+            grad_c[t_idx] = o[t_idx] * grad_h[t_idx] * (1 - np.square(np.tanh(c[t_idx]))) + f[t_idx_next] * grad_c[t_idx_next]
            grad_f[t_idx] = grad_c[t_idx] * c_prev[t_idx]
            grad_i[t_idx] = grad_c[t_idx] * c_bar[t_idx]
            grad_o[t_idx] = grad_h[t_idx] * tanh(c[t_idx])
@@ -149,16 +146,14 @@ def gradient_check(self, x, label):

        for t in reversed(range(0, n_t)):
            t_idx = np.arange(t * n_data, (t + 1) * n_data)
-            if t < n_t - 1:
-                grad_h[t_idx] += (
-                    dsigmoid(grad_f[t_idx + n_data], f[t_idx + n_data]) @ self.u_f.T +
-                    dsigmoid(grad_i[t_idx + n_data], i[t_idx + n_data]) @ self.u_i.T +
-                    dsigmoid(grad_o[t_idx + n_data], o[t_idx + n_data]) @ self.u_o.T +
-                    dtanh(grad_c_bar[t_idx + n_data], c_bar[t_idx + n_data]) @ self.u_c.T
-                )
-            grad_c[t_idx] = o[t_idx] * grad_h[t_idx] * (1 - np.square(np.tanh(c[t_idx])))
-            if t < n_t - 1:
-                grad_c[t_idx] += f[t_idx + n_data] * grad_c[t_idx + n_data]
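+            # Same boundary trick as in fit(): the grad_* buffers are still zero
+            # at t_idx when t = n_t - 1, so folding the two branches is safe.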
+            t_idx_next = t_idx + n_data if t < n_t - 1 else t_idx
+            grad_h[t_idx] += (
+                dsigmoid(grad_f[t_idx_next], f[t_idx_next]) @ self.u_f.T +
+                dsigmoid(grad_i[t_idx_next], i[t_idx_next]) @ self.u_i.T +
+                dsigmoid(grad_o[t_idx_next], o[t_idx_next]) @ self.u_o.T +
+                dtanh(grad_c_bar[t_idx_next], c_bar[t_idx_next]) @ self.u_c.T
+            )
+            grad_c[t_idx] = o[t_idx] * grad_h[t_idx] * (1 - np.square(np.tanh(c[t_idx]))) + f[t_idx_next] * grad_c[t_idx_next]
            grad_f[t_idx] = grad_c[t_idx] * c_prev[t_idx]
            grad_i[t_idx] = grad_c[t_idx] * c_bar[t_idx]
            grad_o[t_idx] = grad_h[t_idx] * tanh(c[t_idx])
@@ -171,39 +166,30 @@ def gradient_check(self, x, label):
            h_prev.T @ dsigmoid(grad_f, f), h_prev.T @ dsigmoid(grad_i, i), h_prev.T @ dtanh(grad_c_bar, c_bar), h_prev.T @ dsigmoid(grad_o, o), h.T @ grad_v,
            constant @ dsigmoid(grad_f, f), constant @ dsigmoid(grad_i, i), constant @ dtanh(grad_c_bar, c_bar), constant @ dsigmoid(grad_o, o), constant @ grad_v
        ]):
-            params_a = [param.copy() for param in self.param_list]
-            params_b = [param.copy() for param in self.param_list]
-            params_a[j][index] += eps
-            params_b[j][index] -= eps
-
-            w_f_a, w_i_a, w_c_a, w_o_a, u_f_a, u_i_a, u_c_a, u_o_a, u_v_a, b_f_a, b_i_a, b_c_a, b_o_a, b_v_a = params_a
-            w_f_b, w_i_b, w_c_b, w_o_b, u_f_b, u_i_b, u_c_b, u_o_b, u_v_b, b_f_b, b_i_b, b_c_b, b_o_b, b_v_b = params_b
-            h_a, f_a, i_a, c_a, o_a, c_bar_a, h_b, f_b, i_b, c_b, o_b, c_bar_b = [
-                np.zeros((n_t * n_data, self.n_hidden)) for _ in range(12)
-            ]
-
-            for t in range(n_t):
-                t_idx = np.arange(t * n_data, (t + 1) * n_data)
-                t_idx_prev = t_idx - n_data if t > 0 else t_idx
-
-                xt_batch, ht_prev_a, ht_prev_b = x_batch[t_idx], h_a[t_idx_prev], h_b[t_idx_prev]
-                f_a[t_idx] = sigmoid(xt_batch @ w_f_a + ht_prev_a @ u_f_a + b_f_a)
-                i_a[t_idx] = sigmoid(xt_batch @ w_i_a + ht_prev_a @ u_i_a + b_i_a)
-                o_a[t_idx] = sigmoid(xt_batch @ w_o_a + ht_prev_a @ u_o_a + b_o_a)
-                c_bar_a[t_idx] = tanh(xt_batch @ w_c_a + ht_prev_a @ u_c_a + b_c_a)
-                c_a[t_idx] = f_a[t_idx] * c_a[t_idx_prev] + i_a[t_idx] * c_bar_a[t_idx]
-                h_a[t_idx] = o_a[t_idx] * tanh(c_a[t_idx])
-
-                f_b[t_idx] = sigmoid(xt_batch @ w_f_b + ht_prev_b @ u_f_b + b_f_b)
-                i_b[t_idx] = sigmoid(xt_batch @ w_i_b + ht_prev_b @ u_i_b + b_i_b)
-                o_b[t_idx] = sigmoid(xt_batch @ w_o_b + ht_prev_b @ u_o_b + b_o_b)
-                c_bar_b[t_idx] = tanh(xt_batch @ w_c_b + ht_prev_b @ u_c_b + b_c_b)
-                c_b[t_idx] = f_b[t_idx] * c_b[t_idx_prev] + i_b[t_idx] * c_bar_b[t_idx]
-                h_b[t_idx] = o_b[t_idx] * tanh(c_b[t_idx])
-
-            pred_a = cross_entropy(softmax(h_a @ u_v_a + b_v_a), y)
-            pred_b = cross_entropy(softmax(h_b @ u_v_b + b_v_b), y)
-            print('gradient_check', j, ((pred_a - pred_b) / eps / 2 - grad[index]) / eps / eps)
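+            # Central difference: one forward pass per sign nudges params[j][index]
+            # by +eps / -eps instead of the duplicated a/b passes removed above;
+            # preds[1] holds the loss at +eps and preds[0] at -eps, so
+            # (preds[1] - preds[0]) / (2 * eps) approximates the gradient.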
+            preds = [0, 0]
+            for sign in [+1, -1]:
+                params = [param.copy() for param in self.param_list]
+                params[j][index] += sign * eps
+
+                w_f_a, w_i_a, w_c_a, w_o_a, u_f_a, u_i_a, u_c_a, u_o_a, u_v_a, b_f_a, b_i_a, b_c_a, b_o_a, b_v_a = params
+                h_a, f_a, i_a, c_a, o_a, c_bar_a = [
+                    np.zeros((n_t * n_data, self.n_hidden)) for _ in range(6)
+                ]
+
+                for t in range(n_t):
+                    t_idx = np.arange(t * n_data, (t + 1) * n_data)
+                    t_idx_prev = t_idx - n_data if t > 0 else t_idx
+
+                    xt_batch, ht_prev_a = x_batch[t_idx], h_a[t_idx_prev]
+                    f_a[t_idx] = sigmoid(xt_batch @ w_f_a + ht_prev_a @ u_f_a + b_f_a)
+                    i_a[t_idx] = sigmoid(xt_batch @ w_i_a + ht_prev_a @ u_i_a + b_i_a)
+                    o_a[t_idx] = sigmoid(xt_batch @ w_o_a + ht_prev_a @ u_o_a + b_o_a)
+                    c_bar_a[t_idx] = tanh(xt_batch @ w_c_a + ht_prev_a @ u_c_a + b_c_a)
+                    c_a[t_idx] = f_a[t_idx] * c_a[t_idx_prev] + i_a[t_idx] * c_bar_a[t_idx]
+                    h_a[t_idx] = o_a[t_idx] * tanh(c_a[t_idx])
+
+                preds[(sign + 1) // 2] = cross_entropy(softmax(h_a @ u_v_a + b_v_a), y)
+            print('gradient_check', j, ((preds[1] - preds[0]) / eps / 2 - grad[index]) / eps / eps)


    def sgd(self, grad_list):
@@ -270,7 +256,7 @@ def text_generation(use_word=True):
    word_to_ix = {word: i for i, word in enumerate(words)}
    ix_to_word = {i: word for i, word in enumerate(words)}

-    seq_length = 25
+    seq_length = 50
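+    # doubled from 25; presumably unrolls twice as many timesteps per training sequence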
    indices = np.vectorize(word_to_ix.get)(np.array(list(text)))
    data = np.zeros((text_size, vocab_size))
    data[np.arange(text_size), indices] = 1
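
The sign loop introduced in gradient_check above is a standard central-difference check. As a standalone illustration of the same pattern (central_difference and the quadratic toy loss below are illustrative, not part of this commit):

import numpy as np

def central_difference(loss, theta, eps=1e-5):
    # Perturb one entry at a time by +/- eps, mirroring the sign loop in
    # gradient_check, and form (loss(theta + eps) - loss(theta - eps)) / (2 * eps).
    grad = np.zeros_like(theta)
    for idx in np.ndindex(*theta.shape):
        preds = [0.0, 0.0]
        for sign in (+1, -1):
            params = theta.copy()
            params[idx] += sign * eps
            preds[(sign + 1) // 2] = loss(params)
        grad[idx] = (preds[1] - preds[0]) / (2 * eps)
    return grad

# Toy check: for f(x) = sum(x**2) the exact gradient is 2x.
x = np.array([1.0, -2.0, 3.0])
print(central_difference(lambda p: np.sum(p ** 2), x))  # approx [ 2. -4.  6.]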