Skip to content

Commit 299867e

Browse files
committed
Update lstm.py
1 parent 18085ff commit 299867e

File tree

1 file changed

+41
-55
lines changed

1 file changed

+41
-55
lines changed

lstm.py

+41 -55
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
import numpy as np
22
import requests
33
import re
4-
# TODO add sentence tokenizer
54

65

76
def sigmoid(x):
@@ -88,16 +87,14 @@ def fit(self, x, label):
8887

8988
for t in reversed(range(0, n_t)):
9089
t_idx = np.arange(t * b_size, (t + 1) * b_size)
91-
if t < n_t - 1:
92-
grad_h[t_idx] += (
93-
dsigmoid(grad_f[t_idx + b_size], f[t_idx + b_size]) @ self.u_f.T +
94-
dsigmoid(grad_i[t_idx + b_size], i[t_idx + b_size]) @ self.u_i.T +
95-
dsigmoid(grad_o[t_idx + b_size], o[t_idx + b_size]) @ self.u_o.T +
96-
dtanh(grad_c_bar[t_idx + b_size], c_bar[t_idx + b_size]) @ self.u_c.T
97-
)
98-
grad_c[t_idx] = o[t_idx] * grad_h[t_idx] * (1 - np.square(np.tanh(c[t_idx])))
99-
if t < n_t - 1:
100-
grad_c[t_idx] += f[t_idx + b_size] * grad_c[t_idx + b_size]
90+
t_idx_next = t_idx + b_size if t < n_t - 1 else t_idx
91+
grad_h[t_idx] += (
92+
dsigmoid(grad_f[t_idx_next], f[t_idx_next]) @ self.u_f.T +
93+
dsigmoid(grad_i[t_idx_next], i[t_idx_next]) @ self.u_i.T +
94+
dsigmoid(grad_o[t_idx_next], o[t_idx_next]) @ self.u_o.T +
95+
dtanh(grad_c_bar[t_idx_next], c_bar[t_idx_next]) @ self.u_c.T
96+
)
97+
grad_c[t_idx] = o[t_idx] * grad_h[t_idx] * (1 - np.square(np.tanh(c[t_idx]))) + f[t_idx_next] * grad_c[t_idx_next]
10198
grad_f[t_idx] = grad_c[t_idx] * c_prev[t_idx]
10299
grad_i[t_idx] = grad_c[t_idx] * c_bar[t_idx]
103100
grad_o[t_idx] = grad_h[t_idx] * tanh(c[t_idx])
@@ -149,16 +146,14 @@ def gradient_check(self, x, label):
149146

150147
for t in reversed(range(0, n_t)):
151148
t_idx = np.arange(t * n_data, (t + 1) * n_data)
152-
if t < n_t - 1:
153-
grad_h[t_idx] += (
154-
dsigmoid(grad_f[t_idx + n_data], f[t_idx + n_data]) @ self.u_f.T +
155-
dsigmoid(grad_i[t_idx + n_data], i[t_idx + n_data]) @ self.u_i.T +
156-
dsigmoid(grad_o[t_idx + n_data], o[t_idx + n_data]) @ self.u_o.T +
157-
dtanh(grad_c_bar[t_idx + n_data], c_bar[t_idx + n_data]) @ self.u_c.T
158-
)
159-
grad_c[t_idx] = o[t_idx] * grad_h[t_idx] * (1 - np.square(np.tanh(c[t_idx])))
160-
if t < n_t - 1:
161-
grad_c[t_idx] += f[t_idx + n_data] * grad_c[t_idx + n_data]
149+
t_idx_next = t_idx + n_data if t < n_t - 1 else t_idx
150+
grad_h[t_idx] += (
151+
dsigmoid(grad_f[t_idx_next], f[t_idx_next]) @ self.u_f.T +
152+
dsigmoid(grad_i[t_idx_next], i[t_idx_next]) @ self.u_i.T +
153+
dsigmoid(grad_o[t_idx_next], o[t_idx_next]) @ self.u_o.T +
154+
dtanh(grad_c_bar[t_idx_next], c_bar[t_idx_next]) @ self.u_c.T
155+
)
156+
grad_c[t_idx] = o[t_idx] * grad_h[t_idx] * (1 - np.square(np.tanh(c[t_idx]))) + f[t_idx_next] * grad_c[t_idx_next]
162157
grad_f[t_idx] = grad_c[t_idx] * c_prev[t_idx]
163158
grad_i[t_idx] = grad_c[t_idx] * c_bar[t_idx]
164159
grad_o[t_idx] = grad_h[t_idx] * tanh(c[t_idx])
@@ -171,39 +166,30 @@ def gradient_check(self, x, label):
171166
h_prev.T @ dsigmoid(grad_f, f), h_prev.T @ dsigmoid(grad_i, i), h_prev.T @ dtanh(grad_c_bar, c_bar), h_prev.T @ dsigmoid(grad_o, o), h.T @ grad_v,
172167
constant @ dsigmoid(grad_f, f), constant @ dsigmoid(grad_i, i), constant @ dtanh(grad_c_bar, c_bar), constant @ dsigmoid(grad_o, o), constant @ grad_v
173168
]):
174-
params_a = [param.copy() for param in self.param_list]
175-
params_b = [param.copy() for param in self.param_list]
176-
params_a[j][index]+=eps
177-
params_b[j][index]-=eps
178-
179-
w_f_a, w_i_a, w_c_a, w_o_a, u_f_a, u_i_a, u_c_a, u_o_a, u_v_a, b_f_a, b_i_a, b_c_a, b_o_a, b_v_a = params_a
180-
w_f_b, w_i_b, w_c_b, w_o_b, u_f_b, u_i_b, u_c_b, u_o_b, u_v_b, b_f_b, b_i_b, b_c_b, b_o_b, b_v_b = params_b
181-
h_a, f_a, i_a, c_a, o_a, c_bar_a, h_b, f_b, i_b, c_b, o_b, c_bar_b = [
182-
np.zeros((n_t * n_data, self.n_hidden)) for _ in range(12)
183-
]
184-
185-
for t in range(n_t):
186-
t_idx = np.arange(t * n_data, (t + 1) * n_data)
187-
t_idx_prev = t_idx - n_data if t > 0 else t_idx
188-
189-
xt_batch, ht_prev_a, ht_prev_b = x_batch[t_idx], h_a[t_idx_prev], h_b[t_idx_prev]
190-
f_a[t_idx] = sigmoid(xt_batch @ w_f_a + ht_prev_a @ u_f_a + b_f_a)
191-
i_a[t_idx] = sigmoid(xt_batch @ w_i_a + ht_prev_a @ u_i_a + b_i_a)
192-
o_a[t_idx] = sigmoid(xt_batch @ w_o_a + ht_prev_a @ u_o_a + b_o_a)
193-
c_bar_a[t_idx] = tanh(xt_batch @ w_c_a + ht_prev_a @ u_c_a + b_c_a)
194-
c_a[t_idx] = f_a[t_idx] * c_a[t_idx_prev] + i_a[t_idx] * c_bar_a[t_idx]
195-
h_a[t_idx] = o_a[t_idx] * tanh(c_a[t_idx])
196-
197-
f_b[t_idx] = sigmoid(xt_batch @ w_f_b + ht_prev_b @ u_f_b + b_f_b)
198-
i_b[t_idx] = sigmoid(xt_batch @ w_i_b + ht_prev_b @ u_i_b + b_i_b)
199-
o_b[t_idx] = sigmoid(xt_batch @ w_o_b + ht_prev_b @ u_o_b + b_o_b)
200-
c_bar_b[t_idx] = tanh(xt_batch @ w_c_b + ht_prev_b @ u_c_b + b_c_b)
201-
c_b[t_idx] = f_b[t_idx] * c_b[t_idx_prev] + i_b[t_idx] * c_bar_b[t_idx]
202-
h_b[t_idx] = o_b[t_idx] * tanh(c_b[t_idx])
203-
204-
pred_a = cross_entropy(softmax(h_a @ u_v_a + b_v_a), y)
205-
pred_b = cross_entropy(softmax(h_b @ u_v_b + b_v_b), y)
206-
print('gradient_check', j, ((pred_a - pred_b) / eps / 2 - grad[index])/eps/eps)
169+
preds = [0, 0]
170+
for sign in [+1, -1]:
171+
params = [param.copy() for param in self.param_list]
172+
params[j][index] += sign * eps
173+
174+
w_f_a, w_i_a, w_c_a, w_o_a, u_f_a, u_i_a, u_c_a, u_o_a, u_v_a, b_f_a, b_i_a, b_c_a, b_o_a, b_v_a = params
175+
h_a, f_a, i_a, c_a, o_a, c_bar_a = [
176+
np.zeros((n_t * n_data, self.n_hidden)) for _ in range(6)
177+
]
178+
179+
for t in range(n_t):
180+
t_idx = np.arange(t * n_data, (t + 1) * n_data)
181+
t_idx_prev = t_idx - n_data if t > 0 else t_idx
182+
183+
xt_batch, ht_prev_a = x_batch[t_idx], h_a[t_idx_prev]
184+
f_a[t_idx] = sigmoid(xt_batch @ w_f_a + ht_prev_a @ u_f_a + b_f_a)
185+
i_a[t_idx] = sigmoid(xt_batch @ w_i_a + ht_prev_a @ u_i_a + b_i_a)
186+
o_a[t_idx] = sigmoid(xt_batch @ w_o_a + ht_prev_a @ u_o_a + b_o_a)
187+
c_bar_a[t_idx] = tanh(xt_batch @ w_c_a + ht_prev_a @ u_c_a + b_c_a)
188+
c_a[t_idx] = f_a[t_idx] * c_a[t_idx_prev] + i_a[t_idx] * c_bar_a[t_idx]
189+
h_a[t_idx] = o_a[t_idx] * tanh(c_a[t_idx])
190+
191+
preds[(sign + 1) // 2] = cross_entropy(softmax(h_a @ u_v_a + b_v_a), y)
192+
print('gradient_check', j, ((preds[1] - preds[0]) / eps / 2 - grad[index])/eps/eps)
207193

208194

209195
def sgd(self, grad_list):
@@ -270,7 +256,7 @@ def text_generation(use_word=True):
270256
word_to_ix = {word:i for i, word in enumerate(words)}
271257
ix_to_word = {i:word for i, word in enumerate(words)}
272258

273-
seq_length = 25
259+
seq_length = 50
274260
indices = np.vectorize(word_to_ix.get)(np.array(list(text)))
275261
data = np.zeros((text_size, vocab_size))
276262
data[np.arange(text_size), indices] = 1

0 commit comments

Comments (0)