1
+ import re
1
2
import os
2
3
import time
3
4
import math
16
17
from sklearn .utils import shuffle
17
18
from sklearn .metrics import accuracy_score
18
19
20
+ from model_py import Model
19
21
from opt import adam , warmup_cosine , warmup_linear , warmup_constant
20
22
from datasets import rocstories
21
23
from analysis import rocstories as rocstories_analysis
@@ -51,45 +53,44 @@ def __call__(self, X, Y, M, lm_logits, clf_logits, norm):
51
53
52
54
# Classification loss
53
55
clf_losses = self .clf_criterion (clf_logits , Y )
54
-
55
56
if lm_coef > 0 :
56
57
train_loss = clf_losses .sum () + lm_coef * lm_losses .sum ())
57
58
else :
58
59
train_loss = clf_losses .sum ()
59
60
return train_loss
60
61
61
- def mgpu_train (* xs ):
62
- gpu_ops = []
63
- gpu_grads = []
64
- xs = (tf .split (x , n_gpu , 0 ) for x in xs )
65
- for i , xs in enumerate (zip (* xs )):
66
- do_reuse = True if i > 0 else None
67
- with tf .device (assign_to_gpu (i , "/gpu:0" )), tf .variable_scope (tf .get_variable_scope (), reuse = do_reuse ):
68
- clf_logits , clf_losses , lm_losses = model (* xs , train = True , reuse = do_reuse )
69
- if lm_coef > 0 :
70
- train_loss = tf .reduce_mean (clf_losses ) + lm_coef * tf .reduce_mean (lm_losses )
71
- else :
72
- train_loss = tf .reduce_mean (clf_losses )
73
- params = find_trainable_variables ("model" )
74
- grads = tf .gradients (train_loss , params )
75
- grads = list (zip (grads , params ))
76
- gpu_grads .append (grads )
77
- gpu_ops .append ([clf_logits , clf_losses , lm_losses ])
78
- ops = [tf .concat (op , 0 ) for op in zip (* gpu_ops )]
79
- grads = average_grads (gpu_grads )
80
- grads = [g for g , p in grads ]
81
- train = opt_fns [opt ](params , grads , lr , partial (lr_schedules [lr_schedule ], warmup = lr_warmup ), n_updates_total , l2 = l2 , max_grad_norm = max_grad_norm , vector_l2 = vector_l2 , b1 = b1 , b2 = b2 , e = e )
82
- return [train ]+ ops
83
-
84
- def mgpu_predict (* xs ):
85
- gpu_ops = []
86
- xs = (tf .split (x , n_gpu , 0 ) for x in xs )
87
- for i , xs in enumerate (zip (* xs )):
88
- with tf .device (assign_to_gpu (i , "/gpu:0" )), tf .variable_scope (tf .get_variable_scope (), reuse = True ):
89
- clf_logits , clf_losses , lm_losses = model (* xs , train = False , reuse = True )
90
- gpu_ops .append ([clf_logits , clf_losses , lm_losses ])
91
- ops = [tf .concat (op , 0 ) for op in zip (* gpu_ops )]
92
- return ops
62
+ # def mgpu_train(*xs):
63
+ # gpu_ops = []
64
+ # gpu_grads = []
65
+ # xs = (tf.split(x, n_gpu, 0) for x in xs)
66
+ # for i, xs in enumerate(zip(*xs)):
67
+ # do_reuse = True if i > 0 else None
68
+ # with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse):
69
+ # clf_logits, clf_losses, lm_losses = model(*xs, train=True, reuse=do_reuse)
70
+ # if lm_coef > 0:
71
+ # train_loss = tf.reduce_mean(clf_losses) + lm_coef*tf.reduce_mean(lm_losses)
72
+ # else:
73
+ # train_loss = tf.reduce_mean(clf_losses)
74
+ # params = find_trainable_variables("model")
75
+ # grads = tf.gradients(train_loss, params)
76
+ # grads = list(zip(grads, params))
77
+ # gpu_grads.append(grads)
78
+ # gpu_ops.append([clf_logits, clf_losses, lm_losses])
79
+ # ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
80
+ # grads = average_grads(gpu_grads)
81
+ # grads = [g for g, p in grads]
82
+ # train = opt_fns[opt](params, grads, lr, partial(lr_schedules[lr_schedule], warmup=lr_warmup), n_updates_total, l2=l2, max_grad_norm=max_grad_norm, vector_l2=vector_l2, b1=b1, b2=b2, e=e)
83
+ # return [train]+ops
84
+
85
+ # def mgpu_predict(*xs):
86
+ # gpu_ops = []
87
+ # xs = (tf.split(x, n_gpu, 0) for x in xs)
88
+ # for i, xs in enumerate(zip(*xs)):
89
+ # with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=True):
90
+ # clf_logits, clf_losses, lm_losses = model(*xs, train=False, reuse=True)
91
+ # gpu_ops.append([clf_logits, clf_losses, lm_losses])
92
+ # ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
93
+ # return ops
93
94
94
95
def transform_roc (X1 , X2 , X3 ):
95
96
n_batch = len (X1 )
@@ -247,6 +248,7 @@ def predict():
247
248
+ [len (x1 [:max_len ])+ max (len (x2 [:max_len ]), len (x3 [:max_len ])) for x1 , x2 , x3 in zip (teX1 , teX2 , teX3 )]
248
249
)+ 3 , n_ctx
249
250
)
251
+ vocab = n_vocab + n_special + n_ctx
250
252
trX , trM = transform_roc (trX1 , trX2 , trX3 )
251
253
vaX , vaM = transform_roc (vaX1 , vaX2 , vaX3 )
252
254
if submit :
@@ -257,40 +259,39 @@ def predict():
257
259
n_batch_train = n_batch * n_gpu
258
260
n_updates_total = (n_train // n_batch_train )* n_iter
259
261
260
- X_train = tf .placeholder (tf .int32 , [n_batch_train , 2 , n_ctx , 2 ])
261
- M_train = tf .placeholder (tf .float32 , [n_batch_train , 2 , n_ctx ])
262
- X = tf .placeholder (tf .int32 , [None , 2 , n_ctx , 2 ])
263
- M = tf .placeholder (tf .float32 , [None , 2 , n_ctx ])
264
-
265
- Y_train = tf .placeholder (tf .int32 , [n_batch_train ])
266
- Y = tf .placeholder (tf .int32 , [None ])
267
-
268
- train , logits , clf_losses , lm_losses = mgpu_train (X_train , M_train , Y_train )
269
- clf_loss = tf .reduce_mean (clf_losses )
270
-
271
- params = find_trainable_variables ('model' )
272
- sess = tf .Session (config = tf .ConfigProto (allow_soft_placement = True ))
273
- sess .run (tf .global_variables_initializer ())
262
+ model = Model (vocab , cfg )
263
+ # TODO Initialize model
274
264
265
+ # Load weights from TF model
275
266
# The pretrained checkpoint is read from two JSON side files (one shape and
# one name per parameter) plus ten .npy shards. The side files are opened
# with context managers so the handles are closed as soon as they are parsed.
with open('model/params_shapes.json') as shapes_file:
    shapes = json.load(shapes_file)
with open('model/parameters_names.json') as names_file:
    names = json.load(names_file)

# Cumulative element counts: the positions at which the concatenated shards
# are cut back into one chunk per entry of `shapes`.
offsets = np.cumsum([np.prod(shape) for shape in shapes])
init_params = [np.load('model/params_{}.npy'.format(n)) for n in range(10)]
# np.split at `offsets` leaves one trailing chunk after the last offset; drop it.
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]

# Build the combined embedding matrix stored at index 0:
#   rows of init_params[1], then n_special freshly initialised rows
#   (normal noise scaled by 0.02), then the first n_ctx rows of init_params[0].
# NOTE(review): presumably init_params[1] holds the token embeddings and
# init_params[0] the position embeddings -- confirm against the checkpoint.
init_params[0] = init_params[0][:n_ctx]
init_params[0] = np.concatenate(
    [
        init_params[1],
        (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32),
        init_params[0],
    ],
    0,
)
# Entry 1 is now folded into entry 0, so remove it from the list.
del init_params[1]
283
-
284
275
# Convert the n_transfer setting into a slice bound over the parameter list:
# -1 maps to 0; any other value k maps to 1 + 12 * k.
n_transfer = 0 if n_transfer == -1 else 1 + n_transfer * 12
288
- sess .run ([p .assign (ip ) for p , ip in zip (params [:n_transfer ], init_params [:n_transfer ])])
289
-
290
- eval_mgpu_logits , eval_mgpu_clf_losses , eval_mgpu_lm_losses = mgpu_predict (X_train , M_train , Y_train )
291
- eval_logits , eval_clf_losses , eval_lm_losses = model (X , M , Y , train = False , reuse = True )
292
- eval_clf_loss = tf .reduce_mean (eval_clf_losses )
293
- eval_mgpu_clf_loss = tf .reduce_mean (eval_mgpu_clf_losses )
279
# Copy the pretrained arrays into the model, matching each array to a model
# attribute through its TF variable name.
# NOTE(review): this assumes `Model` is a PyTorch module whose leaves are
# tensors/Parameters (so `.data` can be rebound) -- confirm against model_py.
import torch  # local import: the file's import section is not fully visible here

# Embedding matrix first. Write through `.data` instead of rebinding the
# attribute to a raw numpy array.
assert model.embed.weight.shape == init_params[0].shape
model.embed.weight.data = torch.from_numpy(init_params[0])

# Compiled once, outside the loop. The capturing group makes split() return
# the digits as their own item: "h3" -> ['h', '3', ''], "attn" -> ['attn'].
index_pattern = re.compile(r'(\d+)')

for name, ip in zip(names[1:n_transfer], init_params[1:n_transfer]):
    name = name[6:]  # skip "model/"
    assert name[-2:] == ":0"
    name = name[:-2]

    # Walk the model attribute by attribute along the '/'-separated path.
    pointer = model
    for m_name in name.split('/'):
        parts = index_pattern.split(m_name)
        pointer = getattr(pointer, parts[0])
        # Only names that contain digits produce more than one part; the
        # previous `len(l) == 1` test raised IndexError on names without
        # digits and never indexed names with them.
        # NOTE(review): every name containing digits is treated as
        # "<attribute><index>"; confirm no attribute name itself has digits.
        if len(parts) >= 2:
            pointer = pointer[int(parts[1])]

    assert pointer.shape == ip.shape
    # `pointer = ip` only rebound the local name and left the model
    # untouched; store the values in the tensor that was reached instead.
    pointer.data = torch.from_numpy(ip)
294
295
295
296
n_updates = 0
296
297
n_epochs = 0
0 commit comments