
Commit 1f209c4

first version of model + weights transfer
1 parent 5eb7937 commit 1f209c4

2 files changed: +95 -95 lines changed

model.py renamed to model_py.py

Lines changed: 40 additions & 41 deletions
@@ -6,24 +6,18 @@
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
 
-vocab = n_vocab + n_special + n_ctx
-
 def gelu(x):
     return 0.5*x*(1+torch.tanh(math.sqrt(2/math.pi)*(x+0.044715*torch.pow(x, 3))))
 
 def swish(x):
     return x*torch.sigmoid(x)
 
 ACT_FNS = {
-    'relu': nn.relu,
+    'relu': nn.ReLU,
     'swish': swish,
     'gelu': gelu
 }
 
-def clones(module, N):
-    "Produce N identical layers."
-    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
-
 
 class LayerNorm(nn.Module):
     "Construct a layernorm module (See citation for details)."
@@ -44,6 +38,7 @@ class Conv1D(nn.Module):
     def __init__(self, nf, rf, nx):
         super(Conv1D, self).__init__()
         self.rf = rf
+        self.nf = nf
         if rf == 1: # faster 1x1 conv
             self.w = Parameter(torch.ones(nx, nf)) # TODO change to random normal
             self.b = Parameter(torch.zeros(nf))
@@ -52,7 +47,7 @@ def __init__(self, nf, rf, nx):
 
     def forward(self, x):
         if self.rf == 1:
-            size_out = x.size()[:-1] + [nf]
+            size_out = x.size()[:-1] + [self.nf]
             x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
             x = x.view(*size_out)
         else:
@@ -61,14 +56,17 @@
 
 
 class Attention(nn.Module):
-    def __init__(self, nx, n_state, n_head, attn_pdrop, resid_pdrop, scale=False):
+    def __init__(self, nx, cfg, scale=False):
         super(Attention, self).__init__()
-        self.c_attn = Conv1D(n_state*3, 1, nx)
-        self.c_proj = Conv1D(n_state, 1, nx)
+        n_state = nx # in Attention: n_state=768 (nx=n_embed)
+        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
+        assert n_state % cfg.n_head == 0
+        self.n_head = cfg.n_head
         self.scale = scale
-        self.n_head = n_head
-        self.attn_dropout = nn.Dropout(attn_pdrop)
-        self.resid_dropout = nn.Dropout(resid_pdrop)
+        self.c_attn = Conv1D(n_state * 3, 1, nx)
+        self.c_proj = Conv1D(n_state, 1, nx)
+        self.attn_dropout = nn.Dropout(cfg.attn_pdrop)
+        self.resid_dropout = nn.Dropout(cfg.resid_pdrop)
 
     @staticmethod
     def mask_attn_weights(w):
@@ -87,12 +85,12 @@ def _attn(self, q, k, v):
 
     def merge_heads(self, x):
         new_x_shape = x.size()[:-2] + [np.prod(x.size()[-2:])]
-        x = x.view(*new_x_shape) # in Tensorflow version: merge_states
+        x = x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
         return x.permute(0, 2, 1, 3)
 
     def split_heads(self, x, k=False):
         new_x_shape = x.size()[:-1] + [self.n_head, x.size(-1)//self.n_head]
-        x = x.view(*new_x_shape) # in Tensorflow version: split_states
+        x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
         if k:
             return x.permute(0, 2, 3, 1)
         else:
@@ -112,53 +110,55 @@ def forward(self, x):
 
 
 class MLP(nn.Module):
-    def __init__(self, nx, n_state, afn, resid_pdrop):
+    def __init__(self, n_state, cfg): # in MLP: n_state=3072 (4 * n_embed)
         super(MLP, self).__init__()
+        nx = cfg.n_embed
         self.c_fc = Conv1D(n_state, 1, nx)
         self.c_proj = Conv1D(nx, 1, nx)
-        self.act = ACT_FNS[afn]
-        self.dropout = nn.Dropout(resid_pdrop)
+        self.act = ACT_FNS[cfg.afn]
+        self.dropout = nn.Dropout(cfg.resid_pdrop)
 
     def forward(self, x):
         h = self.act(self.c_fc(x))
-        h = self.c_proj(h)
-        return self.dropout(h)
+        h2 = self.c_proj(h)
+        return self.dropout(h2)
 
 
 class Block(nn.Module):
-    def __init__(self, nx, n_head, attn_pdrop, resid_pdrop, afn, scale=False):
+    def __init__(self, cfg, scale=False):
         super(Block, self).__init__()
-        self.attn = Attention(nx, nx, n_head, attn_pdrop, resid_pdrop, scale)
+        nx = cfg.n_embed
+        self.attn = Attention(nx, cfg, scale)
         self.ln_1 = LayerNorm(nx)
-        self.mlp = MLP(nx, nx*4, afn, resid_pdrop)
+        self.mlp = MLP(4*nx, cfg)
         self.ln_2 = LayerNorm(nx)
 
     def forward(self, x):
-        h = self.attn(x)
-        h = self.ln_1(x)
-        h = self.mlp(x)
-        h = self.ln_2(x)
+        a = self.attn(x)
+        n = self.ln_1(x+a)
+        m = self.mlp(n)
+        h = self.ln_2(n+m)
         return h
 
 
 class Model(nn.Module):
     """ Transformer model """
-    def __init__(self, vocab, n_embd, pdrop, n_layers,
-                 nx, n_head, attn_pdrop, resid_pdrop, afn):
+    def __init__(self, vocab, cfg):
         super(Model, self).__init__()
-        self.embed = nn.Embedding(vocab, n_embd)
-        self.drop = nn.Dropout(pdrop)
-        self.blocks = clones(Block(nx, n_head, attn_pdrop,
-                                   resid_pdrop, afn, scale=True), n_layers)
-        self.decoder = nn.Linear(nhid, vocab, bias=False)
-        self.decoder.weight = self.embed.weight
+        self.embed = nn.Embedding(vocab, cfg.n_embd)
+        self.drop = nn.Dropout(cfg.embd_pdrop)
+        block = Block(cfg, scale=True)
+        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(cfg.n_layer)])
+        self.decoder = nn.Linear(cfg.n_embed, vocab, bias=False)
+        self.decoder.weight = self.embed.weight # Tied weights
+        self.clf_dropout = nn.Dropout2d(cfg.clf_pdrop) # To reproduce the noise_shape parameter of TF implementation
 
     def forward(self, x, m):
         x = x.view(-1, x.size(2), x.size(3))
         m = m.view(-1, m.size(2))
         e = self.embed(x)
         h = e.sum(dim=2)
-        for block in self.blocks:
+        for block in self.h:
             h = block(h)
 
         # Language modeling logits
@@ -167,12 +167,11 @@ def forward(self, x, m):
 
         # Classification logits
         clf_h = h.view(-1, self.n_embed)
-        pool_idx = torch.eq(X[:, :, 0].contiguous().view(-1), self.clf_token)
+        pool_idx = torch.eq(x[:, :, 0].contiguous().view(-1), self.clf_token)
         clf_h = clf_h[pool_idx, :]
         clf_h = clf_h.view(-1, 2, self.n_embed, 1)
-        m = nn.Dropout2d(clf_pdrop) # To reproduce the noise_shape parameter of TF implementation
-        clf_h = m(clf_h)
+        clf_h = self.clf_dropout(clf_h)
         clf_h = clf_h.view(-1, self.n_embed)
         clf_logits = self.linear(clf_h)
 
-        return lm_logits, clf_logits
+        return lm_logits, clf_logits
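
Note on the new interface: the refactor threads a single cfg object through Attention, MLP, Block and Model instead of passing each hyper-parameter separately. Below is a minimal sketch of what such a config could look like, using a hypothetical types.SimpleNamespace stand-in and the sizes implied by the comments above (768-dim states, 12 heads); the diff reads both cfg.n_embd and cfg.n_embed, so both spellings are filled in here, and all values are illustrative only.

    from types import SimpleNamespace

    from model_py import Model

    # Hypothetical config; the field names are exactly the cfg.* attributes read in model_py.py.
    cfg = SimpleNamespace(
        n_embd=768, n_embed=768,          # both spellings are referenced in the diff
        n_head=12, n_layer=12,            # number of attention heads and of Blocks
        afn='gelu',                       # key into ACT_FNS
        embd_pdrop=0.1, attn_pdrop=0.1,   # dropout rates (illustrative values)
        resid_pdrop=0.1, clf_pdrop=0.1,
    )

    # vocab = n_vocab + n_special + n_ctx is computed in train.py below; 40990 is illustrative.
    model = Model(40990, cfg)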

train.py

Lines changed: 55 additions & 54 deletions
@@ -1,3 +1,4 @@
+import re
 import os
 import time
 import math
@@ -16,6 +17,7 @@
 from sklearn.utils import shuffle
 from sklearn.metrics import accuracy_score
 
+from model_py import Model
 from opt import adam, warmup_cosine, warmup_linear, warmup_constant
 from datasets import rocstories
 from analysis import rocstories as rocstories_analysis
@@ -51,45 +53,44 @@ def __call__(self, X, Y, M, lm_logits, clf_logits, norm):
 
         # Classification loss
         clf_losses = self.clf_criterion(clf_logits, Y)
-
         if lm_coef > 0:
             train_loss = clf_losses.sum() + lm_coef * lm_losses.sum()
         else:
             train_loss = clf_losses.sum()
         return train_loss
 
-def mgpu_train(*xs):
-    gpu_ops = []
-    gpu_grads = []
-    xs = (tf.split(x, n_gpu, 0) for x in xs)
-    for i, xs in enumerate(zip(*xs)):
-        do_reuse = True if i > 0 else None
-        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse):
-            clf_logits, clf_losses, lm_losses = model(*xs, train=True, reuse=do_reuse)
-            if lm_coef > 0:
-                train_loss = tf.reduce_mean(clf_losses) + lm_coef*tf.reduce_mean(lm_losses)
-            else:
-                train_loss = tf.reduce_mean(clf_losses)
-            params = find_trainable_variables("model")
-            grads = tf.gradients(train_loss, params)
-            grads = list(zip(grads, params))
-            gpu_grads.append(grads)
-            gpu_ops.append([clf_logits, clf_losses, lm_losses])
-    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
-    grads = average_grads(gpu_grads)
-    grads = [g for g, p in grads]
-    train = opt_fns[opt](params, grads, lr, partial(lr_schedules[lr_schedule], warmup=lr_warmup), n_updates_total, l2=l2, max_grad_norm=max_grad_norm, vector_l2=vector_l2, b1=b1, b2=b2, e=e)
-    return [train]+ops
-
-def mgpu_predict(*xs):
-    gpu_ops = []
-    xs = (tf.split(x, n_gpu, 0) for x in xs)
-    for i, xs in enumerate(zip(*xs)):
-        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=True):
-            clf_logits, clf_losses, lm_losses = model(*xs, train=False, reuse=True)
-            gpu_ops.append([clf_logits, clf_losses, lm_losses])
-    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
-    return ops
+# def mgpu_train(*xs):
+#     gpu_ops = []
+#     gpu_grads = []
+#     xs = (tf.split(x, n_gpu, 0) for x in xs)
+#     for i, xs in enumerate(zip(*xs)):
+#         do_reuse = True if i > 0 else None
+#         with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse):
+#             clf_logits, clf_losses, lm_losses = model(*xs, train=True, reuse=do_reuse)
+#             if lm_coef > 0:
+#                 train_loss = tf.reduce_mean(clf_losses) + lm_coef*tf.reduce_mean(lm_losses)
+#             else:
+#                 train_loss = tf.reduce_mean(clf_losses)
+#             params = find_trainable_variables("model")
+#             grads = tf.gradients(train_loss, params)
+#             grads = list(zip(grads, params))
+#             gpu_grads.append(grads)
+#             gpu_ops.append([clf_logits, clf_losses, lm_losses])
+#     ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
+#     grads = average_grads(gpu_grads)
+#     grads = [g for g, p in grads]
+#     train = opt_fns[opt](params, grads, lr, partial(lr_schedules[lr_schedule], warmup=lr_warmup), n_updates_total, l2=l2, max_grad_norm=max_grad_norm, vector_l2=vector_l2, b1=b1, b2=b2, e=e)
+#     return [train]+ops
+
+# def mgpu_predict(*xs):
+#     gpu_ops = []
+#     xs = (tf.split(x, n_gpu, 0) for x in xs)
+#     for i, xs in enumerate(zip(*xs)):
+#         with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=True):
+#             clf_logits, clf_losses, lm_losses = model(*xs, train=False, reuse=True)
+#             gpu_ops.append([clf_logits, clf_losses, lm_losses])
+#     ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
+#     return ops
 
 def transform_roc(X1, X2, X3):
     n_batch = len(X1)
@@ -247,6 +248,7 @@ def predict():
         +[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)]
         )+3, n_ctx
     )
+    vocab = n_vocab + n_special + n_ctx
     trX, trM = transform_roc(trX1, trX2, trX3)
    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
     if submit:
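
The vocab value added here sizes the embedding matrix built in model_py.py: BPE tokens, then the special tokens used to format each example, then one slot per context position, since position embeddings share the token embedding table (Model.forward sums self.embed(x) over the last axis, and the weight loading below concatenates token, special and position vectors into init_params[0]). Illustratively, with hypothetical sizes:

    # Hypothetical sizes, for illustration only; the real values come from the
    # text encoder and from the ROCStories length computation just above.
    n_vocab = 40478     # BPE vocabulary
    n_special = 3       # special formatting tokens
    n_ctx = 77          # longest clipped training example + 3

    vocab = n_vocab + n_special + n_ctx   # rows in the shared token+position embedding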
@@ -257,40 +259,39 @@ def predict():
     n_batch_train = n_batch*n_gpu
     n_updates_total = (n_train//n_batch_train)*n_iter
 
-    X_train = tf.placeholder(tf.int32, [n_batch_train, 2, n_ctx, 2])
-    M_train = tf.placeholder(tf.float32, [n_batch_train, 2, n_ctx])
-    X = tf.placeholder(tf.int32, [None, 2, n_ctx, 2])
-    M = tf.placeholder(tf.float32, [None, 2, n_ctx])
-
-    Y_train = tf.placeholder(tf.int32, [n_batch_train])
-    Y = tf.placeholder(tf.int32, [None])
-
-    train, logits, clf_losses, lm_losses = mgpu_train(X_train, M_train, Y_train)
-    clf_loss = tf.reduce_mean(clf_losses)
-
-    params = find_trainable_variables('model')
-    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
-    sess.run(tf.global_variables_initializer())
+    model = Model(vocab, cfg)
+    # TODO Initialize model
 
+    # Load weights from TF model
     shapes = json.load(open('model/params_shapes.json'))
+    names = json.load(open('model/parameters_names.json'))
     offsets = np.cumsum([np.prod(shape) for shape in shapes])
     init_params = [np.load('model/params_{}.npy'.format(n)) for n in range(10)]
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
     init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
     init_params[0] = init_params[0][:n_ctx]
     init_params[0] = np.concatenate([init_params[1], (np.random.randn(n_special, n_embd)*0.02).astype(np.float32), init_params[0]], 0)
     del init_params[1]
-
     if n_transfer == -1:
         n_transfer = 0
     else:
         n_transfer = 1+n_transfer*12
-    sess.run([p.assign(ip) for p, ip in zip(params[:n_transfer], init_params[:n_transfer])])
-
-    eval_mgpu_logits, eval_mgpu_clf_losses, eval_mgpu_lm_losses = mgpu_predict(X_train, M_train, Y_train)
-    eval_logits, eval_clf_losses, eval_lm_losses = model(X, M, Y, train=False, reuse=True)
-    eval_clf_loss = tf.reduce_mean(eval_clf_losses)
-    eval_mgpu_clf_loss = tf.reduce_mean(eval_mgpu_clf_losses)
+    assert model.embed.weight.shape == init_params[0].shape
+    model.embed.weight = init_params[0]
+    for name, ip in zip(names[1:n_transfer], init_params[1:n_transfer]):
+        name = name[6:]  # skip "model/"
+        assert name[-2:] == ":0"
+        name = name[:-2]
+        name = name.split('/')
+        pointer = model
+        for m_name in name:
+            l = re.split('(\d+)', m_name)
+            pointer = getattr(pointer, l[0])
+            if len(l) == 1:
+                num = int(l[1])
+                pointer = pointer[num]
+        assert pointer.shape == ip.shape
+        pointer = ip
 
     n_updates = 0
     n_epochs = 0
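
The transfer loop added above walks each TensorFlow variable name down the PyTorch module tree, using re.split to separate attribute names from integer indices (for example the block index in h0). A minimal stand-alone walk-through for one hypothetical name of the shape stored in model/parameters_names.json; note that re.split(r'(\d+)', ...) returns more than one element whenever a digit is present, and that is when the indexing step applies:

    import re

    name = "model/h0/attn/c_attn/w:0"   # hypothetical example entry

    name = name[6:]                     # drop the "model/" prefix -> "h0/attn/c_attn/w:0"
    assert name[-2:] == ":0"
    name = name[:-2]                    # drop the ":0" suffix     -> "h0/attn/c_attn/w"

    for m_name in name.split('/'):      # ["h0", "attn", "c_attn", "w"]
        l = re.split(r'(\d+)', m_name)
        print(l)
        # "h0"   -> ['h', '0', '']  : attribute "h", then integer index 0
        # "attn" -> ['attn']        : plain attribute access, no index
        # The attribute walk is then: pointer = getattr(pointer, l[0]),
        # followed by pointer = pointer[int(l[1])] when a digit was split off.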
