added openAIAdam optimizer

thomwolf · thomwolf · commit 0704c84ab97f · 2018-06-14T13:19:07.000+02:00
diff --git a/model_py.py b/model_py.py
@@ -6,7 +6,6 @@
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from torch.nn.parameter import Parameter
 
 def gelu(x):
diff --git a/opt.py b/opt.py
@@ -1,23 +1,104 @@
 import math
-import numpy as np
+import torch
+from torch.optim import Optimizer
+from torch.nn.utils import clip_grad_norm
 
 def warmup_cosine(x, warmup=0.002):
-    pass
+    # Linear ramp to 1 during warmup, then cosine decay; math.cos because x is a plain float.
+    return x/warmup if x <= warmup else 0.5 * (1 + math.cos(math.pi * x))
 
 def warmup_constant(x, warmup=0.002):
-    pass
+    # Linear ramp to 1 during warmup, then constant; x/warmup is skipped once past warmup (warmup=0 safe).
+    return x/warmup if x <= warmup else 1.0
 
 def warmup_linear(x, warmup=0.002):
-    pass
+    # Linear ramp during warmup, multiplied by a linear decay (1-x); x/warmup skipped past warmup.
+    return (x/warmup if x <= warmup else 1.0)*(1-x)
 
-schedules = {
+SCHEDULES = {  # schedule name -> function(x, warmup) whose result multiplies the base lr
     'warmup_cosine':warmup_cosine,
     'warmup_constant':warmup_constant,
     'warmup_linear':warmup_linear,
 }
 
-def adam(params, grads, lr, schedule, t_total, b1=0.9, b2=0.999, e=1e-8, l2=0, vector_l2=False, max_grad_norm=-1, **kwargs):
-    """
-    adam with weight decay fix
+
+class OpenAIAdam(Optimizer):
+    """Implements Open AI version of Adam algorithm with weight decay fix.
     """
-    pass
+    def __init__(self, params, lr, schedule, warmup, t_total,
+                 b1=0.9, b2=0.999, e=1e-8, l2=0,
+                 vector_l2=False, max_grad_norm=-1, **kwargs):  # extra keyword arguments are accepted and ignored
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if schedule not in SCHEDULES:  # schedule is a key of SCHEDULES, not a callable
+            raise ValueError("Invalid schedule parameter: {}".format(schedule))
+        if not 0 <= warmup:  # warmup == 0 passes this check
+            raise ValueError("Invalid warmup: {}".format(warmup))
+        if not 0.0 <= b1 < 1.0:
+            raise ValueError("Invalid b1 parameter: {}".format(b1))
+        if not 0.0 <= b2 < 1.0:
+            raise ValueError("Invalid b2 parameter: {}".format(b2))
+        if not 0.0 <= e:
+            raise ValueError("Invalid epsilon value: {}".format(e))
+        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,  # per-group settings read back in step()
+                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
+                        max_grad_norm=max_grad_norm)
+        super(OpenAIAdam, self).__init__(params, defaults)
+
+    def step(self, closure=None):
+        """Performs a single optimization step (Adam update, then decoupled weight decay).
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss. Returns that loss, or None without a closure.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.is_sparse:
+                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p.data)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['b1'], group['b2']
+
+                state['step'] += 1
+
+                # Add grad clipping (called on this single parameter, not on all parameters at once)
+                if group['max_grad_norm'] > 0:
+                    clip_grad_norm(p, group['max_grad_norm'])
+
+                # Decay the first and second moment running average coefficient
+                exp_avg.mul_(beta1).add_(1 - beta1, grad)
+                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+                denom = exp_avg_sq.sqrt().add_(group['e'])  # epsilon is stored under 'e' in the defaults
+
+                bias_correction1 = 1 - beta1 ** state['step']
+                bias_correction2 = 1 - beta2 ** state['step']
+
+                schedule_fct = SCHEDULES[group['schedule']]
+                lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
+
+                p.data.addcdiv_(-step_size, exp_avg, denom)
+
+                # Add weight decay at the end (fixed version); 1-D params are skipped unless vector_l2 is set
+                if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
+                    p.data.add_(-lr_scheduled * group['l2'], p.data)
+
+        return loss
diff --git a/text_utils.py b/text_utils.py
@@ -27,9 +27,9 @@ def text_standardize(text):
     text = text.replace('―', '-')
     text = text.replace('…', '...')
     text = text.replace('´', "'")
-    text = re.sub('''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
-    text = re.sub('\s*\n\s*', ' \n ', text)
-    text = re.sub('[^\S\n]+', ' ', text)
+    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
+    text = re.sub(r'\s*\n\s*', ' \n ', text)
+    text = re.sub(r'[^\S\n]+', ' ', text)
     return text.strip()
 
 class TextEncoder(object):
diff --git a/train.py b/train.py
@@ -18,29 +18,20 @@
 from sklearn.metrics import accuracy_score
 
 from model_py import Model, LMHead, ClfHead, load_openai_pretrained_model
-from opt import adam, warmup_cosine, warmup_linear, warmup_constant
+from opt import OpenAIAdam
 from datasets import rocstories
 from analysis import rocstories as rocstories_analysis
 from text_utils import TextEncoder
 from utils import (encode_dataset, flatten, iter_data,
                    ResultLogger, make_path)
 
-OPT_FNS = {
-    'adam':adam,
-}
-
-LR_SCHEDULES = {
-    'warmup_cosine':warmup_cosine,
-    'warmup_linear':warmup_linear,
-    'warmup_constant':warmup_constant,
-}
-
 class LossCompute:
     "A Loss compute and train function."
-    def __init__(self, lm_criterion, clf_criterion, lm_coef):
+    def __init__(self, lm_criterion, clf_criterion, lm_coef, opt=None):
         self.lm_criterion = lm_criterion
         self.clf_criterion = clf_criterion
         self.lm_coef = lm_coef
+        self.opt = opt  # optional optimizer; __call__ steps it after backward() when it is not None
 
     def __call__(self, X, Y, M, lm_logits, clf_logits):
         # Language modeling loss
@@ -53,11 +44,18 @@ def __call__(self, X, Y, M, lm_logits, clf_logits):
 
         # Classification loss
         clf_losses = self.clf_criterion(clf_logits, Y)
+
         if self.lm_coef > 0:
             train_loss = clf_losses.sum() + self.lm_coef * lm_losses.sum()
         else:
             train_loss = clf_losses.sum()
-        return train_loss
+
+        train_loss.backward()
+        if self.opt is not None:
+            self.opt.step()
+            self.opt.zero_grad()  # opt is the optimizer itself; it has no `.optimizer` wrapper attribute
+        return train_loss.item()
+
 
 def transform_roc(X1, X2, X3):
     n_batch = len(X1)
@@ -229,7 +227,14 @@ def transform_roc(X1, X2, X3):
     model = Model(vocab, args)
     lm_head = LMHead(model, args)
     clf_head = ClfHead(clf_token, args)
-    compute_loss = LossCompute(nn.CrossEntropyLoss(reduce=False), nn.CrossEntropyLoss(reduce=False), lm_coef) # TODO check loss functions
+
+    criterion = nn.CrossEntropyLoss(reduce=False) # TODO check loss functions
+    model_opt = OpenAIAdam(model.parameters(), lr=lr, schedule=lr_schedule,
+                            warmup=lr_warmup, t_total=n_updates_total, b1=b1,
+                            b2=b2, e=e, l2=l2, vector_l2=vector_l2,
+                            max_grad_norm=max_grad_norm)
+
+    compute_loss = LossCompute(criterion, criterion, lm_coef, model_opt)
     # TODO Initialize model (?)
     # TODO add train() and eval()
     load_openai_pretrained_model(model, n_ctx, n_special, args)
@@ -258,8 +263,6 @@ def transform_roc(X1, X2, X3):
             lm_logits = lm_head(h)
             clf_logits = clf_head(h, XMB)
             loss = compute_loss(XMB, YMB, MMB, lm_logits, clf_logits)
-            loss.backward()
-            
             n_updates += 1
             #if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
                 # log()