 
 import torch
 from torch.utils.data import DataLoader
-from torch.autograd import Variable
-from torch.nn.parameter import Parameter
 
 import torch.distributed as dist
 from torch.utils.data.distributed import DistributedSampler
 import dllogger as DLLogger
 from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
 
-from scipy.io.wavfile import write as write_wav
-
 
 def parse_args(parser):
     """
@@ -161,11 +157,11 @@ def parse_args(parser):
 
 def reduce_tensor(tensor, num_gpus):
     rt = tensor.clone()
-    dist.all_reduce(rt, op=dist.reduce_op.SUM)
+    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
     if rt.is_floating_point():
         rt = rt / num_gpus
     else:
-        rt = rt // num_gpus
+        rt = torch.div(rt, num_gpus, rounding_mode='floor')
     return rt
 
 
@@ -184,8 +180,8 @@ def init_distributed(args, world_size, rank, group_name):
     print("Done initializing distributed")
 
 
-def save_checkpoint(model, optimizer, epoch, config, amp_run, output_dir, model_name,
-                    local_rank, world_size):
+def save_checkpoint(model, optimizer, scaler, epoch, config, output_dir,
+                    model_name, local_rank, world_size):
 
     random_rng_state = torch.random.get_rng_state().cuda()
     cuda_rng_state = torch.cuda.get_rng_state(local_rank).cuda()
@@ -209,7 +205,8 @@ def save_checkpoint(model, optimizer, epoch, config, amp_run, output_dir, model_
                   'random_rng_states_all': random_rng_states_all,
                   'config': config,
                   'state_dict': model.state_dict(),
-                  'optimizer': optimizer.state_dict()}
+                  'optimizer': optimizer.state_dict(),
+                  'scaler': scaler.state_dict()}
 
     checkpoint_filename = "checkpoint_{}_{}.pt".format(model_name, epoch)
     checkpoint_path = os.path.join(output_dir, checkpoint_filename)
@@ -237,7 +234,7 @@ def get_last_checkpoint_filename(output_dir, model_name):
         return ""
 
 
-def load_checkpoint(model, optimizer, epoch, config, amp_run, filepath, local_rank):
+def load_checkpoint(model, optimizer, scaler, epoch, filepath, local_rank):
 
     checkpoint = torch.load(filepath, map_location='cpu')
 
@@ -250,9 +247,10 @@ def load_checkpoint(model, optimizer, epoch, config, amp_run, filepath, local_ra
         torch.random.set_rng_state(checkpoint['random_rng_state'])
     else:
         raise Exception("Model checkpoint must have either 'random_rng_state' or 'random_rng_states_all' key.")
-    config = checkpoint['config']
     model.load_state_dict(checkpoint['state_dict'])
     optimizer.load_state_dict(checkpoint['optimizer'])
+    scaler.load_state_dict(checkpoint['scaler'])
+    return checkpoint['config']
 
 
 # adapted from: https://discuss.pytorch.org/t/opinion-eval-should-be-a-context-manager/18998/3
@@ -271,7 +269,7 @@ def evaluating(model):
 
 
 def validate(model, criterion, valset, epoch, batch_iter, batch_size,
-             world_size, collate_fn, distributed_run, rank, batch_to_gpu):
+             world_size, collate_fn, distributed_run, rank, batch_to_gpu, amp_run):
     """Handles all the validation scoring and printing"""
     with evaluating(model), torch.no_grad():
         val_sampler = DistributedSampler(valset) if distributed_run else None
@@ -288,8 +286,11 @@ def validate(model, criterion, valset, epoch, batch_iter, batch_size,
             iter_start_time = time.perf_counter()
 
             x, y, num_items = batch_to_gpu(batch)
-            y_pred = model(x)
-            loss = criterion(y_pred, y)
+            #AMP upstream autocast
+            with torch.cuda.amp.autocast(enabled=amp_run):
+                y_pred = model(x)
+                loss = criterion(y_pred, y)
+
             if distributed_run:
                 reduced_val_loss = reduce_tensor(loss.data, world_size).item()
                 reduced_num_items = reduce_tensor(num_items.data, 1).item()
@@ -398,9 +399,9 @@ def main():
     if args.resume_from_last:
         args.checkpoint_path = get_last_checkpoint_filename(args.output, model_name)
 
-    if args.checkpoint_path is not "":
-        load_checkpoint(model, optimizer, start_epoch, model_config,
-                        args.amp, args.checkpoint_path, local_rank)
+    if args.checkpoint_path != "":
+        model_config = load_checkpoint(model, optimizer, scaler, start_epoch,
+                                       args.checkpoint_path, local_rank)
 
     start_epoch = start_epoch[0]
 
@@ -450,9 +451,6 @@ def main():
         num_iters = 0
         reduced_loss = 0
 
-        # if overflow at the last iteration then do not save checkpoint
-        overflow = False
-
         if distributed_run:
             train_loader.sampler.set_epoch(epoch)
 
@@ -492,13 +490,13 @@ def main():
             if args.amp:
                 scaler.scale(loss).backward()
                 scaler.unscale_(optimizer)
-                grad_norm = torch.nn.utils.clip_grad_norm_(
+                torch.nn.utils.clip_grad_norm_(
                     model.parameters(), args.grad_clip_thresh)
                 scaler.step(optimizer)
                 scaler.update()
             else:
                 loss.backward()
-                grad_norm = torch.nn.utils.clip_grad_norm_(
+                torch.nn.utils.clip_grad_norm_(
                     model.parameters(), args.grad_clip_thresh)
                 optimizer.step()
 
@@ -527,12 +525,12 @@ def main():
                  iteration, args.batch_size,
                  world_size, collate_fn,
                  distributed_run, local_rank,
-                 batch_to_gpu)
+                 batch_to_gpu,
+                 args.amp)
 
         if (epoch % args.epochs_per_checkpoint == 0) and args.bench_class == "":
-            save_checkpoint(model, optimizer, epoch, model_config,
-                            args.amp, args.output, args.model_name,
-                            local_rank, world_size)
+            save_checkpoint(model, optimizer, scaler, epoch, model_config,
+                            args.output, args.model_name, local_rank, world_size)
         if local_rank == 0:
             DLLogger.flush()
 
@@ -548,5 +546,6 @@ def main():
     if local_rank == 0:
         DLLogger.flush()
 
+
 if __name__ == '__main__':
     main()
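
For reference, the diff replaces the old apex-style AMP plumbing (the `amp_run` checkpoint arguments) with native `torch.cuda.amp`: the forward pass runs under `autocast`, gradients are handled by a `GradScaler`, and the scaler state is saved and restored alongside the model and optimizer. Below is a minimal, self-contained sketch of that pattern; the linear model, SGD optimizer, loss, and `checkpoint.pt` file name are placeholders for illustration, not pieces of this script.

import torch

# Placeholder model/optimizer, only to show the AMP train/checkpoint pattern.
model = torch.nn.Linear(80, 80).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler(enabled=True)

def train_step(x, y):
    optimizer.zero_grad()
    # Forward pass under autocast, as in validate() and the training loop above.
    with torch.cuda.amp.autocast(enabled=True):
        y_pred = model(x)
        loss = torch.nn.functional.mse_loss(y_pred, y)
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)                       # unscale before gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)                           # skips the step on inf/NaN gradients
    scaler.update()

# Checkpointing: the scaler state travels with the model and optimizer state,
# mirroring save_checkpoint()/load_checkpoint() in the diff above.
torch.save({'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scaler': scaler.state_dict()}, 'checkpoint.pt')

ckpt = torch.load('checkpoint.pt', map_location='cpu')
model.load_state_dict(ckpt['state_dict'])
optimizer.load_state_dict(ckpt['optimizer'])
scaler.load_state_dict(ckpt['scaler'])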