Commit 64d3aee

[upd] simplify codes & add more logs

1 parent 8198567 · commit 64d3aee

5 files changed: +89 -88 lines (README.md, dist.py, train.py, trainer.py, utils/arg_util.py)

README.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -105,7 +105,7 @@ torchrun --nproc_per_node=8 --nnodes=... --node_rank=... --master_addr=... --mas
   --depth=30 --bs=1024 --ep=350 --fp16=1 --alng=1e-5 --wpe=0.01
 ```
 A folder named `local_output` will be created to save the checkpoints and logs.
-You can monitor the training process by checking the logs in `local_output/stdout.txt`, or using `tensorboard --logdir=local_output/`.
+You can monitor the training process by checking the logs in `local_output/log.txt` and `local_output/stdout.txt`, or using `tensorboard --logdir=local_output/`.
 
 If your experiment is interrupted, just rerun the command, and the training will **automatically resume** from the last checkpoint in `local_output/ckpt*.pth` (see [utils/misc.py#L344-L357](utils/misc.py#L344-L357)).
 
````
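Note: the `log.txt` referenced above is introduced by this commit and written by `Args.dump_log` (see `utils/arg_util.py` below) as a multi-line JSON header followed by one Python-dict line per epoch. A minimal reader sketch, assuming exactly that format (the helper name `read_log` is ours, not part of the repo):

```python
import ast

def read_log(path: str = 'local_output/log.txt'):
    """Collect the per-epoch dict lines; the JSON header (double-quoted keys) is skipped."""
    records = []
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            if line.startswith("{'"):  # dump_log writes each epoch as repr(dict)
                records.append(ast.literal_eval(line))
    return records

# e.g.: records = read_log(); print(records[-1]['ep'], records[-1]['L_mean'])
```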

dist.py

Lines changed: 1 addition & 5 deletions

```diff
@@ -83,10 +83,6 @@ def is_local_master():
     return __local_rank == 0
 
 
-def is_visualizer():
-    return __rank == 0
-
-
 def new_group(ranks: List[int]):
     if __initialized:
         return tdist.new_group(ranks=ranks)
@@ -201,7 +197,7 @@ def wrapper(*args, **kwargs):
 def for_visualize(func):
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
-        if is_visualizer():
+        if is_master():
             # with torch.no_grad():
             ret = func(*args, **kwargs)
         else:
```
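The deleted `is_visualizer()` tested `__rank == 0`, which is exactly what `is_master()` already does, so `for_visualize` now reuses the surviving helper. Read in isolation, the decorator amounts to the minimal sketch below (the environment-based `is_master` stand-in and the `None` return for non-master ranks are our assumptions; the original's `else` branch is cut off in this hunk):

```python
import functools
import os

def is_master() -> bool:
    # Stand-in for dist.is_master(); the real helper checks the torch.distributed global rank.
    return int(os.environ.get('RANK', '0')) == 0

def for_visualize(func):
    """Run the wrapped visualization function on the master rank only."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs) if is_master() else None  # non-master ranks skip the call
    return wrapper
```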

train.py

Lines changed: 41 additions & 56 deletions

```diff
@@ -1,28 +1,27 @@
 import gc
 import os
-import random
+import shutil
 import sys
 import time
 import warnings
 from functools import partial
 
-import numpy as np
 import torch
 from torch.utils.data import DataLoader
 
 import dist
-from utils.misc import auto_resume
 from utils import arg_util, misc
 from utils.data import build_dataset
 from utils.data_sampler import DistInfiniteBatchSampler, EvalDistributedSampler
+from utils.misc import auto_resume
 
 
 def build_everything(args: arg_util.Args):
     # resume
     auto_resume_info, start_ep, start_it, trainer_state, args_state = auto_resume(args, 'ar-ckpt*.pth')
     # create tensorboard logger
     tb_lg: misc.TensorboardLogger
-    with_tb_lg = dist.is_visualizer()
+    with_tb_lg = dist.is_master()
     if with_tb_lg:
         os.makedirs(args.tb_log_dir_path, exist_ok=True)
         # noinspection PyTypeChecker
@@ -130,7 +129,7 @@ def build_everything(args: arg_util.Args):
 
     # build trainer
     trainer = VARTrainer(
-        is_visualizer=dist.is_visualizer(), device=args.device, patch_nums=args.patch_nums, resos=args.resos,
+        device=args.device, patch_nums=args.patch_nums, resos=args.resos,
         vae_local=vae_local, var_wo_ddp=var_wo_ddp, var=var,
         var_opt=var_optim, label_smooth=args.ls,
     )
@@ -157,7 +156,7 @@ def build_everything(args: arg_util.Args):
         )
         print({k: meter.global_avg for k, meter in me.meters.items()})
 
-        tb_lg.flush(); tb_lg.close()
+        args.dump_log(); tb_lg.flush(); tb_lg.close()
         if isinstance(sys.stdout, misc.SyncPrint) and isinstance(sys.stderr, misc.SyncPrint):
             sys.stdout.close(), sys.stderr.close()
         exit(0)
@@ -169,7 +168,7 @@ def build_everything(args: arg_util.Args):
     )
 
 
-def main():
+def main_training():
     args: arg_util.Args = arg_util.init_dist_and_get_args()
     if args.local_debug:
         torch.autograd.set_detect_anomaly(True)
@@ -181,9 +180,9 @@ def main():
     ) = build_everything(args)
 
     # train
-    start_time, min_L_mean, min_L_tail, max_acc_mean, max_acc_tail = time.time(), 999., 999., -1., -1.
-    last_val_loss_mean, best_val_loss_mean, last_val_acc_mean, best_val_acc_mean = 999, 999, 0, 0
-    last_val_loss_tail, best_val_loss_tail, last_val_acc_tail, best_val_acc_tail = 999, 999, 0, 0
+    start_time = time.time()
+    best_L_mean, best_L_tail, best_acc_mean, best_acc_tail = 999., 999., -1., -1.
+    best_val_loss_mean, best_val_loss_tail, best_val_acc_mean, best_val_acc_tail = 999, 999, -1, -1
 
     L_mean, L_tail = -1, -1
     for ep in range(start_ep, args.ep):
@@ -199,49 +198,46 @@ def main():
         )
 
         L_mean, L_tail, acc_mean, acc_tail, grad_norm = stats['Lm'], stats['Lt'], stats['Accm'], stats['Acct'], stats['tnm']
-        min_L_mean, max_acc_mean, max_acc_tail = min(min_L_mean, L_mean), max(max_acc_mean, acc_mean), max(max_acc_tail, acc_tail)
-        if L_tail != -1:
-            min_L_tail = min(min_L_tail, L_tail)
-        args.min_L_mean, args.min_L_tail, args.max_acc_mean, args.max_acc_tail, args.grad_norm = min_L_mean, min_L_tail, (None if max_acc_mean < 0 else max_acc_mean), (None if max_acc_tail < 0 else max_acc_tail), grad_norm
+        best_L_mean, best_acc_mean = min(best_L_mean, L_mean), max(best_acc_mean, acc_mean)
+        if L_tail != -1: best_L_tail, best_acc_tail = min(best_L_tail, L_tail), max(best_acc_tail, acc_tail)
+        args.L_mean, args.L_tail, args.acc_mean, args.acc_tail, args.grad_norm = L_mean, L_tail, acc_mean, acc_tail, grad_norm
         args.cur_ep = f'{ep+1}/{args.ep}'
         args.remain_time, args.finish_time = remain_time, finish_time
 
-        AR_ep_loss = {}
+        AR_ep_loss = dict(L_mean=L_mean, L_tail=L_tail, acc_mean=acc_mean, acc_tail=acc_tail)
        is_val_and_also_saving = (ep + 1) % 10 == 0 or (ep + 1) == args.ep
         if is_val_and_also_saving:
-            last_val_loss_mean, last_val_loss_tail, last_val_acc_mean, last_val_acc_tail, tot, cost = trainer.eval_ep(ld_val)
-            best_val_loss_mean, best_val_loss_tail = min(best_val_loss_mean, last_val_loss_mean), min(best_val_loss_tail, last_val_loss_tail)
-            best_val_acc_mean, best_val_acc_tail = max(best_val_acc_mean, last_val_acc_mean), max(best_val_acc_tail, last_val_acc_tail)
-            AR_ep_loss['vL_mean'], AR_ep_loss['vL_tail'], AR_ep_loss['vacc_mean'], AR_ep_loss['vacc_tail'] = last_val_loss_mean, last_val_loss_tail, last_val_acc_mean, last_val_acc_tail
+            val_loss_mean, val_loss_tail, val_acc_mean, val_acc_tail, tot, cost = trainer.eval_ep(ld_val)
+            best_updated = best_val_loss_tail > val_loss_tail
+            best_val_loss_mean, best_val_loss_tail = min(best_val_loss_mean, val_loss_mean), min(best_val_loss_tail, val_loss_tail)
+            best_val_acc_mean, best_val_acc_tail = max(best_val_acc_mean, val_acc_mean), max(best_val_acc_tail, val_acc_tail)
+            AR_ep_loss.update(vL_mean=val_loss_mean, vL_tail=val_loss_tail, vacc_mean=val_acc_mean, vacc_tail=val_acc_tail)
+            args.vL_mean, args.vL_tail, args.vacc_mean, args.vacc_tail = val_loss_mean, val_loss_tail, val_acc_mean, val_acc_tail
             print(f' [*] [ep{ep}] (val {tot}) Lm: {L_mean:.4f}, Lt: {L_tail:.4f}, Acc m&t: {acc_mean:.2f} {acc_tail:.2f}, Val cost: {cost:.2f}s')
+
+            if dist.is_local_master():
+                local_out_ckpt = os.path.join(args.local_out_dir_path, 'ar-ckpt-last.pth')
+                local_out_ckpt_best = os.path.join(args.local_out_dir_path, 'ar-ckpt-best.pth')
+                print(f'[saving ckpt] ...', end='', flush=True)
+                torch.save({
+                    'epoch': ep+1,
+                    'iter': 0,
+                    'trainer': trainer.state_dict(),
+                    'args': args.state_dict(),
+                }, local_out_ckpt)
+                if best_updated:
+                    shutil.copy(local_out_ckpt, local_out_ckpt_best)
+                print(f' [saving ckpt](*) finished! @ {local_out_ckpt}', flush=True, clean=True)
+            dist.barrier()
 
-        print( f' [ep{ep}] (training ) Lm: {min_L_mean:.3f} ({L_mean:.3f}), Lt: {min_L_tail:.3f} ({L_tail:.3f}), Acc m&t: {max_acc_mean:.2f} {max_acc_tail:.2f}, Remain: {remain_time}, Finish: {finish_time}', flush=True)
-        if ep > args.ep // 20 and min_L_tail < 99:
-            tb_lg.update(head='AR_y_result', step=ep+1, min_L_mean=min_L_mean, min_L_tail=min_L_tail, max_acc_mean=max_acc_mean, max_acc_tail=max_acc_tail)
-
-        AR_ep_loss['L_mean'], AR_ep_loss['L_tail'], AR_ep_loss['acc_mean'], AR_ep_loss['acc_tail'] = L_mean, L_tail, acc_mean, acc_tail
+        print( f' [ep{ep}] (training ) Lm: {best_L_mean:.3f} ({L_mean:.3f}), Lt: {best_L_tail:.3f} ({L_tail:.3f}), Acc m&t: {best_acc_mean:.2f} {best_acc_tail:.2f}, Remain: {remain_time}, Finish: {finish_time}', flush=True)
         tb_lg.update(head='AR_ep_loss', step=ep+1, **AR_ep_loss)
         tb_lg.update(head='AR_z_burnout', step=ep+1, rest_hours=round(sec / 60 / 60, 2))
-
-        if is_val_and_also_saving and dist.is_master():
-            local_out_ckpt = os.path.join(args.local_out_dir_path, 'ar-ckpt-last.pth')
-            torch.save({
-                'epoch': ep+1,
-                'iter': 0,
-                'trainer': trainer.state_dict(),
-                'args': args.state_dict(),
-            }, local_out_ckpt)
-
-        tb_lg.flush()
-        dist.barrier()
-
-    tb_lg.update(head='AR_y_result_final', step=start_ep, min_L_mean=min_L_mean, min_L_tail=min_L_tail, max_acc_mean=max_acc_mean, max_acc_tail=max_acc_tail)
-    tb_lg.update(head='AR_y_result_final', step=args.ep, min_L_mean=min_L_mean, min_L_tail=min_L_tail, max_acc_mean=max_acc_mean, max_acc_tail=max_acc_tail)
-    tb_lg.flush()
+        args.dump_log(); tb_lg.flush()
 
     total_time = f'{(time.time() - start_time) / 60 / 60:.1f}h'
     print('\n\n')
-    print(f' [*] [PT finished] Total Time: {total_time}, Lm: {min_L_mean:.3f} ({L_mean}), Lt: {min_L_tail:.3f} ({L_tail})')
+    print(f' [*] [PT finished] Total cost: {total_time}, Lm: {best_L_mean:.3f} ({L_mean}), Lt: {best_L_tail:.3f} ({L_tail})')
     print('\n\n')
 
     del stats
@@ -250,7 +246,7 @@ def main():
 
     args.remain_time, args.finish_time = '-', time.strftime("%Y-%m-%d %H:%M", time.localtime(time.time() - 60))
     print(f'final args:\n\n{str(args)}')
-    tb_lg.flush(); tb_lg.close()
+    args.dump_log(); tb_lg.flush(); tb_lg.close()
     dist.barrier()
 
 
@@ -285,6 +281,7 @@ def train_one_ep(ep: int, is_first_ep: bool, start_it: int, args: arg_util.Args,
 
     wp_it = args.wp * iters_train
     min_tlr, max_tlr, min_twd, max_twd = lr_wd_annealing(args.sche, trainer.var_opt.optimizer, args.tlr, args.twd, args.twde, g_it, wp_it, max_it, wp0=args.wp0, wpe=args.wpe)
+    args.cur_lr, args.cur_wd = max_tlr, max_twd
 
     if args.pg: # default: 0.0, no progressive training, won't get into this
         if g_it <= wp_it: prog_si = args.pg0
@@ -310,8 +307,7 @@ def train_one_ep(ep: int, is_first_ep: bool, start_it: int, args: arg_util.Args,
            tb_lg.update(head='AR_opt_lr/lr_max', sche_tlr=max_tlr)
            tb_lg.update(head='AR_opt_wd/wd_max', sche_twd=max_twd)
            tb_lg.update(head='AR_opt_wd/wd_min', sche_twd=min_twd)
-           if scale_log2 is not None:
-               tb_lg.update(head='AR_opt_grad/fp16', scale_log2=scale_log2)
+           tb_lg.update(head='AR_opt_grad/fp16', scale_log2=scale_log2)
 
            if args.tclip > 0:
                tb_lg.update(head='AR_opt_grad/grad', grad_norm=grad_norm)
@@ -335,18 +331,7 @@ def forward(self, *args, **kwargs):
 
 
 if __name__ == '__main__':
-    try:
-        main()
-    except Exception as err:
-        time.sleep(dist.get_rank() * 1 + random.random() * 0.5)
-        try:
-            # noinspection PyArgumentList
-            print(f'[rk{dist.get_rank():2d}] {type(err).__name__}', flush=True, force=True)
-        except:
-            try: print(f'[rk{dist.get_rank():2d}] {type(err).__name__}', flush=True)
-            except: pass
-        if dist.is_master(): print(f'[err]:\n{err}')
-        raise err
+    try: main_training()
     finally:
         dist.finalize()
         if isinstance(sys.stdout, misc.SyncPrint) and isinstance(sys.stderr, misc.SyncPrint):
```
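The checkpoint logic is the substantive change in this file: saving moves from the global master into `dist.is_local_master()` (one write per node), and a second file `ar-ckpt-best.pth` now tracks the best tail validation loss, with `best_updated` computed against the running best before the `min(...)` refresh. A distilled sketch of that keep-last-plus-copy-best pattern (the function name and `state` payload are placeholders, not repo code):

```python
import os
import shutil
import torch

def save_last_and_best(out_dir: str, state: dict, val_loss_tail: float, best_val_loss_tail: float) -> float:
    """Overwrite the 'last' checkpoint every time; copy it to 'best' when the tail val loss improves."""
    last_ckpt = os.path.join(out_dir, 'ar-ckpt-last.pth')
    best_ckpt = os.path.join(out_dir, 'ar-ckpt-best.pth')
    best_updated = val_loss_tail < best_val_loss_tail  # compare before refreshing the running best
    torch.save(state, last_ckpt)
    if best_updated:
        shutil.copy(last_ckpt, best_ckpt)  # cheap: duplicates the file just written
    return min(best_val_loss_tail, val_loss_tail)
```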

trainer.py

Lines changed: 14 additions & 22 deletions

```diff
@@ -19,7 +19,7 @@
 
 class VARTrainer(object):
     def __init__(
-        self, is_visualizer: bool, device, patch_nums: Tuple[int, ...], resos: Tuple[int, ...],
+        self, is_master: bool, device, patch_nums: Tuple[int, ...], resos: Tuple[int, ...],
         vae_local: VQVAE, var_wo_ddp: VAR, var: DDP,
         var_opt: AmpOptimizer, label_smooth: float,
     ):
@@ -30,8 +30,6 @@ def __init__(
         self.var_wo_ddp: VAR = var_wo_ddp   # after torch.compile
         self.var_opt = var_opt
 
-        self.is_visualizer = is_visualizer
-
         del self.var_wo_ddp.rng
         self.var_wo_ddp.rng = torch.Generator(device=device)
 
@@ -112,12 +110,12 @@ def train_step(
             self.var_wo_ddp.forward
             logits_BLV = self.var(label_B, x_BLCv_wo_first_l)
             loss = self.train_loss(logits_BLV.view(-1, V), gt_BL.view(-1)).view(B, -1)
-            if prog_si >= 0:
+            if prog_si >= 0:    # in progressive training
                 bg, ed = self.begin_ends[prog_si]
                 assert logits_BLV.shape[1] == gt_BL.shape[1] == ed
                 lw = self.loss_weight[:, :ed].clone()
                 lw[:, bg:ed] *= min(max(prog_wp, 0), 1)
-            else:
+            else:               # not in progressive training
                 lw = self.loss_weight
             loss = loss.mul(lw).sum(dim=-1).mean()
 
@@ -126,33 +124,27 @@ def train_step(
 
         # log
         pred_BL = logits_BLV.data.argmax(dim=-1)
-        if it in metric_lg.log_iters:
+        if it == 0 or it in metric_lg.log_iters:
             Lmean = self.val_loss(logits_BLV.data.view(-1, V), gt_BL.view(-1)).item()
             acc_mean = (pred_BL == gt_BL).float().mean().item() * 100
-            if prog_si < 0:
+            if prog_si >= 0:    # in progressive training
+                Ltail = acc_tail = -1
+            else:               # not in progressive training
                 Ltail = self.val_loss(logits_BLV.data[:, -self.last_l:].reshape(-1, V), gt_BL[:, -self.last_l:].reshape(-1)).item()
                 acc_tail = (pred_BL[:, -self.last_l:] == gt_BL[:, -self.last_l:]).float().mean().item() * 100
-            else:
-                Ltail = acc_tail = -1
             grad_norm = grad_norm.item()
             metric_lg.update(Lm=Lmean, Lt=Ltail, Accm=acc_mean, Acct=acc_tail, tnm=grad_norm)
 
+        # log to tensorboard
         if g_it == 0 or (g_it + 1) % 500 == 0:
-            if g_it == 0:
-                prob_per_class_is_chosen = gt_BL.view(-1).bincount(minlength=V).float()
-                dist.allreduce(prob_per_class_is_chosen)
-                if self.is_visualizer:
-                    prob_per_class_is_chosen /= prob_per_class_is_chosen.sum()
-                    cluster_usage = (prob_per_class_is_chosen > 0.001 / V).float().mean().item() * 100
-                    tb_lg.update(head='AR_iter_loss', z_voc_usage=cluster_usage, step=-10000)
-                    tb_lg.update(head='AR_iter_loss', z_voc_usage=cluster_usage, step=-1000)
-
             prob_per_class_is_chosen = pred_BL.view(-1).bincount(minlength=V).float()
             dist.allreduce(prob_per_class_is_chosen)
-
-            if self.is_visualizer:
-                prob_per_class_is_chosen /= prob_per_class_is_chosen.sum()
-                cluster_usage = (prob_per_class_is_chosen > 0.001 / V).float().mean().item() * 100
+            prob_per_class_is_chosen /= prob_per_class_is_chosen.sum()
+            cluster_usage = (prob_per_class_is_chosen > 0.001 / V).float().mean().item() * 100
+            if dist.is_master():
+                if g_it == 0:
+                    tb_lg.update(head='AR_iter_loss', z_voc_usage=cluster_usage, step=-10000)
+                    tb_lg.update(head='AR_iter_loss', z_voc_usage=cluster_usage, step=-1000)
                 kw = dict(z_voc_usage=cluster_usage)
                 for si, (bg, ed) in enumerate(self.begin_ends):
                     if 0 <= prog_si < si: break
```
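The rewritten logging block computes `cluster_usage` — the share of the V-entry codebook whose predicted-token frequency exceeds 0.001/V — on every rank (the bincount is still allreduced first), and only gates the tensorboard writes behind `dist.is_master()`. A single-process sketch of the metric itself:

```python
import torch

def vocab_usage_percent(pred_tokens: torch.Tensor, V: int) -> float:
    """Percent of the V-way codebook used more often than 0.001/V of the time.

    Single-process sketch of the trainer's z_voc_usage metric; the real code
    allreduces the bincount across ranks before normalizing.
    """
    counts = pred_tokens.view(-1).bincount(minlength=V).float()
    freq = counts / counts.sum()
    return (freq > 0.001 / V).float().mean().item() * 100

# Example: uniformly random tokens over a 4096-entry codebook give near-100% usage.
print(vocab_usage_percent(torch.randint(0, 4096, (64, 680)), V=4096))
```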

utils/arg_util.py

Lines changed: 32 additions & 4 deletions

```diff
@@ -1,3 +1,4 @@
+import json
 import os
 import random
 import re
@@ -85,11 +86,17 @@ class Args(Tap):
     branch: str = subprocess.check_output(f'git symbolic-ref --short HEAD 2>/dev/null || git rev-parse HEAD', shell=True).decode('utf-8').strip() or '[unknown]'  # [automatically set; don't specify this]
     commit_id: str = subprocess.check_output(f'git rev-parse HEAD', shell=True).decode('utf-8').strip() or '[unknown]'  # [automatically set; don't specify this]
     commit_msg: str = (subprocess.check_output(f'git log -1', shell=True).decode('utf-8').strip().splitlines() or ['[unknown]'])[-1].strip()  # [automatically set; don't specify this]
-    max_acc_mean: float = None  # [automatically set; don't specify this]
-    max_acc_tail: float = None  # [automatically set; don't specify this]
-    min_L_mean: float = None    # [automatically set; don't specify this]
-    min_L_tail: float = None    # [automatically set; don't specify this]
+    acc_mean: float = None      # [automatically set; don't specify this]
+    acc_tail: float = None      # [automatically set; don't specify this]
+    L_mean: float = None        # [automatically set; don't specify this]
+    L_tail: float = None        # [automatically set; don't specify this]
+    vacc_mean: float = None     # [automatically set; don't specify this]
+    vacc_tail: float = None     # [automatically set; don't specify this]
+    vL_mean: float = None       # [automatically set; don't specify this]
+    vL_tail: float = None       # [automatically set; don't specify this]
     grad_norm: float = None     # [automatically set; don't specify this]
+    cur_lr: float = None        # [automatically set; don't specify this]
+    cur_wd: float = None        # [automatically set; don't specify this]
     cur_it: str = ''            # [automatically set; don't specify this]
     cur_ep: str = ''            # [automatically set; don't specify this]
     remain_time: str = ''       # [automatically set; don't specify this]
@@ -168,6 +175,27 @@ def set_tf32(tf32: bool):
            print(f'[tf32] [ conv ] torch.backends.cudnn.allow_tf32: {torch.backends.cudnn.allow_tf32}')
            print(f'[tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: {torch.backends.cuda.matmul.allow_tf32}')
 
+    def dump_log(self):
+        if not dist.is_local_master():
+            return
+        if '1/' in self.cur_ep:  # first time to dump log
+            with open(self.log_txt_path, 'w') as fp:
+                json.dump({'is_master': dist.is_master(), 'name': self.exp_name, 'cmd': self.cmd, 'commit': self.commit_id, 'branch': self.branch, 'tb_log_dir_path': self.tb_log_dir_path}, fp, indent=0)
+                fp.write('\n')
+
+        log_dict = {}
+        for k, v in {
+            'it': self.cur_it, 'ep': self.cur_ep,
+            'lr': self.cur_lr, 'wd': self.cur_wd, 'grad_norm': self.grad_norm,
+            'L_mean': self.L_mean, 'L_tail': self.L_tail, 'acc_mean': self.acc_mean, 'acc_tail': self.acc_tail,
+            'vL_mean': self.vL_mean, 'vL_tail': self.vL_tail, 'vacc_mean': self.vacc_mean, 'vacc_tail': self.vacc_tail,
+            'remain_time': self.remain_time, 'finish_time': self.finish_time,
+        }.items():
+            if hasattr(v, 'item'): v = v.item()
+            log_dict[k] = v
+        with open(self.log_txt_path, 'a') as fp:
+            fp.write(f'{log_dict}\n')
+
     def __str__(self):
         s = []
         for k in self.class_variables.keys():
```
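`dump_log` writes a one-time header with `json.dump`, then appends one `repr`'d dict per call; the `hasattr(v, 'item')` check unwraps 0-dim tensors into plain Python numbers so each line stays `ast.literal_eval`-friendly. One caveat worth noting: `'1/' in self.cur_ep` is also true for `'11/350'`, `'21/350'`, and so on, so the header branch reopens `log.txt` in `'w'` mode (truncating it) on every epoch whose 1-based index ends in 1. A stricter first-epoch test would be (hypothetical helper, not in the commit):

```python
def is_first_epoch(cur_ep: str) -> bool:
    """cur_ep is formatted as f'{ep+1}/{args.ep}', so only '1/...' marks the first dump."""
    return cur_ep.split('/')[0] == '1'

assert is_first_epoch('1/350')
assert not is_first_epoch('11/350')  # whereas '1/' in '11/350' is True
```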
