From 4d61489d0e5d380e60f9b547c9a5eb2d08a81dd1 Mon Sep 17 00:00:00 2001
From: tj-solergibert
Date: Fri, 6 Sep 2024 12:27:02 +0200
Subject: [PATCH 1/6] Fix pp naming

---
 src/nanotron/serialize/main.py      |  1 +
 src/nanotron/serialize/optimizer.py | 12 +++++-------
 src/nanotron/trainer.py             |  9 +++++----
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/nanotron/serialize/main.py b/src/nanotron/serialize/main.py
index 286008ac..346ad573 100644
--- a/src/nanotron/serialize/main.py
+++ b/src/nanotron/serialize/main.py
@@ -236,6 +236,7 @@ def load(
     load_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=root_folder)
     load_lr_scheduler(
         lr_scheduler=lr_scheduler,
+        parallel_context=parallel_context,
         root_folder=root_folder,
     )
     return checkpoint_metadata
diff --git a/src/nanotron/serialize/optimizer.py b/src/nanotron/serialize/optimizer.py
index 68a3b1a0..72ed7282 100644
--- a/src/nanotron/serialize/optimizer.py
+++ b/src/nanotron/serialize/optimizer.py
@@ -30,9 +30,9 @@ def optimizer_filename(parallel_context: ParallelContext, is_zero: bool):
         return f"{ObjectType.OPTIMIZER.value}_pp-{dist.get_rank(parallel_context.pp_pg)}-of-{parallel_context.pp_pg.size()}_tp-{dist.get_rank(parallel_context.tp_pg)}-of-{parallel_context.tp_pg.size()}_exp-{dist.get_rank(parallel_context.expert_pg)}-of-{parallel_context.expert_parallel_size}.pt"
 
 
-def lr_scheduler_filename():
+def lr_scheduler_filename(parallel_context: ParallelContext):
     """The lr_scheduler is the same for all processes."""
-    return f"{ObjectType.LR_SCHEDULER.value}.pt"
+    return f"{ObjectType.LR_SCHEDULER.value}_pp-{dist.get_rank(parallel_context.pp_pg)}-of-{parallel_context.pp_pg.size()}.pt"
 
 
 def save_optimizer(
@@ -109,9 +109,6 @@ def save_lr_scheduler(
     root_folder: Path,
 ):
     """Saves lr scheduler states"""
-    if dist.get_rank(parallel_context.world_pg) > 0:
-        # Only WORLD-RANK 0 saves the lr scheduler state
-        return
 
     root_folder = root_folder / "lr_scheduler"
     root_folder.mkdir(exist_ok=True, parents=True)
@@ -119,7 +116,7 @@
     # We dump the optimizer state using `torch.save`
     torch.save(
         lr_scheduler.state_dict(),
-        root_folder / lr_scheduler_filename(),
+        root_folder / lr_scheduler_filename(parallel_context),
     )
 
 
@@ -313,9 +310,10 @@ def get_checkpoint_state_metadata(param_name: str, pp_rank: int, tp_rank: int) -
 
 def load_lr_scheduler(
     lr_scheduler,
+    parallel_context: ParallelContext,
     root_folder: Path,
 ):
     root_folder = root_folder / "lr_scheduler"
 
-    state_dict = torch.load(root_folder / lr_scheduler_filename())
+    state_dict = torch.load(root_folder / lr_scheduler_filename(parallel_context))
     lr_scheduler.load_state_dict(state_dict)
diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py
index bef629c1..e5f6bde3 100644
--- a/src/nanotron/trainer.py
+++ b/src/nanotron/trainer.py
@@ -206,6 +206,7 @@ def __init__(
         if self.init_checkpoint_path is not None:
             load_lr_scheduler(
                 lr_scheduler=self.lr_scheduler,
+                parallel_context=self.parallel_context,
                 root_folder=self.init_checkpoint_path,
             )
 
@@ -442,10 +443,10 @@ def train(
 
                 self.save_checkpoint()
         dist.barrier()  # let's wait for everyone before leaving
-        
+
         if self.config.checkpoints.save_final_state:
             self.save_checkpoint()
-        
+
         self.post_training()
 
     def training_step(
@@ -864,8 +865,8 @@ def save_checkpoint(self) -> Path:
             ),  # We only save the weights on DP==0
             should_save_optimizer=True,
             should_save_lr_scheduler=bool(
-                dist.get_rank(self.parallel_context.world_pg) == 0
-            ),  # We only save the lr_scheduler on world_rank==0
+                dist.get_rank(self.parallel_context.dp_pg) == 0 and dist.get_rank(self.parallel_context.tp_pg)
+            ),  # We only save the lr_scheduler on DP==0 && TP==0
             should_save_config=bool(
                 dist.get_rank(self.parallel_context.world_pg) == 0
             ),  # We only save the config on world_rank==0
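Patch 1 replaces the single, world-rank-0 lr_scheduler file with one file per pipeline-parallel rank. A plausible way to see why that is needed, as a self-contained sketch (my illustration, not part of the series): the scheduler state follows the param groups of the optimizer it wraps, and under pipeline parallelism each PP rank wraps a different optimizer, so one rank's state file is not generally valid for another. The two "stages" below are hypothetical stand-ins for model shards.

    import torch
    from torch.optim.lr_scheduler import LambdaLR

    # Hypothetical stand-ins for the model shards owned by two pipeline ranks.
    stage0 = torch.nn.Linear(8, 8)
    stage1 = torch.nn.Linear(8, 2)

    warmup = lambda step: min(1.0, step / 10)

    opt0 = torch.optim.AdamW(stage0.parameters(), lr=1e-3)  # one param group
    opt1 = torch.optim.AdamW(
        [{"params": [stage1.weight]}, {"params": [stage1.bias], "weight_decay": 0.0}],
        lr=1e-3,
    )  # two param groups

    sched0 = LambdaLR(opt0, lr_lambda=warmup)
    sched1 = LambdaLR(opt1, lr_lambda=warmup)

    # The scheduler state mirrors the wrapped optimizer's param groups, so the
    # two ranks do not produce interchangeable state dicts.
    print(len(sched0.state_dict()["base_lrs"]))  # 1
    print(len(sched1.state_dict()["base_lrs"]))  # 2

Loading rank 0's file everywhere would mismatch base_lrs and lr_lambdas on the other pipeline ranks, which a per-rank pp-{rank}-of-{size} file sidesteps.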
From ef835e87eef82752a287a498093059017dac44ca Mon Sep 17 00:00:00 2001
From: tj-solergibert
Date: Fri, 6 Sep 2024 14:06:22 +0200
Subject: [PATCH 2/6] Added EP==0

---
 src/nanotron/trainer.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py
index e5f6bde3..2240fcaa 100644
--- a/src/nanotron/trainer.py
+++ b/src/nanotron/trainer.py
@@ -865,8 +865,10 @@ def save_checkpoint(self) -> Path:
             ),  # We only save the weights on DP==0
             should_save_optimizer=True,
             should_save_lr_scheduler=bool(
-                dist.get_rank(self.parallel_context.dp_pg) == 0 and dist.get_rank(self.parallel_context.tp_pg)
-            ),  # We only save the lr_scheduler on DP==0 && TP==0
+                dist.get_rank(self.parallel_context.dp_pg) == 0
+                and dist.get_rank(self.parallel_context.tp_pg) == 0
+                and dist.get_rank(self.parallel_context.expert_pg) == 0
+            ),  # We only save the lr_scheduler on DP==0 && TP==0 && EP==0
             should_save_config=bool(
                 dist.get_rank(self.parallel_context.world_pg) == 0
             ),  # We only save the config on world_rank==0
From 51bd072a59b37eb73f980207b5dd831243332969 Mon Sep 17 00:00:00 2001
From: tj-solergibert
Date: Tue, 26 Nov 2024 12:11:07 +0100
Subject: [PATCH 3/6] LR Schedule same name as optimizer

---
 src/nanotron/serialize/optimizer.py | 17 ++++++++++++-----
 src/nanotron/trainer.py             |  7 ++-----
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/nanotron/serialize/optimizer.py b/src/nanotron/serialize/optimizer.py
index 72ed7282..c987f227 100644
--- a/src/nanotron/serialize/optimizer.py
+++ b/src/nanotron/serialize/optimizer.py
@@ -30,9 +30,11 @@ def optimizer_filename(parallel_context: ParallelContext, is_zero: bool):
         return f"{ObjectType.OPTIMIZER.value}_pp-{dist.get_rank(parallel_context.pp_pg)}-of-{parallel_context.pp_pg.size()}_tp-{dist.get_rank(parallel_context.tp_pg)}-of-{parallel_context.tp_pg.size()}_exp-{dist.get_rank(parallel_context.expert_pg)}-of-{parallel_context.expert_parallel_size}.pt"
 
 
-def lr_scheduler_filename(parallel_context: ParallelContext):
-    """The lr_scheduler is the same for all processes."""
-    return f"{ObjectType.LR_SCHEDULER.value}_pp-{dist.get_rank(parallel_context.pp_pg)}-of-{parallel_context.pp_pg.size()}.pt"
+def lr_scheduler_filename(parallel_context: ParallelContext, is_zero: bool):
+    if is_zero is True:
+        return f"{ObjectType.LR_SCHEDULER.value}_pp-{dist.get_rank(parallel_context.pp_pg)}-of-{parallel_context.pp_pg.size()}_dp-{dist.get_rank(parallel_context.dp_pg)}-of-{parallel_context.dp_pg.size()}_tp-{dist.get_rank(parallel_context.tp_pg)}-of-{parallel_context.tp_pg.size()}_exp-{dist.get_rank(parallel_context.expert_pg)}-of-{parallel_context.expert_parallel_size}.pt"
+    else:
+        return f"{ObjectType.LR_SCHEDULER.value}_pp-{dist.get_rank(parallel_context.pp_pg)}-of-{parallel_context.pp_pg.size()}_tp-{dist.get_rank(parallel_context.tp_pg)}-of-{parallel_context.tp_pg.size()}_exp-{dist.get_rank(parallel_context.expert_pg)}-of-{parallel_context.expert_parallel_size}.pt"
 
 
 def save_optimizer(
@@ -105,10 +107,14 @@ def convert_to_string(input_item):
 
 def save_lr_scheduler(
     lr_scheduler,
+    is_zero,
     parallel_context: ParallelContext,
     root_folder: Path,
 ):
     """Saves lr scheduler states"""
+    if not is_zero and dist.get_rank(parallel_context.dp_pg) > 0:
+        # this is Zero-0, so only DP-0 saves the optimizer states
+        return
 
     root_folder = root_folder / "lr_scheduler"
     root_folder.mkdir(exist_ok=True, parents=True)
@@ -116,7 +122,7 @@
     # We dump the optimizer state using `torch.save`
     torch.save(
         lr_scheduler.state_dict(),
-        root_folder / lr_scheduler_filename(parallel_context),
+        root_folder / lr_scheduler_filename(parallel_context, is_zero),
     )
 
 
@@ -310,10 +316,11 @@ def get_checkpoint_state_metadata(param_name: str, pp_rank: int, tp_rank: int) -
 
 def load_lr_scheduler(
     lr_scheduler,
+    is_zero,
     parallel_context: ParallelContext,
     root_folder: Path,
 ):
     root_folder = root_folder / "lr_scheduler"
 
-    state_dict = torch.load(root_folder / lr_scheduler_filename(parallel_context))
+    state_dict = torch.load(root_folder / lr_scheduler_filename(parallel_context, is_zero))
     lr_scheduler.load_state_dict(state_dict)
diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py
index 2240fcaa..a0f577d8 100644
--- a/src/nanotron/trainer.py
+++ b/src/nanotron/trainer.py
@@ -206,6 +206,7 @@ def __init__(
         if self.init_checkpoint_path is not None:
             load_lr_scheduler(
                 lr_scheduler=self.lr_scheduler,
+                is_zero=self.config.optimizer.zero_stage,
                 parallel_context=self.parallel_context,
                 root_folder=self.init_checkpoint_path,
             )
@@ -864,11 +865,7 @@ def save_checkpoint(self) -> Path:
                 dist.get_rank(self.parallel_context.dp_pg) == 0
             ),  # We only save the weights on DP==0
             should_save_optimizer=True,
-            should_save_lr_scheduler=bool(
-                dist.get_rank(self.parallel_context.dp_pg) == 0
-                and dist.get_rank(self.parallel_context.tp_pg) == 0
-                and dist.get_rank(self.parallel_context.expert_pg) == 0
-            ),  # We only save the lr_scheduler on DP==0 && TP==0 && EP==0
+            should_save_lr_scheduler=True,
             should_save_config=bool(
                 dist.get_rank(self.parallel_context.world_pg) == 0
             ),  # We only save the config on world_rank==0

From 1981af2262ddac82877bd9c517c380f5298c3e33 Mon Sep 17 00:00:00 2001
From: tj-solergibert
Date: Tue, 26 Nov 2024 12:14:35 +0100
Subject: [PATCH 4/6] Load properly

---
 src/nanotron/serialize/main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/nanotron/serialize/main.py b/src/nanotron/serialize/main.py
index 346ad573..91a26efa 100644
--- a/src/nanotron/serialize/main.py
+++ b/src/nanotron/serialize/main.py
@@ -236,6 +236,7 @@ def load(
     load_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=root_folder)
     load_lr_scheduler(
         lr_scheduler=lr_scheduler,
+        is_zero=not optimizer.inherit_from(optim.ZeroDistributedOptimizer),
         parallel_context=parallel_context,
         root_folder=root_folder,
     )
     return checkpoint_metadata

From a7ca23b55550ee9e95d87a91aabf9f5a2eed42b0 Mon Sep 17 00:00:00 2001
From: tj-solergibert
Date: Tue, 26 Nov 2024 12:18:55 +0100
Subject: [PATCH 5/6] not

---
 src/nanotron/serialize/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nanotron/serialize/main.py b/src/nanotron/serialize/main.py
index 91a26efa..a1cb3027 100644
--- a/src/nanotron/serialize/main.py
+++ b/src/nanotron/serialize/main.py
@@ -236,7 +236,7 @@ def load(
     load_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=root_folder)
     load_lr_scheduler(
         lr_scheduler=lr_scheduler,
-        is_zero=not optimizer.inherit_from(optim.ZeroDistributedOptimizer),
+        is_zero=optimizer.inherit_from(optim.ZeroDistributedOptimizer),
         parallel_context=parallel_context,
         root_folder=root_folder,
     )
     return checkpoint_metadata
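Patches 3 to 5 settle on the same naming convention for lr_scheduler files that optimizer_filename already uses: with ZeRO-1 every (pp, dp, tp, exp) rank writes its own file, while without ZeRO the dp coordinate is dropped and only DP rank 0 writes (the early return added to save_lr_scheduler). The standalone sketch below mirrors that scheme with plain integers instead of a ParallelContext and assumes ObjectType.LR_SCHEDULER.value is the literal "lr_scheduler"; it is illustrative only.

    def lr_scheduler_filename_sketch(
        pp_rank: int, pp_size: int,
        dp_rank: int, dp_size: int,
        tp_rank: int, tp_size: int,
        exp_rank: int, exp_size: int,
        is_zero: bool,
    ) -> str:
        if is_zero:
            # ZeRO-1: each DP rank drives its own optimizer shard, so the dp
            # coordinate is part of the file name.
            return (
                f"lr_scheduler_pp-{pp_rank}-of-{pp_size}"
                f"_dp-{dp_rank}-of-{dp_size}"
                f"_tp-{tp_rank}-of-{tp_size}"
                f"_exp-{exp_rank}-of-{exp_size}.pt"
            )
        # ZeRO-0: the state is replicated across DP, so dp is omitted and only
        # DP rank 0 actually writes the file.
        return (
            f"lr_scheduler_pp-{pp_rank}-of-{pp_size}"
            f"_tp-{tp_rank}-of-{tp_size}"
            f"_exp-{exp_rank}-of-{exp_size}.pt"
        )

    # For example, pp=2, dp=2, tp=1, exp=1 without ZeRO yields exactly two files:
    #   lr_scheduler_pp-0-of-2_tp-0-of-1_exp-0-of-1.pt
    #   lr_scheduler_pp-1-of-2_tp-0-of-1_exp-0-of-1.pt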
From bd81b677aa8cad33bbd9b85ac383df706ae843c7 Mon Sep 17 00:00:00 2001
From: tj-solergibert
Date: Tue, 26 Nov 2024 12:46:32 +0100
Subject: [PATCH 6/6] ready

---
 src/nanotron/serialize/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nanotron/serialize/main.py b/src/nanotron/serialize/main.py
index 79ae7a73..e9ad04d8 100644
--- a/src/nanotron/serialize/main.py
+++ b/src/nanotron/serialize/main.py
@@ -203,7 +203,7 @@ def save(
     dist.barrier(parallel_context.world_pg)
 
 
-ckpt_path(config: Config, parallel_context: ParallelContext) -> Optional[Path]:
+def parse_ckpt_path(config: Config, parallel_context: ParallelContext) -> Optional[Path]:
     """Parse checkpoint path from config and download checkpoint from S3 if needed.
 
     Args:
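Across the series the serialization mechanics themselves are untouched: whichever ranks pass the gate simply dump lr_scheduler.state_dict() with torch.save into the checkpoint's lr_scheduler/ subfolder and restore it with load_state_dict on resume. A minimal single-process round-trip in the same spirit, with an invented file name and a throwaway directory:

    import tempfile
    from pathlib import Path

    import torch
    from torch.optim.lr_scheduler import LambdaLR

    model = torch.nn.Linear(4, 4)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
    sched = LambdaLR(opt, lr_lambda=lambda step: 1.0 / (1 + step))
    for _ in range(3):
        opt.step()
        sched.step()

    # Mirrors save_lr_scheduler: <root>/lr_scheduler/<rank-specific name>.pt
    root = Path(tempfile.mkdtemp()) / "lr_scheduler"
    root.mkdir(parents=True, exist_ok=True)
    fname = "lr_scheduler_pp-0-of-1_tp-0-of-1_exp-0-of-1.pt"  # ZeRO-0 style name
    torch.save(sched.state_dict(), root / fname)

    # Mirrors load_lr_scheduler on resume: a freshly built scheduler picks up
    # last_epoch and the internal step counter from the saved state.
    resumed = LambdaLR(opt, lr_lambda=lambda step: 1.0 / (1 + step))
    resumed.load_state_dict(torch.load(root / fname))
    assert resumed.last_epoch == sched.last_epoch == 3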