
Commit 4571b4c

Merge pull request #3640 from flairNLP/GH-3444-save-optimizer
Add option to save optimizer and scheduler state during training, and to resume training from these states
2 parents bbb7b66 + 0c733f0 commit 4571b4c
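
In practice the new option is meant to be used roughly as sketched below (a hedged sketch, not from the commit: the corpus, model setup and output paths are placeholders, and it assumes the save_optimizer_state flag is exposed through ModelTrainer.fine_tune/train just as it is through train_custom in the diff further down).

    from flair.datasets import TREC_6
    from flair.embeddings import TransformerDocumentEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer

    # first run: embed optimizer + scheduler state into the saved model files
    corpus = TREC_6()
    label_dict = corpus.make_label_dictionary(label_type="question_class")
    classifier = TextClassifier(
        TransformerDocumentEmbeddings("distilbert-base-uncased", fine_tune=True),
        label_dictionary=label_dict,
        label_type="question_class",
    )
    trainer = ModelTrainer(classifier, corpus)
    trainer.fine_tune("resources/run-1", max_epochs=2, save_optimizer_state=True)

    # later: resume from the saved checkpoint; the trainer picks up the stored
    # optimizer_state_dict / scheduler_state_dict from the loaded model
    resumed = TextClassifier.load("resources/run-1/final-model.pt")
    trainer = ModelTrainer(resumed, corpus)
    trainer.fine_tune("resources/run-2", max_epochs=2, save_optimizer_state=True)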

File tree

5 files changed: +96 -10 lines changed

  flair/nn/model.py
  flair/trainers/plugins/functional/anneal_on_plateau.py
  flair/trainers/plugins/functional/checkpoints.py
  flair/trainers/plugins/functional/linear_scheduler.py
  flair/trainers/trainer.py

flair/nn/model.py

+38 -2

@@ -32,7 +32,17 @@ class Model(torch.nn.Module, typing.Generic[DT], ABC):
     Every new type of model must implement these methods.
     """
 
-    model_card: Optional[dict[str, Any]] = None
+    def __init__(self) -> None:
+        super().__init__()
+
+        # The model card can contain training parameters and metadata
+        self.model_card: Optional[dict[str, Any]] = None
+
+        # Optimizer and scheduler states are only set during training when save_optimizer_state=True
+        # is passed to the ModelTrainer. These states allow resuming training from a checkpoint
+        # with the exact same optimizer and learning rate scheduler states.
+        self.optimizer_state_dict: Optional[dict[str, Any]] = None
+        self.scheduler_state_dict: Optional[dict[str, Any]] = None
 
     @property
     @abstractmethod
@@ -86,10 +96,26 @@ def evaluate(
         raise NotImplementedError
 
     def _get_state_dict(self) -> dict:
-        """Returns the state dictionary for this model."""
+        """Returns the state dictionary for this model.
+
+        The state dictionary contains:
+        - "state_dict": The model's parameters state dictionary
+        - "__cls__": The class name of the model for loading
+        - "optimizer_state_dict": The optimizer's state dictionary (if it exists)
+        - "scheduler_state_dict": The scheduler's state dictionary (if it exists)
+        - "model_card": Training parameters and metadata (if set)
+        """
         # Always include the name of the Model class for which the state dict holds
         state_dict = {"state_dict": self.state_dict(), "__cls__": self.__class__.__name__}
 
+        # Add optimizer state dict if it exists
+        if hasattr(self, "optimizer_state_dict") and self.optimizer_state_dict is not None:
+            state_dict["optimizer_state_dict"] = self.optimizer_state_dict
+
+        # Add scheduler state dict if it exists
+        if hasattr(self, "scheduler_state_dict") and self.scheduler_state_dict is not None:
+            state_dict["scheduler_state_dict"] = self.scheduler_state_dict
+
         return state_dict
 
     @classmethod
@@ -105,6 +131,16 @@ def _init_model_with_state_dict(cls, state: dict[str, Any], **kwargs):
 
         model.load_state_dict(state["state_dict"])
 
+        # load optimizer state if it exists in the state dict
+        if "optimizer_state_dict" in state:
+            log.debug(f"Found optimizer state in model file with keys: {state['optimizer_state_dict'].keys()}")
+            model.optimizer_state_dict = state["optimizer_state_dict"]
+
+        # load scheduler state if it exists in the state dict
+        if "scheduler_state_dict" in state:
+            log.debug(f"Found scheduler state in model file with keys: {state['scheduler_state_dict'].keys()}")
+            model.scheduler_state_dict = state["scheduler_state_dict"]
+
         return model
 
     @staticmethod
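
A small sketch (not from the commit) of what the extended load path gives you: a checkpoint written with save_optimizer_state=True repopulates the two new attributes when the model is loaded again. The model class and path below are placeholders.

    from flair.models import SequenceTagger

    tagger = SequenceTagger.load("resources/run-1/final-model.pt")  # hypothetical checkpoint

    # populated by _init_model_with_state_dict if the file was saved with optimizer state
    if tagger.optimizer_state_dict is not None:
        print("optimizer state entries:", list(tagger.optimizer_state_dict.keys()))
    if tagger.scheduler_state_dict is not None:
        print("scheduler state entries:", list(tagger.scheduler_state_dict.keys()))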

flair/trainers/plugins/functional/anneal_on_plateau.py

+14 -1

@@ -58,21 +58,34 @@ def after_setup(
         anneal_mode = "min" if train_with_dev else "max"
 
         # instantiate the scheduler
-        self.scheduler: AnnealOnPlateau = AnnealOnPlateau(
+        self.scheduler = AnnealOnPlateau(
             factor=self.anneal_factor,
             patience=self.patience,
             initial_extra_patience=self.initial_extra_patience,
             mode=anneal_mode,
             optimizer=self.trainer.optimizer,
         )
 
+        # Load scheduler state if it exists
+        if hasattr(self.trainer.model, "scheduler_state_dict") and self.trainer.model.scheduler_state_dict is not None:
+            try:
+                log.info("Found saved scheduler state, loading it...")
+                self.scheduler.load_state_dict(self.trainer.model.scheduler_state_dict)
+                log.info("Scheduler state loaded successfully!")
+            except Exception as e:
+                log.warning(f"Could not load scheduler state: {e}")
+
         self.store_learning_rate()
 
     @TrainerPlugin.hook
     def after_evaluation(self, current_model_is_best, validation_scores, **kw):
         """Scheduler step of AnnealOnPlateau."""
         reduced_learning_rate: bool = self.scheduler.step(*validation_scores)
 
+        # Save scheduler state after step
+        if hasattr(self.trainer.model, "save_optimizer_state") and self.trainer.model.save_optimizer_state:
+            self.trainer.model.scheduler_state_dict = self.scheduler.state_dict()
+
         self.store_learning_rate()
 
         bad_epochs = self.scheduler.num_bad_epochs
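
For orientation (not part of the commit): AnnealOnPlateau is flair's own scheduler, but the diff relies on the standard state_dict/load_state_dict contract, illustrated here with torch's analogous ReduceLROnPlateau. Restoring this state keeps the patience counter and best-so-far score, so a resumed run does not silently restart the annealing schedule.

    import torch

    model = torch.nn.Linear(4, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", patience=3)

    scheduler.step(0.80)  # first metric becomes the best-so-far score
    scheduler.step(0.79)  # worse than best -> one "bad epoch" is counted
    saved = scheduler.state_dict()  # contains num_bad_epochs, best, cooldown_counter, ...

    fresh = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", patience=3)
    fresh.load_state_dict(saved)
    print(fresh.num_bad_epochs, fresh.best)  # counters carried over instead of reset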

flair/trainers/plugins/functional/checkpoints.py

+3 -1

@@ -27,7 +27,9 @@ def after_training_epoch(self, epoch, **kw):
                 f"was set"
             )
             model_name = "model_epoch_" + str(epoch) + ".pt"
-            self.model.save(self.base_path / model_name, checkpoint=self.save_optimizer_state)
+
+            # Use trainer's _save_model method - we have access to trainer through self.trainer
+            self.trainer._save_model(self.base_path / model_name, save_optimizer_state=self.save_optimizer_state)
 
     @property
     def attach_to_all_processes(self) -> bool:
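
Given a ModelTrainer set up as in the first sketch near the top of this page (all names and paths remain placeholders), the periodic epoch checkpoints written by this plugin now also carry the optimizer/scheduler state:

    trainer.train(
        "resources/run-1",
        max_epochs=10,
        save_model_each_k_epochs=2,   # CheckpointPlugin writes model_epoch_2.pt, model_epoch_4.pt, ...
        save_optimizer_state=True,    # each of those checkpoints now embeds optimizer + scheduler state
    )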

flair/trainers/plugins/functional/linear_scheduler.py

+12 -0

@@ -45,6 +45,15 @@ def after_setup(
             num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, optimizer=self.trainer.optimizer
         )
 
+        # Load scheduler state if it exists
+        if hasattr(self.trainer.model, "scheduler_state_dict") and self.trainer.model.scheduler_state_dict is not None:
+            try:
+                log.info("Found saved scheduler state, loading it...")
+                self.scheduler.load_state_dict(self.trainer.model.scheduler_state_dict)
+                log.info("Scheduler state loaded successfully!")
+            except Exception as e:
+                log.warning(f"Could not load scheduler state: {e}")
+
         self.store_learning_rate()
 
     @TrainerPlugin.hook
@@ -60,6 +69,9 @@ def after_training_batch(self, optimizer_was_run: bool, **kwargs):
         if not optimizer_was_run:
             return
         self.scheduler.step()
+        # Save scheduler state after step
+        if hasattr(self.trainer.model, "save_optimizer_state") and self.trainer.model.save_optimizer_state:
+            self.trainer.model.scheduler_state_dict = self.scheduler.state_dict()
         self.store_learning_rate()
 
     def __str__(self) -> str:
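
Why the state is saved after every batch: this scheduler steps per batch, and its state records the step counter, so restoring it lets a resumed run continue the warmup/decay curve at the right position instead of from step 0. A stand-in sketch with a plain PyTorch LambdaLR (flair's warmup scheduler is assumed to behave analogously; the numbers are illustrative):

    import torch

    optimizer = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=5e-5)
    warmup, total = 100, 1000
    lr_lambda = lambda step: min(step / warmup, max(0.0, (total - step) / (total - warmup)))
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    for _ in range(250):
        optimizer.step()
        scheduler.step()

    saved = scheduler.state_dict()          # includes last_epoch == 250 (the step counter)
    resumed = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    resumed.load_state_dict(saved)          # continues at step 250, not step 0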

flair/trainers/trainer.py

+29 -6

@@ -500,6 +500,15 @@ def train_custom(
         else:
             self.optimizer = optimizer(params=self.model.parameters(), **kwargs)
 
+        # load optimizer state if it exists
+        optimizer_state_loaded = False
+        if hasattr(self.model, "optimizer_state_dict") and self.model.optimizer_state_dict is not None:
+            try:
+                self.optimizer.load_state_dict(self.model.optimizer_state_dict)
+                optimizer_state_loaded = True
+            except Exception as e:
+                log.warning(f"Found saved optimizer state from previous training but could not load it: {e}")
+
         # initialize sampler if provided
         if sampler is not None:
             # init with default values if only class is provided
@@ -561,13 +570,17 @@ def train_custom(
         log.info(f" (train_with_dev={train_with_dev}, train_with_test={train_with_test})")
         log_line(log)
         log.info("Training Params:")
+        log.info(f' - optimizer: "{optimizer}" ')
         log.info(
             f' - learning_rate: "{learning_rate}" '
             f'{"(decoder: " + str(decoder_learning_rate) + ")" if decoder_learning_rate else ""}'
         )
         log.info(f' - mini_batch_size: "{mini_batch_size}"')
         log.info(f' - max_epochs: "{max_epochs}"')
         log.info(f' - shuffle: "{shuffle}"')
+        if optimizer_state_loaded:
+            log_line(log)
+            log.info("Optimizer state loaded from previous training!")
         log_line(log)
         log.info("Plugins:")
         for plugin in plugins:
@@ -813,14 +826,14 @@ def wrapped_forward_loss(*args, **kwargs2):
 
                 if save_best_model and current_epoch_has_best_model_so_far:
                     log.info("saving best model")
-                    self._save_model(base_path / "best-model.pt", checkpoint=save_optimizer_state)
+                    self._save_model(base_path / "best-model.pt", save_optimizer_state=save_optimizer_state)
 
             # - SWAPlugin -> restores SGD weights from SWA
             self.dispatch("after_training_loop")
 
             # if we do not use dev data for model selection, save final model
             if save_final_model:
-                self._save_model(base_path / "final-model.pt", checkpoint=save_optimizer_state)
+                self._save_model(base_path / "final-model.pt", save_optimizer_state=save_optimizer_state)
 
         except KeyboardInterrupt:
             log_line(log)
@@ -830,7 +843,7 @@ def wrapped_forward_loss(*args, **kwargs2):
 
             if save_final_model:
                 log.info("Saving model ...")
-                self._save_model(base_path / "final-model.pt", checkpoint=save_optimizer_state)
+                self._save_model(base_path / "final-model.pt", save_optimizer_state=save_optimizer_state)
                 log.info("Done.")
 
         except TrainingInterrupt as exc:
@@ -841,7 +854,7 @@ def wrapped_forward_loss(*args, **kwargs2):
 
             if save_final_model:
                 log.info("Saving model ...")
-                self._save_model(base_path / "final-model.pt", checkpoint=save_optimizer_state)
+                self._save_model(base_path / "final-model.pt", save_optimizer_state=save_optimizer_state)
                 log.info("Done.")
 
         except Exception:
@@ -989,9 +1002,19 @@ def _record(self, metric):
     def _load_model(self, model_file: Union[str, Path]) -> None:
         self.model.load_state_dict(self.model.load(model_file).state_dict())
 
-    def _save_model(self, model_file: Union[str, Path], checkpoint: bool = False) -> None:
+    def _save_model(self, model_file: Union[str, Path], save_optimizer_state: bool = False) -> None:
         if is_main_process():
-            self.model.save(model_file, checkpoint)
+            if save_optimizer_state:
+                # Save optimizer state
+                self.model.optimizer_state_dict = self.optimizer.state_dict()
+
+                # Save scheduler state from active plugins
+                for plugin in self.plugins:
+                    if hasattr(plugin, "scheduler"):
+                        self.model.scheduler_state_dict = plugin.scheduler.state_dict()
+                        break  # Only save the first scheduler we find
+
+            self.model.save(model_file)
         if torch.distributed.is_initialized():
            torch.distributed.barrier()  # Prevent any process from loading a model until writing is complete
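
Since Model.save() serializes the payload produced by _get_state_dict(), a checkpoint written through the new _save_model path can be inspected directly; a hedged sketch (the path is a placeholder):

    import torch

    # flair checkpoints are pickled objects; on torch >= 2.6 you may need weights_only=False
    state = torch.load("resources/run-1/final-model.pt", map_location="cpu")
    print(sorted(state.keys()))
    # with save_optimizer_state=True this should include '__cls__', 'state_dict',
    # 'optimizer_state_dict' and 'scheduler_state_dict' (plus 'model_card' if set)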
