Commit d352b4c

update deepspeed
1 parent b89ca6e commit d352b4c

3 files changed: +18 -5 lines changed

src/lightning/fabric/accelerators/accelerator.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -49,7 +49,7 @@ def get_parallel_devices(devices: Any) -> Any:
     @staticmethod
     @abstractmethod
     def get_device_type() -> Any:
-        """Get the device for the current Accelerator."""
+        """Get the device_type for the current Accelerator."""
 
     @staticmethod
     @abstractmethod
```
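For reference, a concrete Accelerator subclass is expected to return the name of a torch backend namespace from this hook. A minimal, hypothetical sketch (the `DemoCUDAAccelerator` class below is illustrative only, not Lightning's actual implementation):

```python
from abc import ABC, abstractmethod
from typing import Any


class Accelerator(ABC):
    """Trimmed stand-in for lightning.fabric.accelerators.Accelerator."""

    @staticmethod
    @abstractmethod
    def get_device_type() -> Any:
        """Get the device_type for the current Accelerator."""


class DemoCUDAAccelerator(Accelerator):
    """Hypothetical concrete accelerator, shown only to illustrate the contract."""

    @staticmethod
    def get_device_type() -> str:
        # The returned string is expected to name a torch namespace,
        # e.g. torch.cuda, so it can be resolved via getattr(torch, ...).
        return "cuda"


assert DemoCUDAAccelerator.get_device_type() == "cuda"
```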

src/lightning/fabric/strategies/deepspeed.py

Lines changed: 8 additions & 4 deletions

```diff
@@ -299,6 +299,12 @@ def __init__(
 
         self._deepspeed_engine: Optional[DeepSpeedEngine] = None
 
+        if isinstance(self.accelerator, Accelerator):
+            self.device_type = self.accelerator.get_device_type()
+        else:
+            self.device_type = "cuda"
+        self.torch_lib = getattr(torch, self.device_type)
+
     @property
     def zero_stage_3(self) -> bool:
         assert isinstance(self.config, dict)
@@ -511,10 +517,8 @@ def load_checkpoint(
 
         optimzer_state_requested = any(isinstance(item, (Optimizer, DeepSpeedOptimizer)) for item in state.values())
 
-        if isinstance(self.accelerator, Accelerator) and self.accelerator.get_device_type() != "cpu":
-            getattr(torch, self.root_device.type).empty_cache()
-        else:
-            torch.cuda.empty_cache()
+        if hasattr(torch, self.device_type) and callable(self.torch_lib.empty_cache):
+            self.torch_lib.empty_cache()
 
         _, client_state = engine.load_checkpoint(
             path,
```
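The fabric-side change swaps the hard-coded `torch.cuda.empty_cache()` for a lookup through the accelerator-reported backend name, so non-CUDA backends can clear their caching allocators too. A standalone sketch of that dispatch pattern, using only public `torch` APIs (the `empty_device_cache` helper name is invented for illustration):

```python
import torch


def empty_device_cache(device_type: str = "cuda") -> None:
    """Clear the caching allocator of the named backend, if it exposes one.

    Backends such as torch.cuda, torch.mps, and torch.xpu provide an
    empty_cache() function; a backend without one simply falls through.
    """
    torch_lib = getattr(torch, device_type, None)
    if torch_lib is not None and callable(getattr(torch_lib, "empty_cache", None)):
        torch_lib.empty_cache()


# torch.cuda.empty_cache() is a no-op while CUDA is uninitialized,
# so this is safe to call even on a CPU-only machine.
empty_device_cache("cuda")
```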

src/lightning/pytorch/strategies/deepspeed.py

Lines changed: 9 additions & 0 deletions

```diff
@@ -319,6 +319,12 @@ def __init__(
         self.hysteresis = hysteresis
         self.min_loss_scale = min_loss_scale
 
+        try:
+            self.device_type = self.accelerator.get_device_type()
+        except Exception:
+            self.device_type = "cuda"
+        self.torch_lib = getattr(torch, self.device_type)
+
     @override
     def setup_environment(self) -> None:
         from deepspeed.runtime.utils import get_accelerator
@@ -672,6 +678,9 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> dict[str, Any]:
 
         is_fitting = self.lightning_module.trainer.state.fn == TrainerFn.FITTING
 
+        if hasattr(torch, self.device_type) and callable(self.torch_lib.empty_cache):
+            self.torch_lib.empty_cache()
+
         _, client_state = self.deepspeed_engine.load_checkpoint(
             checkpoint_path,
             load_optimizer_states=is_fitting,
```
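On the Trainer-side strategy the backend handle is resolved once in `__init__`, with a broad `try/except` falling back to the historical `"cuda"` default when the accelerator is missing or does not implement `get_device_type()`. A hedged sketch of just that resolution step (the `resolve_torch_backend` helper is invented for illustration):

```python
from types import ModuleType
from typing import Optional, Tuple

import torch


def resolve_torch_backend(accelerator: Optional[object]) -> Tuple[str, ModuleType]:
    """Mirror of the __init__ fallback above, as a hypothetical free function."""
    try:
        # Raises AttributeError if accelerator is None or lacks the hook.
        device_type = accelerator.get_device_type()
    except Exception:
        device_type = "cuda"  # keep the pre-existing CUDA default
    return device_type, getattr(torch, device_type)


# Without an accelerator the fallback lands on torch.cuda.
device_type, torch_lib = resolve_torch_backend(None)
assert device_type == "cuda" and torch_lib is torch.cuda
```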
