huggingface · siqi654321 · Dec 26, 2024 · Feb 5, 2025
diff --git a/docs/source/usage_guides/big_modeling.md b/docs/source/usage_guides/big_modeling.md
@@ -41,7 +41,7 @@ with init_empty_weights():
 
 Next, the weights are loaded into the model for inference.
 
-The [`load_checkpoint_and_dispatch`] method loads a checkpoint inside your empty model and dispatches the weights for each layer across all available devices, starting with the fastest devices (GPU, MPS, XPU, NPU, MLU, MUSA) first before moving to the slower ones (CPU and hard drive).
+The [`load_checkpoint_and_dispatch`] method loads a checkpoint inside your empty model and dispatches the weights for each layer across all available devices, starting with the fastest devices (GPU, MPS, XPU, NPU, MLU, SDAA, MUSA) first before moving to the slower ones (CPU and hard drive).
 
 Setting `device_map="auto"` automatically fills all available space on the GPU(s) first, then the CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.
 

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
@@ -489,7 +489,7 @@ def __init__(
             and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
         ):
             self.native_amp = True
-            if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu", "musa") or is_torch_xla_available(
+            if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu", "musa", "sdaa") or is_torch_xla_available(
                 check_is_tpu=True
             ):
                 raise ValueError(f"fp16 mixed precision requires a GPU (not {self.device.type!r}).")
@@ -1151,6 +1151,7 @@ def join_uneven_inputs(self, joinables, even_batches=None):
             DistributedType.MULTI_GPU,
             DistributedType.MULTI_NPU,
             DistributedType.MULTI_MLU,
+            DistributedType.MULTI_SDAA,
             DistributedType.MULTI_MUSA,
             DistributedType.MULTI_XPU,
         ):
@@ -1454,6 +1455,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
             if self.distributed_type in (
                 DistributedType.MULTI_GPU,
                 DistributedType.MULTI_MLU,
+                DistributedType.MULTI_SDAA,
                 DistributedType.MULTI_MUSA,
                 DistributedType.MULTI_NPU,
                 DistributedType.MULTI_XPU,
@@ -3228,6 +3230,7 @@ def _inner(folder):
             if self.num_processes > 1 and self.distributed_type in (
                 DistributedType.MULTI_GPU,
                 DistributedType.MULTI_MLU,
+                DistributedType.MULTI_SDAA,
                 DistributedType.MULTI_MUSA,
                 DistributedType.MULTI_NPU,
             ):

diff --git a/src/accelerate/big_modeling.py b/src/accelerate/big_modeling.py
@@ -39,6 +39,7 @@
     infer_auto_device_map,
     is_bnb_available,
     is_mlu_available,
+    is_sdaa_available,
     is_musa_available,
     is_npu_available,
     is_torch_version,
@@ -468,6 +469,8 @@ def wrapper(*args, **kwargs):
             model.npu = add_warning(model.npu, model)
         elif is_mlu_available():
             model.mlu = add_warning(model.mlu, model)
+        elif is_sdaa_available():
+            model.sdaa = add_warning(model.sdaa, model)
         elif is_musa_available():
             model.musa = add_warning(model.musa, model)
         elif is_xpu_available():
@@ -490,6 +493,8 @@ def wrapper(*args, **kwargs):
             device = f"npu:{device}"
         elif is_mlu_available() and isinstance(device, int):
             device = f"mlu:{device}"
+        elif is_sdaa_available() and isinstance(device, int):
+            device = f"sdaa:{device}"
         elif is_musa_available() and isinstance(device, int):
             device = f"musa:{device}"
         elif is_xpu_available() and isinstance(device, int):

diff --git a/src/accelerate/checkpointing.py b/src/accelerate/checkpointing.py
@@ -33,6 +33,7 @@
     WEIGHTS_NAME,
     get_pretty_name,
     is_mlu_available,
+    is_sdaa_available,
     is_torch_xla_available,
     is_xpu_available,
     load,
@@ -152,6 +153,8 @@ def save_accelerator_state(
         states["torch_xpu_manual_seed"] = torch.xpu.get_rng_state_all()
     if is_mlu_available():
         states["torch_mlu_manual_seed"] = torch.mlu.get_rng_state_all()
+    elif is_sdaa_available():
+        states["torch_sdaa_manual_seed"] = torch.sdaa.get_rng_state_all()
     else:
         states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
     if is_torch_xla_available():
@@ -275,6 +278,8 @@ def load_accelerator_state(
             torch.xpu.set_rng_state_all(states["torch_xpu_manual_seed"])
         if is_mlu_available():
             torch.mlu.set_rng_state_all(states["torch_mlu_manual_seed"])
+        elif is_sdaa_available():
+            torch.sdaa.set_rng_state_all(states["torch_sdaa_manual_seed"])
         else:
             torch.cuda.set_rng_state_all(states["torch_cuda_manual_seed"])
         if is_torch_xla_available():

diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py
@@ -22,6 +22,7 @@
     is_deepspeed_available,
     is_fp8_available,
     is_mlu_available,
+    is_sdaa_available,
     is_mps_available,
     is_msamp_available,
     is_musa_available,
@@ -61,6 +62,7 @@ def get_cluster_input():
             "multi-GPU",
             "multi-NPU",
             "multi-MLU",
+            "multi-SDAA",
             "multi-MUSA",
             "TPU",
         ],
@@ -80,6 +82,7 @@ def get_cluster_input():
     if distributed_type in [
         DistributedType.MULTI_GPU,
         DistributedType.MULTI_MLU,
+        DistributedType.MULTI_SDAA,
         DistributedType.MULTI_MUSA,
         DistributedType.MULTI_NPU,
         DistributedType.MULTI_XPU,
@@ -164,6 +167,7 @@ def get_cluster_input():
             DistributedType.MULTI_GPU,
             DistributedType.MULTI_NPU,
             DistributedType.MULTI_MLU,
+            DistributedType.MULTI_SDAA,
             DistributedType.XLA,
             DistributedType.MULTI_MUSA,
         ]
@@ -226,6 +230,7 @@ def get_cluster_input():
             DistributedType.MULTI_XPU,
             DistributedType.MULTI_NPU,
             DistributedType.MULTI_MLU,
+            DistributedType.MULTI_SDAA,
             DistributedType.MULTI_MUSA,
             DistributedType.NO,
         ]
@@ -380,6 +385,7 @@ def get_cluster_input():
         DistributedType.MULTI_GPU,
         DistributedType.MULTI_NPU,
         DistributedType.MULTI_MLU,
+        DistributedType.MULTI_SDAA,
         DistributedType.MULTI_MUSA,
         DistributedType.MULTI_XPU,
     ]:
@@ -552,6 +558,7 @@ def get_cluster_input():
         DistributedType.MULTI_XPU,
         DistributedType.MULTI_GPU,
         DistributedType.MULTI_MLU,
+        DistributedType.MULTI_SDAA,
         DistributedType.MULTI_MUSA,
         DistributedType.MULTI_NPU,
         DistributedType.XLA,
@@ -589,6 +596,7 @@ def get_cluster_input():
         in [
             DistributedType.MULTI_GPU,
             DistributedType.MULTI_MLU,
+            DistributedType.MULTI_SDAA,
             DistributedType.MULTI_MUSA,
             DistributedType.MULTI_NPU,
             DistributedType.MULTI_XPU,
@@ -601,6 +609,8 @@ def get_cluster_input():
             machine_type = "NPU(s)"
         elif is_mlu_available():
             machine_type = "MLU(s)"
+        elif is_sdaa_available():
+            machine_type = "SDAA(s)"
         elif is_musa_available():
             machine_type = "MUSA(s)"
         elif is_xpu_available():

diff --git a/src/accelerate/commands/config/config_utils.py b/src/accelerate/commands/config/config_utils.py
@@ -72,7 +72,7 @@ def _convert_compute_environment(value):
 def _convert_distributed_mode(value):
     value = int(value)
     return DistributedType(
-        ["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "MULTI_MLU", "MULTI_MUSA", "XLA"][value]
+        ["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "MULTI_MLU", "MULTI_SDAA", "MULTI_MUSA", "XLA"][value]
     )
 
 

diff --git a/src/accelerate/commands/config/default.py b/src/accelerate/commands/config/default.py
@@ -18,7 +18,7 @@
 
 import torch
 
-from ...utils import is_mlu_available, is_musa_available, is_npu_available, is_xpu_available
+from ...utils import is_mlu_available, is_sdaa_available, is_musa_available, is_npu_available, is_xpu_available
 from .config_args import ClusterConfig, default_json_config_file
 from .config_utils import SubcommandHelpFormatter
 
@@ -65,6 +65,14 @@ def write_basic_config(mixed_precision="no", save_location: str = default_json_c
             config["distributed_type"] = "MULTI_MLU"
         else:
             config["distributed_type"] = "NO"
+    elif is_sdaa_available():
+        num_sdaas = torch.sdaa.device_count()
+        config["num_processes"] = num_sdaas
+        config["use_cpu"] = False
+        if num_sdaas > 1:
+            config["distributed_type"] = "MULTI_SDAA"
+        else:
+            config["distributed_type"] = "NO"
     elif is_musa_available():
         num_musas = torch.musa.device_count()
         config["num_processes"] = num_musas

diff --git a/src/accelerate/commands/env.py b/src/accelerate/commands/env.py
@@ -26,7 +26,7 @@
 from accelerate import __version__ as version
 from accelerate.commands.config import default_config_file, load_config_from_file
 
-from ..utils import is_mlu_available, is_musa_available, is_npu_available, is_xpu_available
+from ..utils import is_mlu_available, is_sdaa_available, is_musa_available, is_npu_available, is_xpu_available
 
 
 def env_command_parser(subparsers=None):
@@ -49,6 +49,7 @@ def env_command(args):
     pt_cuda_available = torch.cuda.is_available()
     pt_xpu_available = is_xpu_available()
     pt_mlu_available = is_mlu_available()
+    pt_sdaa_available = is_sdaa_available()
     pt_musa_available = is_musa_available()
     pt_npu_available = is_npu_available()
 
@@ -76,13 +77,16 @@ def env_command(args):
         "PyTorch XPU available": str(pt_xpu_available),
         "PyTorch NPU available": str(pt_npu_available),
         "PyTorch MLU available": str(pt_mlu_available),
+        "PyTorch SDAA available": str(pt_sdaa_available),
         "PyTorch MUSA available": str(pt_musa_available),
         "System RAM": f"{psutil.virtual_memory().total / 1024 ** 3:.2f} GB",
     }
     if pt_cuda_available:
         info["GPU type"] = torch.cuda.get_device_name()
     if pt_mlu_available:
         info["MLU type"] = torch.mlu.get_device_name()
+    if pt_sdaa_available:
+        info["SDAA type"] = torch.sdaa.get_device_name()
     if pt_npu_available:
         info["CANN version"] = torch.version.cann
 

diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py
@@ -40,6 +40,7 @@
     is_bf16_available,
     is_deepspeed_available,
     is_mlu_available,
+    is_sdaa_available,
     is_musa_available,
     is_npu_available,
     is_rich_available,
@@ -994,6 +995,7 @@ def _validate_launch_command(args):
                     DistributedType.MULTI_GPU,
                     DistributedType.MULTI_NPU,
                     DistributedType.MULTI_MLU,
+                    DistributedType.MULTI_SDAA,
                     DistributedType.MULTI_MUSA,
                     DistributedType.MULTI_XPU,
                 )
@@ -1076,6 +1078,8 @@ def _validate_launch_command(args):
                 args.num_processes = torch.xpu.device_count()
             elif is_mlu_available():
                 args.num_processes = torch.mlu.device_count()
+            elif is_sdaa_available():
+                args.num_processes = torch.sdaa.device_count()
             elif is_musa_available():
                 args.num_processes = torch.musa.device_count()
             elif is_npu_available():
@@ -1091,6 +1095,7 @@ def _validate_launch_command(args):
             and (
                 (args.use_xpu and is_xpu_available() and torch.xpu.device_count() > 1)
                 or (is_mlu_available() and torch.mlu.device_count() > 1)
+                or (is_sdaa_available() and torch.sdaa.device_count() > 1)
                 or (is_musa_available() and torch.musa.device_count() > 1)
                 or (is_npu_available() and torch.npu.device_count() > 1)
                 or (torch.cuda.device_count() > 1)

diff --git a/src/accelerate/hooks.py b/src/accelerate/hooks.py
@@ -31,7 +31,7 @@
 from .utils.other import recursive_getattr
 
 
-_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "musa"]
+_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "sdaa", "musa"]
 
 
 class ModelHook:

diff --git a/src/accelerate/local_sgd.py b/src/accelerate/local_sgd.py
@@ -71,6 +71,7 @@ def __init__(self, accelerator: Accelerator, model: torch.nn.Module, local_sgd_s
             DistributedType.MULTI_GPU,
             DistributedType.MULTI_XPU,
             DistributedType.MULTI_MLU,
+            DistributedType.MULTI_SDAA,
             DistributedType.MULTI_MUSA,
             DistributedType.MULTI_NPU,
         ]:

diff --git a/src/accelerate/state.py b/src/accelerate/state.py
@@ -40,6 +40,7 @@
     is_fp8_available,
     is_ipex_available,
     is_mlu_available,
+    is_sdaa_available,
     is_mps_available,
     is_musa_available,
     is_npu_available,
@@ -58,6 +59,9 @@
 if is_mlu_available(check_device=False):
     import torch_mlu  # noqa: F401
 
+if is_sdaa_available(check_device=False):
+    import torch_sdaa  # noqa: F401
+
 if is_musa_available(check_device=False):
     import torch_musa  # noqa: F401
 
@@ -201,6 +205,9 @@ def __init__(self, cpu: bool = False, **kwargs):
                         from deepspeed import comm as dist
 
                         if not dist.is_initialized():
+                            if self.backend == 'tccl':
+                                local_rank = os.environ.get("LOCAL_RANK", -1)
+                                torch.sdaa.set_device(f'sdaa:{local_rank}')
                             dist.init_distributed(dist_backend=self.backend, auto_mpi_discovery=False, **kwargs)
                         # We need to flag to `use_deepspeed` to be True to override `distributed_type` later
                         use_deepspeed = True
@@ -209,6 +216,9 @@ def __init__(self, cpu: bool = False, **kwargs):
                         self.distributed_type not in (DistributedType.MULTI_XPU, DistributedType.MULTI_CPU)
                         and not torch.distributed.is_initialized()
                     ):
+                        if self.backend == 'tccl':
+                            local_rank = os.environ.get("LOCAL_RANK", -1)
+                            torch.sdaa.set_device(f'sdaa:{local_rank}')
                         torch.distributed.init_process_group(backend=self.backend, **kwargs)
             # XPU and CPU require special env configs to be set
             if self.distributed_type in (DistributedType.MULTI_XPU, DistributedType.MULTI_CPU):
@@ -365,6 +375,7 @@ def wait_for_everyone(self):
         if self.distributed_type in (
             DistributedType.MULTI_GPU,
             DistributedType.MULTI_MLU,
+            DistributedType.MULTI_SDAA,
             DistributedType.MULTI_MUSA,
             DistributedType.MULTI_NPU,
             DistributedType.MULTI_XPU,
@@ -685,6 +696,7 @@ def default_device(self) -> torch.device:
         - MPS if `torch.backends.mps.is_available()` and `torch.backends.mps.is_built()` both return True.
         - CUDA if `torch.cuda.is_available()`
         - MLU if `is_mlu_available()`
+        - SDAA if `is_sdaa_available()`
         - MUSA if `is_musa_available()`
         - NPU if `is_npu_available()`
         - CPU otherwise
@@ -694,6 +706,8 @@ def default_device(self) -> torch.device:
             return torch.device("mps")
         elif is_mlu_available():
             return torch.device("mlu")
+        elif is_sdaa_available():
+            return torch.device("sdaa")
         elif is_musa_available():
             return torch.device("musa")
         # NPU should be checked before CUDA when using `transfer_to_npu`
@@ -724,6 +738,9 @@ def _prepare_backend(
             if is_mlu_available():
                 backend = "cncl"
                 distributed_type = DistributedType.MULTI_MLU
+            elif is_sdaa_available():
+                backend = "tccl"
+                distributed_type = DistributedType.MULTI_SDAA
             elif is_musa_available():
                 backend = "mccl"
                 distributed_type = DistributedType.MULTI_MUSA
@@ -776,7 +793,7 @@ def set_device(self):
             self.device = torch.device("cpu") if self._cpu else self.default_device
             return
         device = str(self.distributed_type).split(".")[-1].replace("MULTI_", "").lower()
-        if device not in ("cpu", "gpu", "mlu", "musa", "npu", "xpu", "xla"):
+        if device not in ("cpu", "gpu", "mlu", "musa", "npu", "xpu", "xla", "sdaa"):
             raise ValueError(
                 f"Can't set device for {self.distributed_type} ({device}), verify we should be calling `_set_device()` for it!"
             )
@@ -906,6 +923,7 @@ def __init__(
             elif self.distributed_type in [
                 DistributedType.MULTI_GPU,
                 DistributedType.MULTI_MLU,
+                DistributedType.MULTI_SDAA,
                 DistributedType.MULTI_MUSA,
                 DistributedType.MULTI_NPU,
                 DistributedType.MULTI_XPU,

diff --git a/src/accelerate/test_utils/__init__.py b/src/accelerate/test_utils/__init__.py
@@ -26,6 +26,7 @@
     require_cuda,
     require_huggingface_suite,
     require_mlu,
+    require_sdaa,
     require_mps,
     require_multi_device,
     require_multi_gpu,